Compare commits
156 Commits
master...refactorin
| Author | SHA1 | Date |
|---|---|---|
| | 363d04ce5d | |
| | 510b39b537 | |
| | 223d3e6ed0 | |
| | 9efa37ae87 | |
| | f9019d6625 | |
| | 66c65ce900 | |
| | 0e7791394f | |
| | 0f6e87b8a6 | |
| | f12ef4b8ed | |
| | 7360226e98 | |
| | 43688d0f0b | |
| | effc69c42f | |
| | 0be5849df1 | |
| | 6a7cff5bf5 | |
| | fc0f19c5f1 | |
| | fdbc49ccba | |
| | 61371153f6 | |
| | 86bd96db67 | |
| | 46146cc886 | |
| | be0c643f75 | |
| | 4ec7cb8d7b | |
| | dcdfe03f43 | |
| | 77c86078eb | |
| | e952d19c68 | |
| | 2bcac91dea | |
| | 7facedb38a | |
| | 3113d5cb5d | |
| | ba901473fe | |
| | e8b4467265 | |
| | 4c65d906b8 | |
| | 667b4a4858 | |
| | 83e6dc3ce7 | |
| | fb69eb7f5c | |
| | f98256d7e9 | |
| | cbb3a8cc61 | |
| | 9f9face8f0 | |
| | f2af040c5b | |
| | 6dbe3b6fc9 | |
| | a3fece8096 | |
| | 26180373a0 | |
| | 186b4530f0 | |
| | a1ccda4ea9 | |
| | 25d35e2349 | |
| | daea7d2bf7 | |
| | d5e501a05d | |
| | d9d363834a | |
| | 5dc13e7137 | |
| | 826cd3b6a9 | |
| | 4f788af35b | |
| | 10ea584143 | |
| | 7676a8148e | |
| | cee5e69a4b | |
| | e715c86f8d | |
| | c5ba489931 | |
| | 3772ca021a | |
| | c4eeb956ca | |
| | d823ebf7c6 | |
| | 71ffb28381 | |
| | 9dfbe9a142 | |
| | 0eb57056ba | |
| | 70802d6341 | |
| | 52776494cb | |
| | 7d8842b4ac | |
| | 9e77e25afb | |
| | b3480491be | |
| | 3d0c2396ee | |
| | f8c2d691b2 | |
| | ced1cd9559 | |
| | 738c51a337 | |
| | 48f6aebc13 | |
| | 73d546367c | |
| | cfe4b58e38 | |
| | 839a264816 | |
| | fd57fe99b7 | |
| | 5e51fd1d10 | |
| | 9c7c5e315f | |
| | 3da613af94 | |
| | 30e6350881 | |
| | 384f0e5f28 | |
| | 4d181448b6 | |
| | a5cd3d6ec9 | |
| | 893622a73e | |
| | 4d11a157e5 | |
| | 4c10d521e2 | |
| | 0f6cbec1d5 | |
| | 54484d9ad0 | |
| | ca190721d6 | |
| | 5611314ff3 | |
| | 4ecfe16df5 | |
| | 38c0614396 | |
| | 64565f9cb0 | |
| | 232c6bed4b | |
| | 8d34873d1c | |
| | 78a951a319 | |
| | 8d57d2043d | |
| | 41fdda4955 | |
| | 4dfdd579a2 | |
| | e831ab1382 | |
| | 6fead2d9b9 | |
| | 1012988475 | |
| | 5bc1550eae | |
| | 29741fc5da | |
| | 4772e3037c | |
| | dd6ab94aa2 | |
| | eaca8725de | |
| | 4af202f098 | |
| | 1199845cdf | |
| | 4578413748 | |
| | d5d67cb064 | |
| | d8542762e6 | |
| | caef416077 | |
| | a8708ffc56 | |
| | 3f0bbf0fc7 | |
| | 2fec39eda6 | |
| | 16cc0007ed | |
| | 3d83489819 | |
| | 3134021596 | |
| | 3cb857d830 | |
| | 194102939e | |
| | 5d1d9516b5 | |
| | 77f85e9de1 | |
| | c00081b2bc | |
| | 619f67f1fd | |
| | a97f8def7c | |
| | 65e9735bd9 | |
| | 689be75478 | |
| | acf46a7a48 | |
| | 0f11441b20 | |
| | fa1fa15cc8 | |
| | 17c40c996a | |
| | 99af2943b5 | |
| | 0e6cb495e8 | |
| | 012e705e70 | |
| | 8327794685 | |
| | 72bc52dc7b | |
| | 557d091a54 | |
| | b540cfd0f2 | |
| | 8824c5c3ea | |
| | 94e9210faf | |
| | 06d6863cc5 | |
| | dfd87cb4b0 | |
| | cd5457840b | |
| | eee2f0e256 | |
| | 9d2f166fbf | |
| | 97fb4b645d | |
| | 00e53fb54d | |
| | 4be91de036 | |
| | 8c6b940364 | |
| | cdb12baccd | |
| | ac84494613 | |
| | 77f565c652 | |
| | 47e657aaa3 | |
| | b592497b75 | |
| | c0d961bc39 | |
| | 8260ae58f9 | |
| | 068f75d35b | |
@@ -10,7 +10,7 @@ omit =
    */build_venv/*
    */incl/*
source =
    cv_analysis
    cv_analysis
relative_files = True
data_file = .coverage

@@ -46,4 +46,4 @@ ignore_errors = True
directory = reports

[xml]
output = reports/coverage.xml
output = reports/coverage.xml
@@ -97,4 +97,4 @@ target/
*.swp
*/*.swp
*/*/*.swp
*/*/*/*.swp
*/*/*/*.swp
@@ -1,10 +1,7 @@
[core]
remote = azure_remote
remote = vector
autostage = true
['remote "vector"']
url = ssh://vector.iqser.com/research/nonml_cv_doc_parsing/
port = 22
['remote "azure_remote"']
url = azure://cv-sa-dvc/
connection_string = "DefaultEndpointsProtocol=https;AccountName=cvsacricket;AccountKey=KOuTAQ6Mp00ePTT5ObYmgaHlxwS1qukY4QU4Kuk7gy/vldneA+ZiKjaOpEFtqKA6Mtym2gQz8THy+ASts/Y1Bw==;EndpointSuffix=core.windows.net"
['remote "local"']
url = ../dvc_local_remote
77 .gitignore vendored
@@ -1,52 +1,27 @@
# Environments
.env
.venv
env/
venv/
.pytest*
.python-version
.DS_Store

# Project folders
scratch/
*.vscode/
.idea
*_app
*pytest_cache
*joblib
*tmp
*profiling
*logs
*docker
*drivers
*bamboo-specs/target

# Python specific files
__pycache__/
*.py[cod]
*.ipynb
*.ipynb_checkpoints

# file extensions
*.log
*.csv
*.json
*.pkl
*.profile
*.cbm

# temp files
*.swp
*~
*.un~

# keep files
!notebooks/*.ipynb

# keep folders
!secrets
!data/*
!drivers

# unignore files
!bom.*
*.egg-info/
deskew_model/
build_venv/
/pdfs/
/results/
/pdfs/
/env/
/.idea/
/.idea/.gitignore
/.idea/misc.xml
/.idea/inspectionProfiles/profiles_settings.xml
/.idea/table_parsing.iml
/.idea/vcs.xml
/results/
/table_parsing.egg-info
/target/
/tests/
/cv_analysis.egg-info/dependency_links.txt
/cv_analysis.egg-info/PKG-INFO
/cv_analysis.egg-info/SOURCES.txt
/cv_analysis.egg-info/top_level.txt
/.vscode/
/cv_analysis/test/test_data/example_pages.json
/data/metadata_testing_files.csv
.coverage
/data/
@@ -1,30 +0,0 @@
include:
  - project: "Gitlab/gitlab"
    ref: 0.3.0
    file: "/ci-templates/research/dvc-versioning-build-release.gitlab-ci.yml"

variables:
  NEXUS_PROJECT_DIR: red
  IMAGENAME: "${CI_PROJECT_NAME}"

#################################
# temp. disable integration tests, b/c they don't cover the CV analysis case yet
trigger integration tests:
  rules:
    - when: never

release build:
  stage: release
  needs:
    - job: set custom version
      artifacts: true
      optional: true
    - job: calculate patch version
      artifacts: true
      optional: true
    - job: calculate minor version
      artifacts: true
      optional: true
    - job: build docker nexus
      artifacts: true
#################################
@@ -1,35 +0,0 @@
# CI for services, check gitlab repo for python package CI
include:
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/versioning-build-test-release.gitlab-ci.yml"
  - project: "Gitlab/gitlab"
    ref: main
    file: "/ci-templates/research/docs.gitlab-ci.yml"

# set project variables here
variables:
  NEXUS_PROJECT_DIR: red # subfolder in Nexus docker-gin where your container will be stored
  IMAGENAME: $CI_PROJECT_NAME # if the project URL is gitlab.example.com/group-name/project-1, CI_PROJECT_NAME is project-1

pages:
  only:
    - master # KEEP THIS, necessary because `master` branch and not `main` branch

###################
# INTEGRATION TESTS
trigger-integration-tests:
  extends: .integration-tests
  # ADD THE MODEL BUILD WHICH SHOULD TRIGGER THE INTEGRATION TESTS
  # needs:
  #   - job: docker-build::model_name
  #     artifacts: true
  rules:
    - when: never

#########
# RELEASE
release:
  extends: .release
  needs:
    - !reference [.needs-versioning, needs] # leave this line as is
@@ -1,61 +0,0 @@
import subprocess
import sys
from pathlib import Path

import semver
from loguru import logger
from semver.version import Version

logger.remove()
logger.add(sys.stdout, level="INFO")


def bashcmd(cmds: list) -> str:
    try:
        logger.debug(f"running: {' '.join(cmds)}")
        return subprocess.run(cmds, check=True, capture_output=True, text=True).stdout.strip("\n")
    except:
        logger.warning(f"Error executing the following bash command: {' '.join(cmds)}.")
        raise


def get_highest_existing_git_version_tag() -> str:
    """Get highest versions from git tags depending on bump level"""
    try:
        git_tags = bashcmd(["git", "tag", "-l"]).split()
        semver_compat_tags = list(filter(Version.is_valid, git_tags))
        highest_git_version_tag = max(semver_compat_tags, key=semver.version.Version.parse)
        logger.info(f"Highest git version tag: {highest_git_version_tag}")
        return highest_git_version_tag
    except:
        logger.warning("Error getting git version tags")
        raise


def auto_bump_version() -> bool:
    active = Path(".autoversion").is_file()
    logger.debug(f"Automated version bump is set to '{active}'")
    return active


def main() -> None:
    poetry_project_version = bashcmd(["poetry", "version", "-s"])

    logger.info(f"Poetry project version: {poetry_project_version}")

    highest_git_version_tag = get_highest_existing_git_version_tag()

    comparison_result = semver.compare(poetry_project_version, highest_git_version_tag)

    if comparison_result in (-1, 0):
        logger.warning("Poetry version must be greater than git tag version.")
        if auto_bump_version():
            logger.info(bashcmd(["poetry", "version", highest_git_version_tag]))
            sys.exit(0)
        sys.exit(1)
    else:
        logger.info(f"All good: {poetry_project_version} > {highest_git_version_tag}")


if __name__ == "__main__":
    main()
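For reference, the removed hook above gates on `semver.compare`, which returns -1, 0, or 1. A minimal standalone sketch of that gate (illustration only, assuming just the `semver` package):

```python
import semver

# semver.compare(a, b) -> -1 if a < b, 0 if a == b, 1 if a > b
assert semver.compare("1.2.3", "1.2.4") == -1
assert semver.compare("1.2.3", "1.2.3") == 0
assert semver.compare("1.3.0", "1.2.9") == 1


def version_ok(poetry_version: str, highest_git_tag: str) -> bool:
    """The hook passes only when the project version is strictly greater than the highest tag."""
    return semver.compare(poetry_version, highest_git_tag) == 1
```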
@@ -1,72 +0,0 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
exclude: ^(docs/|notebooks/|data/|src/configs/|tests/|.hooks/|bom.json)
default_language_version:
  python: python3.10
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
        args: [--unsafe] # needed for .gitlab-ci.yml
      - id: check-toml
      - id: detect-private-key
      - id: check-added-large-files
        args: ['--maxkb=10000']
      - id: check-case-conflict
      - id: mixed-line-ending

  # - repo: https://github.com/pre-commit/mirrors-pylint
  #   rev: v3.0.0a5
  #   hooks:
  #     - id: pylint
  #       args:
  #         - --disable=C0111,R0903,E0401
  #         - --max-line-length=120

  - repo: https://github.com/pre-commit/mirrors-isort
    rev: v5.10.1
    hooks:
      - id: isort
        args:
          - --profile black

  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        # exclude: ^(docs/|notebooks/|data/|src/secrets/)
        args:
          - --line-length=120

  - repo: https://github.com/compilerla/conventional-pre-commit
    rev: v4.0.0
    hooks:
      - id: conventional-pre-commit
        pass_filenames: false
        stages: [commit-msg]
        # args: [] # optional: list of Conventional Commits types to allow e.g. [feat, fix, ci, chore, test]

  - repo: local
    hooks:
      - id: version-checker
        name: version-checker
        entry: python .hooks/poetry_version_check.py
        language: python
        always_run: true
        additional_dependencies:
          - "semver"
          - "loguru"

  # - repo: local
  #   hooks:
  #     - id: docker-build-test
  #       name: testing docker build
  #       entry: ./scripts/ops/docker-compose-build-run.sh
  #       language: script
  #       # always_run: true
  #       pass_filenames: false
  #       args: []
  #       stages: [pre-commit]
84 Dockerfile
@@ -1,78 +1,30 @@
###############
# BUILDER IMAGE
FROM python:3.10-slim as builder
FROM python:3.10

ARG GITLAB_USER
ARG GITLAB_ACCESS_TOKEN
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"

ARG PYPI_REGISTRY_RESEARCH=https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
ARG POETRY_SOURCE_REF_RESEARCH=gitlab-research
RUN python -m pip install --upgrade pip

ARG PYPI_REGISTRY_RED=https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
ARG POETRY_SOURCE_REF_RED=gitlab-red
WORKDIR /app/service

ARG PYPI_REGISTRY_FFORESIGHT=https://gitlab.knecon.com/api/v4/groups/269/-/packages/pypi
ARG POETRY_SOURCE_REF_FFORESIGHT=gitlab-fforesight
COPY ./requirements.txt ./requirements.txt
RUN python3 -m pip install -r requirements.txt

ARG VERSION=dev
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
RUN python -m pip install -r incl/pyinfra/requirements.txt

LABEL maintainer="Research <research@knecon.com>"
LABEL version="${VERSION}"
COPY ./incl/pdf2image/requirements.txt ./incl/pdf2image/requirements.txt
RUN python -m pip install -r incl/pdf2image/requirements.txt

WORKDIR /app
COPY ./incl ./incl

###########
# ENV SETUP
ENV PYTHONDONTWRITEBYTECODE=true
ENV PYTHONUNBUFFERED=true
ENV POETRY_HOME=/opt/poetry
ENV PATH="$POETRY_HOME/bin:$PATH"
RUN python3 -m pip install -e incl/pyinfra
RUN python3 -m pip install -e incl/pdf2image

RUN apt-get update && \
    apt-get install -y curl git bash build-essential libffi-dev libssl-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN curl -sSL https://install.python-poetry.org | python3 -
RUN poetry --version

COPY pyproject.toml poetry.lock ./

RUN poetry config virtualenvs.create true && \
    poetry config virtualenvs.in-project true && \
    poetry config installer.max-workers 10 && \
    poetry config repositories.${POETRY_SOURCE_REF_RESEARCH} ${PYPI_REGISTRY_RESEARCH} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RESEARCH} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_RED} ${PYPI_REGISTRY_RED} && \
    poetry config http-basic.${POETRY_SOURCE_REF_RED} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
    poetry config repositories.${POETRY_SOURCE_REF_FFORESIGHT} ${PYPI_REGISTRY_FFORESIGHT} && \
    poetry config http-basic.${POETRY_SOURCE_REF_FFORESIGHT} ${GITLAB_USER} ${GITLAB_ACCESS_TOKEN} && \
    poetry install --without=dev,docs,test -vv --no-interaction --no-root

##################
# COPY SOURCE CODE
COPY ./config ./config
COPY ./src ./src
COPY ./cv_analysis ./cv_analysis
COPY ./setup.py ./setup.py

###############
# WORKING IMAGE
FROM python:3.10-slim
RUN python3 -m pip install -e .

# COPY BILL OF MATERIALS (BOM)
COPY bom.json /bom.json

# COPY SOURCE CODE FROM BUILDER IMAGE
COPY --from=builder /app /app

WORKDIR /app

ENV PATH="/app/.venv/bin:$PATH"

############
# NETWORKING
EXPOSE 5000
EXPOSE 8080

################
# LAUNCH COMMAND
CMD [ "python", "src/serve.py"]
CMD ["python3", "-u", "src/serve.py"]
94 Makefile
@@ -1,94 +0,0 @@
.PHONY: \
    poetry in-project-venv dev-env use-env install install-dev tests \
    update-version sync-version-with-git \
    docker docker-build-run docker-build docker-run \
    docker-rm docker-rm-container docker-rm-image \
    pre-commit get-licenses prep-commit \
    docs sphinx_html sphinx_apidoc bom
.DEFAULT_GOAL := run

export DOCKER=docker
export DOCKERFILE=Dockerfile
export IMAGE_NAME=cv_analysis_service-image
export CONTAINER_NAME=cv_analysis_service-container
export HOST_PORT=9999
export CONTAINER_PORT=9999
export PYTHON_VERSION=python3.10

# all commands should be executed in the root dir of the project,
# specific environments should be deactivated

poetry: in-project-venv use-env dev-env

in-project-venv:
    poetry config virtualenvs.in-project true

use-env:
    poetry env use ${PYTHON_VERSION}

dev-env:
    poetry install --with dev && poetry update

install:
    poetry add $(pkg)

install-dev:
    poetry add --dev $(pkg)

requirements:
    poetry export --without-hashes --output requirements.txt

update-version:
    poetry version prerelease

sync-version-with-git:
    git pull -p && poetry version $(git rev-list --tags --max-count=1 | git describe --tags --abbrev=0)

bom:
    cyclonedx-py poetry -o bom.json

docker: docker-rm docker-build-run

docker-build-run: docker-build docker-run

docker-build:
    $(DOCKER) build \
        --no-cache --progress=plain \
        -t $(IMAGE_NAME) -f $(DOCKERFILE) \
        --build-arg USERNAME=${USERNAME} \
        --build-arg TOKEN=${GITLAB_TOKEN} \
        .

docker-run:
    $(DOCKER) run -it --rm -p $(HOST_PORT):$(CONTAINER_PORT)/tcp --name $(CONTAINER_NAME) $(IMAGE_NAME)

docker-rm: docker-rm-container docker-rm-image

docker-rm-container:
    -$(DOCKER) rm $(CONTAINER_NAME)

docker-rm-image:
    -$(DOCKER) image rm $(IMAGE_NAME)

tests:
    poetry run pytest ./tests

prep-commit:
    docs get-license sync-version-with-git update-version pre-commit

pre-commit:
    pre-commit run --all-files

get-licenses:
    pip-licenses --format=json --order=license --with-urls > pkg-licenses.json

docs: sphinx_apidoc sphinx_html

sphinx_html:
    poetry run sphinx-build -b html docs/source/ docs/build/html -E -a

sphinx_apidoc:
    cp ./README.md ./docs/source/README.md && cp -r ./data ./docs/source/data/ && poetry run sphinx-apidoc ./src -o ./docs/source/modules --no-toc --module-first --follow-links --separate --force

bom:
    cyclonedx-py poetry -o bom.json
57 README.md
@@ -1,60 +1,8 @@
# cv-analysis - Visual (CV-Based) Document Parsing
# cv-analysis — Visual (CV-Based) Document Parsing

parse_pdf()
This repository implements computer vision based approaches for detecting and parsing visual features, such as tables or
previous redactions, in documents.

## API

Input message:

```json
{
  "targetFilePath": {
    "pdf": "absolute file path",
    "vlp_output": "absolute file path"
  },
  "responseFilePath": "absolute file path",
  "operation": "table_image_inference"
}
```

The response is uploaded to the storage location specified in the `responseFilePath` field. Its structure is as follows:

```json
{
  ...,
  "data": [
    {
      "pageNum": 0,
      "bbox": {
        "x1": 55.3407,
        "y1": 247.0246,
        "x2": 558.5602,
        "y2": 598.0585
      },
      "uuid": "2b10c1a2-393c-4fca-b9e3-0ad5b774ac84",
      "label": "table",
      "tableLines": [
        {
          "x1": 0,
          "y1": 16,
          "x2": 1399,
          "y2": 16
        },
        ...
      ],
      "imageInfo": {
        "height": 693,
        "width": 1414
      }
    },
    ...
  ]
}
```

## Installation

```bash
```

@@ -83,9 +31,10 @@ The below snippet shows how to find the outlines of previous redactions.

```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import pdf2image
import numpy as np


pdf_path = ...
page_index = ...
```
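As an illustration (not part of the diff), a client could assemble the input message documented above like this; the file paths are hypothetical placeholders, and delivery to the service's request queue depends on the deployment:

```python
import json

# Hypothetical absolute paths; the service expects paths reachable in the
# shared storage backend.
message = {
    "targetFilePath": {
        "pdf": "/storage/tenant-a/input/document.pdf",
        "vlp_output": "/storage/tenant-a/input/document_vlp.json",
    },
    "responseFilePath": "/storage/tenant-a/output/document_tables.json",
    "operation": "table_image_inference",
}

payload = json.dumps(message)  # body to publish to the request queue
```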
40 bamboo-specs/pom.xml Normal file
@@ -0,0 +1,40 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <parent>
        <groupId>com.atlassian.bamboo</groupId>
        <artifactId>bamboo-specs-parent</artifactId>
        <version>7.1.2</version>
        <relativePath/>
    </parent>

    <artifactId>bamboo-specs</artifactId>
    <version>1.0.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <properties>
        <sonar.skip>true</sonar.skip>
    </properties>

    <dependencies>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs-api</artifactId>
        </dependency>
        <dependency>
            <groupId>com.atlassian.bamboo</groupId>
            <artifactId>bamboo-specs</artifactId>
        </dependency>

        <!-- Test dependencies -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <!-- run 'mvn test' to perform offline validation of the plan -->
    <!-- run 'mvn -Ppublish-specs' to upload the plan to your Bamboo server -->
</project>
178 bamboo-specs/src/main/java/buildjob/PlanSpec.java Normal file
@@ -0,0 +1,178 @@
package buildjob;

import static com.atlassian.bamboo.specs.builders.task.TestParserTask.createJUnitParserTask;

import java.time.LocalTime;

import com.atlassian.bamboo.specs.api.BambooSpec;
import com.atlassian.bamboo.specs.api.builders.BambooKey;
import com.atlassian.bamboo.specs.api.builders.docker.DockerConfiguration;
import com.atlassian.bamboo.specs.api.builders.permission.PermissionType;
import com.atlassian.bamboo.specs.api.builders.permission.Permissions;
import com.atlassian.bamboo.specs.api.builders.permission.PlanPermissions;
import com.atlassian.bamboo.specs.api.builders.plan.Job;
import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.builders.plan.PlanIdentifier;
import com.atlassian.bamboo.specs.api.builders.plan.Stage;
import com.atlassian.bamboo.specs.api.builders.plan.branches.BranchCleanup;
import com.atlassian.bamboo.specs.api.builders.plan.branches.PlanBranchManagement;
import com.atlassian.bamboo.specs.api.builders.project.Project;
import com.atlassian.bamboo.specs.builders.task.CheckoutItem;
import com.atlassian.bamboo.specs.builders.task.InjectVariablesTask;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.builders.task.VcsCheckoutTask;
import com.atlassian.bamboo.specs.builders.task.CleanWorkingDirectoryTask;
import com.atlassian.bamboo.specs.builders.task.VcsTagTask;
import com.atlassian.bamboo.specs.builders.trigger.BitbucketServerTrigger;
import com.atlassian.bamboo.specs.builders.trigger.ScheduledTrigger;
import com.atlassian.bamboo.specs.model.task.InjectVariablesScope;
import com.atlassian.bamboo.specs.api.builders.Variable;
import com.atlassian.bamboo.specs.util.BambooServer;
import com.atlassian.bamboo.specs.builders.task.ScriptTask;
import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;

/**
 * Plan configuration for Bamboo.
 * Learn more on: <a href="https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs">https://confluence.atlassian.com/display/BAMBOO/Bamboo+Specs</a>
 */
@BambooSpec
public class PlanSpec {

    private static final String SERVICE_NAME = "cv-analysis";

    private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-", "").replaceAll("_", "");

    /**
     * Run main to publish plan on Bamboo
     */
    public static void main(final String[] args) throws Exception {
        // By default credentials are read from the '.credentials' file.
        BambooServer bambooServer = new BambooServer("http://localhost:8085");

        Plan plan = new PlanSpec().createDockerBuildPlan();
        bambooServer.publish(plan);
        PlanPermissions planPermission = new PlanSpec().createPlanPermission(plan.getIdentifier());
        bambooServer.publish(planPermission);

        Plan secPlan = new PlanSpec().createSecBuild();
        bambooServer.publish(secPlan);
        PlanPermissions secPlanPermission = new PlanSpec().createPlanPermission(secPlan.getIdentifier());
        bambooServer.publish(secPlanPermission);
    }

    private PlanPermissions createPlanPermission(PlanIdentifier planIdentifier) {
        Permissions permission = new Permissions()
            .userPermissions("atlbamboo", PermissionType.EDIT, PermissionType.VIEW, PermissionType.ADMIN, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("research", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("Development", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .groupPermissions("QA", PermissionType.EDIT, PermissionType.VIEW, PermissionType.CLONE, PermissionType.BUILD)
            .loggedInUserPermissions(PermissionType.VIEW)
            .anonymousUserPermissionView();
        return new PlanPermissions(planIdentifier.getProjectKey(), planIdentifier.getPlanKey()).permissions(permission);
    }

    private Project project() {
        return new Project()
            .name("RED")
            .key(new BambooKey("RED"));
    }

    public Plan createDockerBuildPlan() {
        return new Plan(
            project(),
            SERVICE_NAME, new BambooKey(SERVICE_KEY))
            // .description("Docker build for cv-analysis.")
            // .variables()
            .stages(new Stage("Build Stage")
                .jobs(
                    new Job("Build Job", new BambooKey("BUILD"))
                        .tasks(
                            new CleanWorkingDirectoryTask()
                                .description("Clean working directory.")
                                .enabled(true),
                            new VcsCheckoutTask()
                                .description("Checkout default repository.")
                                .checkoutItems(new CheckoutItem().defaultRepository()),
                            new ScriptTask()
                                .description("Set config and keys.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                            new ScriptTask()
                                .description("Build Docker container.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
                                .argument(SERVICE_NAME),
                            new InjectVariablesTask()
                                .description("Inject git tag.")
                                .path("git.tag")
                                .namespace("g")
                                .scope(InjectVariablesScope.LOCAL),
                            new VcsTagTask()
                                .description("${bamboo.g.gitTag}")
                                .tagName("${bamboo.g.gitTag}")
                                .defaultRepository())
                        .dockerConfiguration(
                            new DockerConfiguration()
                                .image("nexus.iqser.com:5001/infra/release_build:4.5.0")
                                .volume("/var/run/docker.sock", "/var/run/docker.sock")),
                    new Job("Licence Job", new BambooKey("LICENCE"))
                        .enabled(false)
                        .tasks(
                            new VcsCheckoutTask()
                                .description("Checkout default repository.")
                                .checkoutItems(new CheckoutItem().defaultRepository()),
                            new ScriptTask()
                                .description("Build licence.")
                                .location(Location.FILE)
                                .fileFromPath("bamboo-specs/src/main/resources/scripts/create-licence.sh"))
                        .dockerConfiguration(
                            new DockerConfiguration()
                                .image("nexus.iqser.com:5001/infra/maven:3.6.2-jdk-13-3.0.0")
                                .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
                                .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .triggers(
                new BitbucketServerTrigger())
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranch()
                    .delete(
                        new BranchCleanup()
                            .whenInactiveInRepositoryAfterDays(14))
                    .notificationForCommitters());
    }

    public Plan createSecBuild() {
        return new Plan(project(), SERVICE_NAME + "-Sec", new BambooKey(SERVICE_KEY + "SEC")).description("Security Analysis Plan")
            .stages(new Stage("Default Stage").jobs(
                new Job("Sonar Job", new BambooKey("SONAR"))
                    .tasks(
                        new CleanWorkingDirectoryTask()
                            .description("Clean working directory.")
                            .enabled(true),
                        new VcsCheckoutTask()
                            .description("Checkout default repository.")
                            .checkoutItems(new CheckoutItem().defaultRepository()),
                        new ScriptTask()
                            .description("Set config and keys.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/key-prepare.sh"),
                        new ScriptTask()
                            .description("Run Sonarqube scan.")
                            .location(Location.FILE)
                            .fileFromPath("bamboo-specs/src/main/resources/scripts/sonar-scan.sh")
                            .argument(SERVICE_NAME))
                    .dockerConfiguration(
                        new DockerConfiguration()
                            .image("nexus.iqser.com:5001/infra/release_build:4.2.0")
                            .volume("/var/run/docker.sock", "/var/run/docker.sock"))))
            .linkedRepositories("RR / " + SERVICE_NAME)
            .triggers(
                new ScheduledTrigger()
                    .scheduleOnceDaily(LocalTime.of(23, 00)))
            .planBranchManagement(
                new PlanBranchManagement()
                    .createForVcsBranchMatching("release.*")
                    .notificationForCommitters());
    }
}
19 bamboo-specs/src/main/resources/scripts/create-licence.sh Executable file
@@ -0,0 +1,19 @@
#!/bin/bash
set -e

if [[ "${bamboo_version_tag}" != "dev" ]]
then
    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        versions:set \
        -DnewVersion=${bamboo_version_tag}

    ${bamboo_capability_system_builder_mvn3_Maven_3}/bin/mvn \
        -f ${bamboo_build_working_directory}/pom.xml \
        -B clean deploy \
        -e -DdeployAtEnd=true \
        -Dmaven.wagon.http.ssl.insecure=true \
        -Dmaven.wagon.http.ssl.allowall=true \
        -Dmaven.wagon.http.ssl.ignore.validity.dates=true \
        -DaltDeploymentRepository=iqser_release::default::https://nexus.iqser.com/repository/gin4-platform-releases
fi
53 bamboo-specs/src/main/resources/scripts/docker-build.sh Executable file
@@ -0,0 +1,53 @@
#!/bin/bash
set -e

SERVICE_NAME=$1

if [[ "$bamboo_planRepository_branchName" == "master" ]]
then
    branchVersion=$(cat version.yaml | grep -Eo "version: .*" | sed -s 's|version: \(.*\)\..*\..*|\1|g')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i minor)"
    echo "new release on master with version $newVersion"
elif [[ "$bamboo_planRepository_branchName" == release* ]]
then
    branchVersion=$(echo $bamboo_planRepository_branchName | sed -s 's|release\/\([0-9]\+\.[0-9]\+\)\.x|\1|')
    latestVersion=$( semver $(git tag -l "${branchVersion}.*" ) | tail -n1 )
    newVersion="$(semver $latestVersion -p -i patch)"
    echo "new release on $bamboo_planRepository_branchName with version $newVersion"
elif [[ "${bamboo_version_tag}" != "dev" ]]
then
    newVersion="${bamboo_version_tag}"
    echo "new special version build with $newVersion"
else
    newVersion="${bamboo_planRepository_1_branch}_${bamboo_buildNumber}"
    echo "gitTag=${newVersion}" > git.tag
    echo "dev build with tag ${newVersion}"
    python3 -m venv build_venv
    source build_venv/bin/activate
    python3 -m pip install --upgrade pip

    pip install dvc
    pip install 'dvc[ssh]'
    dvc pull

    echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
    echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
    docker build -f Dockerfile .
    exit 0
fi

echo "gitTag=${newVersion}" > git.tag

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

pip install dvc
pip install 'dvc[ssh]'
dvc pull

echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${newVersion}
8 bamboo-specs/src/main/resources/scripts/key-prepare.sh Executable file
@@ -0,0 +1,8 @@
#!/bin/bash
set -e

mkdir -p ~/.ssh
echo "${bamboo_agent_ssh}" | base64 -d >> ~/.ssh/id_rsa
echo "host vector.iqser.com" > ~/.ssh/config
echo " user bamboo-agent" >> ~/.ssh/config
chmod 600 ~/.ssh/config ~/.ssh/id_rsa
67 bamboo-specs/src/main/resources/scripts/sonar-scan.sh Executable file
@@ -0,0 +1,67 @@
#!/bin/bash
set -e

export JAVA_HOME=/usr/bin/sonar-scanner/jre

python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip

echo "dev setup for unit test and coverage"

pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt

pip install -e incl/pdf2image
pip install -r incl/pdf2image/requirements.txt

pip install -e .
pip install -r requirements.txt


echo "DVC pull step"
dvc pull

echo "coverage calculation"
coverage run -m pytest
echo "coverage report generation"
coverage report -m
coverage xml

SERVICE_NAME=$1

echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
    --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
    --exclude "build_venv/**" --exclude "**/__pycache__/**"

if [[ -z "${bamboo_repository_pr_key}" ]]
then
    echo "Sonar Scan for branch: ${bamboo_planRepository_1_branch}"
    /usr/bin/sonar-scanner/bin/sonar-scanner -X \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=src,cv_analysis \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.branch.name=${bamboo_planRepository_1_branch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml

else
    echo "Sonar Scan for PR with key: ${bamboo_repository_pr_key}"
    /usr/bin/sonar-scanner/bin/sonar-scanner \
        -Dsonar.projectKey=RED_$SERVICE_NAME \
        -Dsonar.sources=src,cv_analysis \
        -Dsonar.host.url=https://sonarqube.iqser.com \
        -Dsonar.login=${bamboo_sonarqube_api_token_secret} \
        -Dsonar.pullrequest.key=${bamboo_repository_pr_key} \
        -Dsonar.pullrequest.branch=${bamboo_repository_pr_sourceBranch} \
        -Dsonar.pullrequest.base=${bamboo_repository_pr_targetBranch} \
        -Dsonar.dependencyCheck.jsonReportPath=reports/dependency-check-report.json \
        -Dsonar.dependencyCheck.xmlReportPath=reports/dependency-check-report.xml \
        -Dsonar.dependencyCheck.htmlReportPath=reports/dependency-check-report.html \
        -Dsonar.python.coverage.reportPaths=reports/coverage.xml
fi
22 bamboo-specs/src/test/java/buildjob/PlanSpecTest.java Normal file
@@ -0,0 +1,22 @@
package buildjob;


import com.atlassian.bamboo.specs.api.builders.plan.Plan;
import com.atlassian.bamboo.specs.api.exceptions.PropertiesValidationException;
import com.atlassian.bamboo.specs.api.util.EntityPropertiesBuilders;
import org.junit.Test;

public class PlanSpecTest {
    @Test
    public void checkYourPlanOffline() throws PropertiesValidationException {
        Plan plan = new PlanSpec().createDockerBuildPlan();

        EntityPropertiesBuilders.build(plan);
    }

    @Test
    public void checkYourSecPlanOffline() throws PropertiesValidationException {
        Plan secPlan = new PlanSpec().createSecBuild();
        EntityPropertiesBuilders.build(secPlan);
    }
}
@@ -1,67 +0,0 @@

[asyncio]
max_concurrent_tasks = 10

[dynamic_tenant_queues]
enabled = true

[metrics.prometheus]
enabled = true
prefix = "redactmanager_cv_analysis_service"

[tracing]
enabled = true
# possible values "opentelemetry" | "azure_monitor" (Expects APPLICATIONINSIGHTS_CONNECTION_STRING environment variable.)
type = "azure_monitor"

[tracing.opentelemetry]
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analysis_service"
exporter = "otlp"

[webserver]
host = "0.0.0.0"
port = 8080

[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too big, since queue interactions (like receiving new messages) happen only in these intervals
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"

tenant_event_queue_suffix = "_tenant_event_queue"
tenant_event_dlq_suffix = "_tenant_events_dlq"
tenant_exchange_name = "tenants-exchange"
queue_expiration_time = 300000 # 5 minutes in milliseconds
service_request_queue_prefix = "cv_analysis_request_queue"
service_request_exchange_name = "cv_analysis_request_exchange"
service_response_exchange_name = "cv_analysis_response_exchange"
service_dlq_name = "cv_analysis_dlq"

[storage]
backend = "s3"

[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"

[storage.azure]
container = "redaction"
connection_string = ""

[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

[kubernetes]
pod_name = "test_pod"
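Aside (not part of the diff): the removed file above is plain TOML, so runtime values of this shape can be read with the standard library. A minimal sketch, assuming Python 3.11+ and a hypothetical file path:

```python
import tomllib  # Python 3.11+; on 3.10 the third-party `tomli` package offers the same API

with open("config/settings.toml", "rb") as f:  # hypothetical path
    cfg = tomllib.load(f)

print(cfg["rabbitmq"]["heartbeat"])  # 60, per the removed file above
print(cfg["storage"]["backend"])     # "s3"
```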
@@ -1,19 +0,0 @@
[logging]
level = "INFO"
visual_logging_level = "DISABLED"
visual_logging_output_folder = "/tmp/debug"

[table_parsing]
skip_pages_without_images = true

[paths]
root = "@format {env[ROOT_PATH]}"
dvc_data_dir = "${paths.root}/data"
pdf_for_testing = "${paths.dvc_data_dir}/pdfs_for_testing"
png_for_testing = "${paths.dvc_data_dir}/pngs_for_testing"
png_figures_detected = "${paths.png_for_testing}/figures_detected"
png_tables_detected = "${paths.png_for_testing}/tables_detected_by_tp"
hashed_pdfs_for_testing = "${paths.pdf_for_testing}/hashed"
metadata_test_files = "${paths.dvc_data_dir}/metadata_testing_files.csv"
test_dir = "${paths.dvc_data_dir}/test"
test_data_dir = "${paths.dvc_data_dir}/test/test_data"
31 cv_analysis/config.py Normal file
@@ -0,0 +1,31 @@
import os


def get_config():
    return Config()


class Config:
    def __init__(self):
        self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
        self.table_parsing_skip_pages_without_images = os.environ.get("TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True)

        # visual_logging_level: NOTHING > INFO > DEBUG > ALL
        self.visual_logging_level = "DISABLED"
        self.visual_logging_output_folder = "/tmp/debug"

        # locations
        # FIXME: is everything here necessary?
        root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        self.dvc_data_dir = os.path.join(root, "data")
        self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
        self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
        self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
        self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
        self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
        self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
        self.test_dir = os.path.join(root, "test")
        self.test_data_dir = os.path.join(self.test_dir, "test_data")

    def __getitem__(self, key):
        return self.__getattribute__(key)
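A short usage sketch (illustration, not in the diff): `Config.__getitem__` delegates to attribute lookup, so entries can be read either way:

```python
from cv_analysis.config import get_config

config = get_config()

# Attribute access and dict-style access resolve to the same value
assert config.dvc_data_dir == config["dvc_data_dir"]
print(config["visual_logging_level"])  # "DISABLED" by default
```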
@@ -1,40 +1,38 @@
from functools import partial

import cv2
import numpy as np
from funcy import lmap

from cv_analysis.figure_detection.figures import detect_large_coherent_structures
from cv_analysis.figure_detection.text import remove_primary_text_regions
from cv_analysis.utils.conversion import contour_to_rectangle
from cv_analysis.utils.filters import (
    has_acceptable_format,
    is_large_enough,
    is_not_too_large,
    has_acceptable_format,
    is_small_enough,
)
from cv_analysis.utils.postprocessing import remove_included
from cv_analysis.utils.structures import Rectangle


def detect_figures(image: np.ndarray):
def detect_figures(image: np.array):
    max_area = image.shape[0] * image.shape[1] * 0.99
    min_area = 5000
    max_width_to_height_ratio = 6
    figure_filter = partial(is_likely_figure, min_area, max_area, max_width_to_height_ratio)

    image = remove_primary_text_regions(image)
    cnts = detect_large_coherent_structures(image)
    cnts = filter(figure_filter, cnts)
    contours = detect_large_coherent_structures(image)
    contours = filter(figure_filter, contours)

    # rects = map(compose(Rectangle.from_xywh, cv2.boundingRect), (cnts))
    rectangles = lmap(contour_to_rectangle, contours)
    rectangles = remove_included(rectangles)

    bounding_rects = map(cv2.boundingRect, cnts)
    rects: list[Rectangle] = remove_included(map(Rectangle.from_xywh, rects))

    return rects
    return rectangles


def is_likely_figure(min_area, max_area, max_width_to_height_ratio, cnts):
def is_likely_figure(min_area, max_area, max_width_to_height_ratio, contours):
    return (
        is_not_too_large(cnts, max_area)
        and is_large_enough(cnts, min_area)
        and has_acceptable_format(cnts, max_width_to_height_ratio)
        is_small_enough(contours, max_area)
        and is_large_enough(contours, min_area)
        and has_acceptable_format(contours, max_width_to_height_ratio)
    )
@@ -1,25 +1,33 @@
import cv2
import numpy as np

from cv_analysis.utils.common import find_contours_and_hierarchies


def detect_large_coherent_structures(image: np.ndarray):
    """Detects large coherent structures on an image.

def detect_large_coherent_structures(image: np.array):
    """Detects large coherent structures in an image.
    Expects an image with binary color space (e.g. threshold applied).

    Args:
        image (np.array): Image to look for large coherent structures in.

    Returns:
        contours
        list: List of contours.

    References:
        https://stackoverflow.com/questions/60259169/how-to-group-nearby-contours-in-opencv-python-zebra-crossing-detection
    """
    assert len(image.shape) == 2

    # FIXME: Parameterize via factory
    dilate_kernel = cv2.getStructuringElement(cv2.MORPH_OPEN, (5, 5))
    # FIXME: Parameterize via factory
    dilate = cv2.dilate(image, dilate_kernel, iterations=4)

    # FIXME: Parameterize via factory
    close_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1)
    # FIXME: Parameterize via factory
    close = cv2.morphologyEx(dilate, cv2.MORPH_CLOSE, close_kernel, iterations=1) # TODO: Tweak iterations

    cnts, _ = cv2.findContours(close, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    contours, _ = find_contours_and_hierarchies(close)

    return cnts
    return contours
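For orientation (illustration, not in the diff): the function asserts a single-channel binary input, so callers binarize first. A minimal call sketch with a hypothetical input path:

```python
import cv2

from cv_analysis.figure_detection.figures import detect_large_coherent_structures

image = cv2.imread("page.png", cv2.IMREAD_GRAYSCALE)  # hypothetical page image
# Binarize first: the function expects a thresholded, single-channel image
_, binary = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

contours = detect_large_coherent_structures(binary)
print(f"found {len(contours)} coherent structures")
```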
@@ -1,5 +1,7 @@
import cv2

from cv_analysis.utils.common import normalize_to_gray_scale


def remove_primary_text_regions(image):
    """Removes regions of primary text, i.e. main text body paragraphs, as opposed to e.g. figure descriptions.
@@ -35,6 +37,7 @@ def remove_primary_text_regions(image):

def apply_threshold_to_image(image):
    """Converts an image to black and white."""
    image = normalize_to_gray_scale(image)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
    return cv2.threshold(image, 253, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
80 cv_analysis/layout_parsing.py Normal file
@@ -0,0 +1,80 @@
from functools import partial
from typing import Iterable, List

import cv2
import numpy as np
from funcy import compose, rcompose, lkeep

from cv_analysis.utils import lstarkeep
from cv_analysis.utils.common import (
    find_contours_and_hierarchies,
    dilate_page_components,
    normalize_to_gray_scale,
    threshold_image,
    invert_image,
    fill_rectangles,
)
from cv_analysis.utils.conversion import contour_to_rectangle
from cv_analysis.utils.merging import merge_related_rectangles
from cv_analysis.utils.postprocessing import remove_included, has_no_parent
from cv_analysis.utils.rectangle import Rectangle


def parse_layout(image: np.array) -> List[Rectangle]:
    """Parse the layout of a page.

    Args:
        image: Image of the page.

    Returns:
        List of rectangles representing the layout of the page as identified page elements.
    """
    rectangles = rcompose(
        find_segments,
        remove_included,
        merge_related_rectangles,
        remove_included,
    )(image)

    return rectangles


def find_segments(image: np.ndarray) -> List[Rectangle]:
    """Find segments in a page. Segments are structural elements of a page, such as text blocks, tables, etc."""
    rectangles = rcompose(
        prepare_for_initial_detection,
        __find_segments,
        partial(prepare_for_meta_detection, image.copy()),
        __find_segments,
    )(image)

    return rectangles


def prepare_for_initial_detection(image: np.ndarray) -> np.ndarray:
    return compose(dilate_page_components, normalize_to_gray_scale)(image)


def __find_segments(image: np.ndarray) -> List[Rectangle]:
    def to_rectangle_if_valid(contour, hierarchy):
        return contour_to_rectangle(contour) if is_likely_segment(contour) and has_no_parent(hierarchy) else None

    rectangles = lstarkeep(to_rectangle_if_valid, zip(*find_contours_and_hierarchies(image)))

    return rectangles


def prepare_for_meta_detection(image: np.ndarray, rectangles: Iterable[Rectangle]) -> np.ndarray:
    image = rcompose(
        fill_rectangles,
        threshold_image,
        invert_image,
        normalize_to_gray_scale,
    )(image, rectangles)

    return image


def is_likely_segment(rectangle: Rectangle, min_area: float = 100) -> bool:
    # FIXME: Parameterize via factory
    return cv2.contourArea(rectangle, False) > min_area
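Aside (illustration, not in the diff): `funcy.rcompose` chains functions left to right, so the `parse_layout` pipeline above is equivalent to this explicit sequence:

```python
# Unrolled form of parse_layout's rcompose pipeline, for readability
def parse_layout_unrolled(image):
    rectangles = find_segments(image)                  # detect candidate segments
    rectangles = remove_included(rectangles)           # drop rectangles nested inside others
    rectangles = merge_related_rectangles(rectangles)  # fuse related neighbours
    rectangles = remove_included(rectangles)           # clean up again after merging
    return rectangles
```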
14 cv_analysis/locations.py Normal file
@@ -0,0 +1,14 @@
"""Defines constant paths relative to a root path."""

from pathlib import Path

MODULE_PATH = Path(__file__).resolve().parents[0]
PACKAGE_ROOT_PATH = MODULE_PATH.parents[0]
REPO_ROOT_PATH = PACKAGE_ROOT_PATH

TEST_DIR_PATH = REPO_ROOT_PATH / "test"
TEST_DATA_DIR = TEST_DIR_PATH / "data"
TEST_DATA_DIR_DVC = TEST_DIR_PATH / "data.dvc"
TEST_DATA_SYNTHESIS_DIR = TEST_DATA_DIR / "synthesis"
TEST_PAGE_TEXTURES_DIR = TEST_DATA_SYNTHESIS_DIR / "paper"
TEST_SMILES_FILE = TEST_DATA_SYNTHESIS_DIR / "smiles.csv"
84 cv_analysis/logging.py Normal file
@@ -0,0 +1,84 @@
import sys
from functools import wraps
from operator import attrgetter
from typing import Callable, Any

import loguru
from funcy import log_calls, log_enters, log_exits

logger = loguru.logger
logger.remove()

debug_logger = loguru.logger
debug_logger.add(
    sink=sys.stderr,
    format="<blue>{time:YYYY-MM-DD at HH:mm:ss}</blue> | <level>{level: <8}</level> | <cyan>{name}</cyan>: <level>{message}</level>",
    level="DEBUG",
)

dev_logger = loguru.logger
dev_logger.add(
    sink=sys.stderr,
    format="<green>{time:YYYY-MM-DD at HH:mm:ss}</green> | <level>{level: <8}</level> | <cyan>{name}</cyan>: <level>{message}</level>",
    level="DEBUG",
)

prod_logger = loguru.logger
prod_logger.add(
    sink=sys.stderr,
    format="<white>{time:YYYY-MM-DD at HH:mm:ss}</white> | <level>{level: <8}</level> | <cyan>{name}</cyan>: <level>{message}</level>",
    level="INFO",
    enqueue=True,
)

# logger.remove()
# logger.add(sink=sys.stderr, level="DEBUG", enqueue=True)


def __log(logger, level: str, enters=True, exits=True) -> Callable:
    print_func = get_print_func(logger, level)

    def dec():
        if enters and exits:
            fn = log_calls
        elif enters:
            fn = log_enters
        elif exits:
            fn = log_exits
        else:
            raise ValueError("Must log either enters or exits")

        return fn(print_func=print_func)

    def inner(func: Callable) -> Callable:
        @dec()
        @wraps(func)
        def inner(*args, **kwargs) -> Any:
            return func(*args, **kwargs)

        return inner

    return inner


def get_print_func(logger, level: str):
    return attrgetter(level.lower())(logger)


def debug_log(level: str = "TRACE", enters=True, exits=True) -> Callable:
    return __log(debug_logger, level, enters=enters, exits=exits)


def dev_log(level: str = "TRACE", enters=True, exits=True) -> Callable:
    return __log(dev_logger, level, enters=enters, exits=exits)


def prod_log(level: str = "TRACE", enters=True, exits=True) -> Callable:
    return __log(prod_logger, level, enters=enters, exits=exits)


def delay(fn, *args, **kwargs):
    def inner():
        return fn(*args, **kwargs)

    return inner
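A usage sketch (illustration, not in the diff): `__log` wraps `funcy`'s call loggers, so the exported decorators are applied like this:

```python
from cv_analysis.logging import debug_log

@debug_log(level="DEBUG")  # logs entry and exit of each call via debug_logger
def add(a: int, b: int) -> int:
    return a + b

add(1, 2)  # the wrapped call is logged to stderr at DEBUG level
```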
@@ -2,17 +2,17 @@ from functools import partial

import cv2
import numpy as np
from iteration_utilities import first, starfilter # type: ignore
from iteration_utilities import starfilter, first

from cv_analysis.utils.filters import is_boxy, is_filled, is_large_enough
from cv_analysis.utils.visual_logging import vizlogger
from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
from cv_analysis.utils.visual_logger import vizlogger


def is_likely_redaction(contour, hierarchy, min_area):
    return is_filled(hierarchy) and is_boxy(contour) and is_large_enough(contour, min_area)


def find_redactions(image: np.ndarray, min_normalized_area=200000):
def find_redactions(image: np.array, min_normalized_area=200000):
    vizlogger.debug(image, "redactions01_start.png")
    min_normalized_area /= 200 # Assumes 200 DPI PDF -> image conversion resolution

@@ -29,14 +29,13 @@ def find_redactions(image: np.ndarray, min_normalized_area=200000):
    contours, hierarchies = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)

    try:
        return list(
            map(
                first,
                starfilter(
                    partial(is_likely_redaction, min_area=min_normalized_area),
                    zip(contours, hierarchies[0]),
                ),
            )
        contours = map(
            first,
            starfilter(
                partial(is_likely_redaction, min_area=min_normalized_area),
                zip(contours, hierarchies[0]),
            ),
        )
        return list(contours)
    except:
        return []
60 cv_analysis/server/pipeline.py Normal file
@@ -0,0 +1,60 @@
from dataclasses import asdict
from operator import truth

from funcy import lmap, flatten

from cv_analysis.figure_detection.figure_detection import detect_figures
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.rectangle import Rectangle
from pdf2img.conversion import convert_pages_to_images
from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus


def make_analysis_pipeline_for_element_type(segment_type, **kwargs):
    if segment_type == "table":
        return make_analysis_pipeline(parse_tables, table_parsing_formatter, dpi=200, **kwargs)
    elif segment_type == "figure":
        return make_analysis_pipeline(detect_figures, figure_detection_formatter, dpi=200, **kwargs)
    else:
        raise ValueError(f"Unknown segment type {segment_type}.")


def make_analysis_pipeline(analysis_fn, formatter, dpi, skip_pages_without_images=False):
    def analysis_pipeline(pdf: bytes, index=None):
        def parse_page(page: ImagePlus):
            image = page.asarray()
            rectangles = analysis_fn(image)
            if not rectangles:
                return
            infos = formatter(rectangles, page, dpi)
            return infos

        pages = convert_pages_to_images(pdf, index=index, dpi=dpi, skip_pages_without_images=skip_pages_without_images)
        results = map(parse_page, pages)

        yield from flatten(filter(truth, results))

    return analysis_pipeline


def table_parsing_formatter(rectangles, page: ImagePlus, dpi):
    def format_rectangle(rectangle: Rectangle):
        rectangle_plus = RectanglePlus.from_pixels(*rectangle_to_xyxy(rectangle), page.info, alpha=False, dpi=dpi)
        return rectangle_plus.asdict(derotate=True)

    bboxes = lmap(format_rectangle, rectangles)

    return {"pageInfo": page.asdict(natural_index=True), "tableCells": bboxes}


def figure_detection_formatter(rectangles, page, dpi):
    def format_rectangle(rectangle: Rectangle):
        rect_plus = RectanglePlus.from_pixels(*rectangle_to_xyxy(rectangle), page.info, alpha=False, dpi=dpi)
        return asdict(ImageInfo(page.info, rect_plus.asbbox(derotate=False), rect_plus.alpha))

    return lmap(format_rectangle, rectangles)


def rectangle_to_xyxy(rectangle: Rectangle):
    return rectangle.x1, rectangle.y1, rectangle.x2, rectangle.y2
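For orientation, a hypothetical driver for the factory above (the file name is an assumption; the pipeline is a generator, so results stream lazily):

# Illustrative usage sketch only; "example.pdf" is a placeholder input.
pipeline = make_analysis_pipeline_for_element_type("table")

with open("example.pdf", "rb") as f:
    for page_result in pipeline(f.read()):  # one dict per page with detected cells
        print(page_result["pageInfo"], len(page_result["tableCells"]))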
cv_analysis/table_parsing.py (new file, 129 lines)
@@ -0,0 +1,129 @@
import cv2
import numpy as np
from funcy import lmap, lfilter

from cv_analysis.layout_parsing import parse_layout
from cv_analysis.utils.conversion import box_to_rectangle
from cv_analysis.utils.postprocessing import remove_isolated
from cv_analysis.utils.visual_logger import vizlogger


def add_external_contours(image, image_h_w_lines_only):
    contours, _ = cv2.findContours(image_h_w_lines_only, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
    for cnt in contours:
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(image, (x, y), (x + w, y + h), 255, 1)

    return image


def apply_motion_blur(image: np.array, angle, size=80):
    """Solidifies and slightly extends detected lines.

    Args:
        image (np.array): page image as array
        angle: direction in which to apply blur, 0 or 90
        size (int): kernel size; 80 found empirically to work well

    Returns:
        np.ndarray
    """
    k = np.zeros((size, size), dtype=np.float32)
    vizlogger.debug(k, "tables08_blur_kernel1.png")
    k[(size - 1) // 2, :] = np.ones(size, dtype=np.float32)
    vizlogger.debug(k, "tables09_blur_kernel2.png")
    k = cv2.warpAffine(
        k,
        cv2.getRotationMatrix2D((size / 2 - 0.5, size / 2 - 0.5), angle, 1.0),
        (size, size),
    )
    vizlogger.debug(k, "tables10_blur_kernel3.png")
    k = k * (1.0 / np.sum(k))
    vizlogger.debug(k, "tables11_blur_kernel4.png")
    blurred = cv2.filter2D(image, -1, k)
    return blurred


def isolate_vertical_and_horizontal_components(img_bin):
    """Identifies and reinforces horizontal and vertical lines in a binary image.

    Args:
        img_bin (np.array): array corresponding to single binarized page image

    Returns:
        np.ndarray
    """
    line_min_width = 48
    kernel_h = np.ones((1, line_min_width), np.uint8)
    kernel_v = np.ones((line_min_width, 1), np.uint8)

    img_bin_h = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_h)
    img_bin_v = cv2.morphologyEx(img_bin, cv2.MORPH_OPEN, kernel_v)
    img_lines_raw = img_bin_v | img_bin_h

    kernel_h = np.ones((1, 30), np.uint8)
    kernel_v = np.ones((30, 1), np.uint8)
    img_bin_h = cv2.dilate(img_bin_h, kernel_h, iterations=2)
    img_bin_v = cv2.dilate(img_bin_v, kernel_v, iterations=2)

    img_bin_h = apply_motion_blur(img_bin_h, 0)
    img_bin_v = apply_motion_blur(img_bin_v, 90)

    img_bin_extended = img_bin_h | img_bin_v

    th1, img_bin_extended = cv2.threshold(img_bin_extended, 120, 255, cv2.THRESH_BINARY)
    img_bin_final = cv2.dilate(img_bin_extended, np.ones((1, 1), np.uint8), iterations=1)
    # add contours before lines are extended by blurring
    img_bin_final = add_external_contours(img_bin_final, img_lines_raw)

    return img_bin_final


def find_table_layout_boxes(image: np.array):
    def is_large_enough(box):
        (x, y, w, h) = box
        if w * h >= 100000:
            return box_to_rectangle(box)

    layout_boxes = parse_layout(image)
    return lmap(is_large_enough, layout_boxes)


def preprocess(image: np.array):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
    _, image = cv2.threshold(image, 195, 255, cv2.THRESH_BINARY)
    return ~image


def turn_connected_components_into_rectangles(image: np.array):
    def is_large_enough(stat):
        x1, y1, w, h, area = stat
        return area > 2000 and w > 35 and h > 25

    _, _, stats, _ = cv2.connectedComponentsWithStats(~image, connectivity=8, ltype=cv2.CV_32S)

    stats = lfilter(is_large_enough, stats)
    if stats:
        stats = np.vstack(stats)
        return stats[:, :-1][2:]
    return []


def parse_tables(image: np.array):
    """Runs the full table parsing process.

    Args:
        image (np.array): single PDF page, converted to a numpy array

    Returns:
        list: list of rectangles corresponding to table cells
    """
    image = preprocess(image)
    image = isolate_vertical_and_horizontal_components(image)
    boxes = turn_connected_components_into_rectangles(image)
    rectangles = lmap(box_to_rectangle, boxes)
    rectangles = remove_isolated(rectangles)

    return rectangles
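To make the kernel construction in apply_motion_blur concrete, here is what it produces for a toy size of 5 and angle 0 (illustrative only; the production kernel uses size=80 and is additionally rotated via warpAffine):

import numpy as np

size = 5
k = np.zeros((size, size), dtype=np.float32)
k[(size - 1) // 2, :] = 1.0   # a horizontal line of ones through the middle row
k /= k.sum()                  # normalize so filtering preserves overall brightness
print(k)
# row 2 is [0.2 0.2 0.2 0.2 0.2]; every other entry is 0, so cv2.filter2D
# smears each pixel horizontally over 5 columns, thickening horizontal lines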
@@ -1,13 +1,13 @@
def make_art():
    art = r"""
__
_ |@@|
__
_ |@@|
/ \ \--/ __ .__ .__
) O|----| | __ ___ __ _____ ____ _____ | | ___.__. _____|__| ______
/ / \ }{ /\ )_ / _\\ \/ / ______ \__ \ / \\__ \ | | | | |/ ___/ |/ ___/
)/ /\__/\ \__O (__ \ / /_____/ / __ \| | \/ __ \| |_\___ |\___ \| |\___ \
|/ (--/\--) \__/ \_/ (______/___|__(______/____/\____/_____/|__/_____/
/ _)( )(_
`---''---`
|/ (--/\--) \__/ \_/ (______/___|__(______/____/\____/_____/|__/_____/
/ _)( )(_
`---''---`
"""
    return art
cv_analysis/utils/common.py (new file, 51 lines)
@@ -0,0 +1,51 @@
from functools import reduce
from typing import Iterable

import cv2
import numpy as np
from funcy import first

from cv_analysis.utils.rectangle import Rectangle


def find_contours_and_hierarchies(image):
    contours, hierarchies = cv2.findContours(image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    return contours, first(hierarchies) if hierarchies is not None else None


def dilate_page_components(image: np.ndarray) -> np.ndarray:
    # FIXME: Parameterize via factory
    image = cv2.GaussianBlur(image, (7, 7), 0)
    # FIXME: Parameterize via factory
    thresh = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # FIXME: Parameterize via factory
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5, 5))
    # FIXME: Parameterize via factory
    dilate = cv2.dilate(thresh, kernel, iterations=4)
    return dilate


def normalize_to_gray_scale(image: np.ndarray) -> np.ndarray:
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) > 2 else image
    return image


def threshold_image(image: np.ndarray) -> np.ndarray:
    # FIXME: Parameterize via factory
    _, image = cv2.threshold(image, 254, 255, cv2.THRESH_BINARY)
    return image


def invert_image(image: np.ndarray):
    return ~image


def fill_rectangles(image: np.ndarray, rectangles: Iterable[Rectangle]) -> np.ndarray:
    image = reduce(fill_in_component_area, rectangles, image)
    return image


def fill_in_component_area(image: np.ndarray, rectangle: Rectangle) -> np.ndarray:
    cv2.rectangle(image, (rectangle.x1, rectangle.y1), (rectangle.x2, rectangle.y2), (0, 0, 0), -1)
    cv2.rectangle(image, (rectangle.x1, rectangle.y1), (rectangle.x2, rectangle.y2), (255, 255, 255), 7)
    return image
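A small sketch (not from the repo; the page array and coordinates are made up) of how fill_rectangles folds fill_in_component_area over a batch:

import numpy as np
from cv_analysis.utils.rectangle import Rectangle

page = np.full((100, 100), 127, dtype=np.uint8)  # dummy gray page
masked = fill_rectangles(page, [Rectangle(10, 10, 40, 40), Rectangle(50, 50, 90, 90)])
# each rectangle is filled black, then traced with a thick white border
assert masked[25, 25] == 0 and masked[70, 70] == 0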
cv_analysis/utils/conversion.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import json
from typing import Sequence, Union

import cv2
import numpy as np
from PIL import Image

from cv_analysis.utils.rectangle import Rectangle

Image_t = Union[Image.Image, np.ndarray]


def contour_to_rectangle(contour):
    return box_to_rectangle(cv2.boundingRect(contour))


def box_to_rectangle(box: Sequence[int]) -> Rectangle:
    x, y, w, h = box
    return Rectangle(x, y, x + w, y + h)


def rectangle_to_box(rectangle: Rectangle) -> Sequence[int]:
    return [rectangle.x1, rectangle.y1, rectangle.width, rectangle.height]


class RectangleJSONEncoder(json.JSONEncoder):
    def __init__(self, *args, **kwargs):
        json.JSONEncoder.__init__(self, *args, **kwargs)
        self._replacement_map = {}

    def default(self, o):
        if isinstance(o, Rectangle):
            return {"x1": o.x1, "x2": o.x2, "y1": o.y1, "y2": o.y2}
        else:
            return json.JSONEncoder.default(self, o)

    def encode(self, o):
        result = json.JSONEncoder.encode(self, o)
        return result


def normalize_image_format_to_array(image: Image_t):
    return np.array(image).astype(np.uint8) if isinstance(image, Image.Image) else image


def normalize_image_format_to_pil(image: Image_t):
    return Image.fromarray(image.astype(np.uint8)) if isinstance(image, np.ndarray) else image
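Usage sketch for the encoder above (values illustrative):

import json
from cv_analysis.utils.rectangle import Rectangle

cell = Rectangle(10, 20, 110, 220)
print(json.dumps({"cell": cell}, cls=RectangleJSONEncoder))
# {"cell": {"x1": 10, "x2": 110, "y1": 20, "y2": 220}}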
cv_analysis/utils/display.py (new file, 51 lines)
@@ -0,0 +1,51 @@
import cv2
import numpy as np
from PIL import Image
from PIL.Image import Image as Image_t
from matplotlib import pyplot as plt

from cv_analysis.utils.conversion import normalize_image_format_to_array


def show_image(image, backend="mpl", **kwargs):
    image = normalize_image_format_to_array(image)
    if backend == "mpl":
        show_image_mpl(image, **kwargs)
    elif backend == "cv2":
        show_image_cv2(image, **kwargs)
    elif backend == "pil":
        Image.fromarray(image).show()
    else:
        raise ValueError(f"Unknown backend: {backend}")


def show_image_cv2(image, maxdim=700, **kwargs):
    h, w, c = image.shape
    maxhw = max(h, w)
    if maxhw > maxdim:
        ratio = maxdim / maxhw
        h = int(h * ratio)
        w = int(w * ratio)

    img = cv2.resize(image, (w, h))  # cv2.resize expects dsize as (width, height)
    cv2.imshow("", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()


def show_image_mpl(image, **kwargs):
    if isinstance(image, Image_t):
        # noinspection PyTypeChecker
        image = np.array(image)
    # noinspection PyArgumentList
    assert image.max() <= 255
    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(20, 20)
    assert image.dtype == np.uint8
    ax.imshow(image, cmap="gray")
    ax.title.set_text(kwargs.get("title", ""))
    plt.show()


def save_image(image, path):
    cv2.imwrite(path, image)
@@ -1,18 +1,23 @@
from typing import Union

import cv2
import numpy as np
from PIL import Image

from cv_analysis.utils import copy_and_normalize_channels


-def draw_contours(image, contours, color=None, annotate=False):
+def draw_contours(image, contours):
    image = copy_and_normalize_channels(image)

-    for cont in contours:
-        cv2.drawContours(image, cont, -1, (0, 255, 0), 4)
+    for contour in contours:
+        cv2.drawContours(image, contour, -1, (0, 255, 0), 4)

    return image


-def draw_rectangles(image, rectangles, color=None, annotate=False):
+def draw_rectangles(image: Union[np.ndarray, Image.Image], rectangles, color=None, annotate=False, filled=False):
    def annotate_rect(x, y, w, h):
        cv2.putText(
            image,
@@ -20,18 +25,18 @@ def draw_rectangles(image, rectangles, color=None, annotate=False):
            (x + (w // 2) - 12, y + (h // 2) + 9),
            cv2.FONT_HERSHEY_SIMPLEX,
            1,
-            (0, 255, 0),
+            (0, 255, 0, 255),
            2,
        )

    image = copy_and_normalize_channels(image)

    if not color:
-        color = (0, 255, 0)
+        color = (0, 255, 0, 255)

    for rect in rectangles:
        x, y, w, h = rect
-        cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
+        cv2.rectangle(image, (x, y), (x + w, y + h), color, -1 if filled else 1)
        if annotate:
            annotate_rect(x, y, w, h)
@@ -5,7 +5,7 @@ def is_large_enough(cont, min_area):
    return cv2.contourArea(cont, False) > min_area


-def is_not_too_large(cnt, max_area):
+def is_small_enough(cnt, max_area):
    return cv2.contourArea(cnt, False) < max_area
cv_analysis/utils/geometric.py (new file, 13 lines)
@@ -0,0 +1,13 @@
from cv_analysis.utils.rectangle import Rectangle


def is_square_like(box: Rectangle):
    return box.width / box.height > 0.5 and box.height / box.width > 0.5


def is_wide(box: Rectangle):
    return box.width / box.height > 1.5


def is_tall(box: Rectangle):
    return box.height / box.width > 1.5
cv_analysis/utils/image_operations.py (new file, 115 lines)
@@ -0,0 +1,115 @@
from typing import Tuple

import cv2 as cv
import numpy as np
from PIL import ImageOps, Image
from loguru import logger

from cv_analysis.utils.conversion import normalize_image_format_to_pil

Color = Tuple[int, int, int]


def blur(image: np.ndarray):
    return cv.blur(image, (3, 3))


def sharpen(image: np.ndarray):
    return cv.filter2D(image, -1, np.array([[-1, -1, -1], [-1, 6, -1], [-1, -1, -1]]))


def overlay(images, mode=np.sum):
    assert mode in [np.sum, np.max]
    images = np.stack(list(images))
    image = mode(images, axis=0)
    image = (image / image.max() * 255).astype(np.uint8)
    return image


def tint_image(src, color="#FFFFFF"):
    src.load()
    r, g, b, alpha = src.split()
    gray = ImageOps.grayscale(src)
    result = ImageOps.colorize(gray, (0, 0, 0), color)
    result.putalpha(alpha)
    return result


def color_shift_array(image: np.ndarray, color: Color):
    """Weights the three channels of an image by the given color tuple."""
    assert image.ndim == 3
    assert image.shape[-1] == 3
    assert isinstance(color, tuple)
    assert max(color) <= 255
    assert image.max() <= 255

    color = np.array(color)
    weights = color / color.sum() / 10
    assert max(weights) <= 1

    colored = (image * weights).astype(np.uint8)

    assert colored.shape == image.shape

    return colored


def superimpose(
    base_image: Image,
    image_to_superimpose: Image,
    crop_to_content=True,
    pad=True,
) -> Image:
    """Superimposes an image with transparency onto another image.

    Args:
        base_image: The page image.
        image_to_superimpose: The texture image.
        crop_to_content: If True, the texture will be cropped to content (i.e. the bounding box of all
            non-transparent parts of the texture image).
        pad: If True, the texture will be padded to the size of the page.

    Returns:
        Image where the texture is superimposed onto the page.
    """
    base_image = normalize_image_format_to_pil(base_image)
    image_to_superimpose = normalize_image_format_to_pil(image_to_superimpose)

    if crop_to_content:
        image_to_superimpose = image_to_superimpose.crop(image_to_superimpose.getbbox())

    if base_image.size != image_to_superimpose.size:
        logger.trace(f"Size of page and texture do not match: {base_image.size} != {image_to_superimpose.size}")
        if pad:
            logger.trace(f"Padding texture before pasting to fit size {base_image.size}")
            image_to_superimpose = pad_image_to_size(image_to_superimpose, base_image.size)
        else:
            logger.trace(f"Resizing texture before pasting to fit size {base_image.size}")
            image_to_superimpose = image_to_superimpose.resize(base_image.size)

    assert base_image.size == image_to_superimpose.size
    assert image_to_superimpose.mode == "RGBA"

    base_image.paste(image_to_superimpose, (0, 0), image_to_superimpose)
    return base_image


def pad_image_to_size(image: Image, size: Tuple[int, int]) -> Image:
    """Pads an image to a given size."""
    if image.size == size:
        return image

    if image.size[0] > size[0] or image.size[1] > size[1]:
        raise ValueError(f"Image size {image.size} is larger than target size {size}.")

    padded = Image.new(image.mode, size, color=255)

    pasting_coords = compute_pasting_coordinates(image, padded)
    assert image.mode == "RGBA"
    padded.paste(image, pasting_coords)
    return padded


def compute_pasting_coordinates(smaller: Image, larger: Image.Image):
    """Computes the coordinates for centrally pasting a smaller image onto a larger image."""
    return abs(larger.width - smaller.width) // 2, abs(larger.height - smaller.height) // 2
cv_analysis/utils/input.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from numpy import array, ndarray
import pdf2image
from PIL import Image

from cv_analysis.utils.preprocessing import preprocess_page_array


def open_analysis_input_file(path_or_bytes, first_page=1, last_page=None):
    assert first_page > 0, "Page numbers are 1-based."
    assert last_page is None or last_page >= first_page, "last_page must be greater than or equal to first_page."

    last_page = last_page or first_page

    if type(path_or_bytes) == str:
        if path_or_bytes.lower().endswith((".png", ".jpg", ".jpeg")):
            pages = [Image.open(path_or_bytes)]
        elif path_or_bytes.lower().endswith(".pdf"):
            pages = pdf2image.convert_from_path(path_or_bytes, first_page=first_page, last_page=last_page)
        else:
            raise IOError("Invalid file extension. Accepted filetypes: .png, .jpg, .jpeg, .pdf")
    elif type(path_or_bytes) == bytes:
        pages = pdf2image.convert_from_bytes(path_or_bytes, first_page=first_page, last_page=last_page)
    elif type(path_or_bytes) in {list, ndarray}:
        return path_or_bytes

    pages = [preprocess_page_array(array(p)) for p in pages]

    return pages
cv_analysis/utils/merging.py (new file, 54 lines)
@@ -0,0 +1,54 @@
from functools import reduce
from itertools import combinations
from typing import List, Tuple, Set

from funcy import all

from cv_analysis.utils import until, make_merger_sentinel
from cv_analysis.utils.rectangle import Rectangle
from cv_analysis.utils.spacial import related


def merge_related_rectangles(rectangles: List[Rectangle]) -> List[Rectangle]:
    """Merges rectangles that are related to each other, iterating on partial merge results until no more
    mergers are possible."""
    assert isinstance(rectangles, list)
    no_new_merges = make_merger_sentinel()
    return until(no_new_merges, merge_rectangles_once, rectangles)


def merge_rectangles_once(rectangles: List[Rectangle]) -> List[Rectangle]:
    """Merges rectangles that are related to each other, but does not iterate on the results."""
    rectangles = set(rectangles)
    merged, used = reduce(merge_if_related, combinations(rectangles, 2), (set(), set()))

    return list(merged | rectangles - used)


T = Tuple[Set[Rectangle], Set[Rectangle]]
V = Tuple[Rectangle, Rectangle]


def merge_if_related(merged_and_used_so_far: T, rectangle_pair: V) -> T:
    """Merges two rectangles if they are related, otherwise returns the accumulator unchanged."""
    alpha, beta = rectangle_pair
    merged, used = merged_and_used_so_far

    def unused(*args) -> bool:
        return not used & {*args}

    if all(unused, (alpha, beta)) and related(alpha, beta):
        return merged | {bounding_rect(alpha, beta)}, used | {alpha, beta}

    else:
        return merged, used


def bounding_rect(alpha: Rectangle, beta: Rectangle) -> Rectangle:
    """Returns the smallest rectangle that contains both rectangles."""
    return Rectangle(
        min(alpha.x1, beta.x1),
        min(alpha.y1, beta.y1),
        max(alpha.x2, beta.x2),
        max(alpha.y2, beta.y2),
    )
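To illustrate the fixpoint behaviour (coordinates made up): two overlapping rectangles collapse into their bounding rectangle, and merging repeats until the rectangle count stops shrinking:

from cv_analysis.utils.rectangle import Rectangle

rects = [Rectangle(0, 0, 10, 10), Rectangle(5, 5, 20, 20), Rectangle(100, 100, 110, 110)]
print(merge_related_rectangles(rects))
# -> [Rectangle(0, 0, 20, 20), Rectangle(100, 100, 110, 110)] (set-based, order not guaranteed)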
cv_analysis/utils/metrics.py (new file, 56 lines)
@@ -0,0 +1,56 @@
from functools import reduce
from operator import itemgetter
from typing import Iterable

import numpy as np
from funcy import lmap, lpluck, first

from cv_analysis.utils import lift
from cv_analysis.utils.rectangle import Rectangle


def compute_document_score(result_dict, ground_truth_dicts):
    extract_cells = lambda dicts: lpluck("cells", dicts["pages"])

    cells_per_ground_truth_page, cells_per_result_page = map(extract_cells, (ground_truth_dicts, result_dict))
    cells_on_page_to_rectangles = lift(rectangle_from_dict)
    cells_on_pages_to_rectangles = lift(cells_on_page_to_rectangles)

    rectangles_per_ground_truth_page, rectangles_per_result_page = map(
        cells_on_pages_to_rectangles, (cells_per_ground_truth_page, cells_per_result_page)
    )

    scores = lmap(compute_page_iou, rectangles_per_result_page, rectangles_per_ground_truth_page)

    n_cells_per_page = np.array(lmap(len, cells_per_ground_truth_page))
    document_score = np.average(scores, weights=n_cells_per_page / n_cells_per_page.sum())

    return document_score


def rectangle_from_dict(d):
    x1, y1, w, h = itemgetter("x", "y", "width", "height")(d)
    return Rectangle(x1, y1, x1 + w, y1 + h)


def compute_page_iou(predicted_rectangles: Iterable[Rectangle], true_rectangles: Iterable[Rectangle]):
    def find_best_iou(sum_so_far_and_candidate_rectangles, true_rectangle):
        sum_so_far, predicted_rectangles = sum_so_far_and_candidate_rectangles
        best_match, best_iou = find_max_overlap(true_rectangle, predicted_rectangles)
        return sum_so_far + best_iou, predicted_rectangles - {best_match}

    predicted_rectangles = set(predicted_rectangles)
    true_rectangles = set(true_rectangles)

    iou_sum = first(reduce(find_best_iou, true_rectangles, (0, predicted_rectangles)))
    normalizing_factor = 1 / max(len(predicted_rectangles), len(true_rectangles))
    score = normalizing_factor * iou_sum

    return score


def find_max_overlap(rectangle: Rectangle, candidate_rectangles: Iterable[Rectangle]):
    best_candidate_rectangle = max(candidate_rectangles, key=rectangle.iou)
    iou = rectangle.iou(best_candidate_rectangle)
    return best_candidate_rectangle, iou
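A worked example of the page score (numbers made up): a perfect match plus one spurious prediction halves the score, because the IoU sum is normalized by the larger of the two set sizes:

from cv_analysis.utils.rectangle import Rectangle

predicted = [Rectangle(0, 0, 10, 10), Rectangle(50, 50, 60, 60)]
truth = [Rectangle(0, 0, 10, 10)]
print(compute_page_iou(predicted, truth))  # (1.0 + nothing) / max(2, 1) -> 0.5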
cv_analysis/utils/morphing.py (new file, 38 lines)
@@ -0,0 +1,38 @@
from typing import Tuple

from PIL import Image
from loguru import logger

from cv_analysis.utils.image_operations import compute_pasting_coordinates
from cv_analysis.utils.rectangle import Rectangle
from synthesis.segment.content_rectangle import ContentRectangle


def shrink_rectangle(rectangle: Rectangle, factor: float) -> Rectangle:
    x1, y1, x2, y2 = compute_scaled_coordinates(rectangle, (1 - factor))

    logger.trace(f"Shrinking {rectangle} by {factor} to ({x1}, {y1}, {x2}, {y2}).")

    assert x1 >= rectangle.x1
    assert y1 >= rectangle.y1
    assert x2 <= rectangle.x2
    assert y2 <= rectangle.y2

    shrunk_rectangle = Rectangle(x1, y1, x2, y2)

    if isinstance(rectangle, ContentRectangle):  # TODO: Refactor
        shrunk_rectangle = ContentRectangle(*shrunk_rectangle.coords, rectangle.content)

    return shrunk_rectangle


def compute_scaled_coordinates(rectangle: Rectangle, factor: float) -> Tuple[int, int, int, int]:
    # FIXME: Refactor: Using image to compute coordinates is not clean
    image = Image.new("RGBA", (rectangle.width, rectangle.height))
    scaled = image.resize((int(rectangle.width * factor), int(rectangle.height * factor)))

    x1, y1 = compute_pasting_coordinates(scaled, image)
    x1 = rectangle.x1 + x1
    y1 = rectangle.y1 + y1
    x2, y2 = x1 + scaled.width, y1 + scaled.height
    return x1, y1, x2, y2
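Quick arithmetic check for the shrink helper (coordinates made up): shrinking a 100 x 100 rectangle by 0.2 scales it to 80 x 80 and recenters it inside the original bounds:

from cv_analysis.utils.rectangle import Rectangle

print(shrink_rectangle(Rectangle(0, 0, 100, 100), factor=0.2))
# -> Rectangle(10, 10, 90, 90): offset (100 - 80) // 2 = 10 on each axis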
@@ -1,9 +1,10 @@
from collections import namedtuple
from functools import partial
-from itertools import compress, starmap
-from typing import Iterable, List
+from itertools import starmap, compress
+from typing import Iterable, List, Sequence

-from cv_analysis.utils.structures import Rectangle
+from funcy import lremove
+
+from cv_analysis.utils.rectangle import Rectangle


def remove_overlapping(rectangles: Iterable[Rectangle]) -> List[Rectangle]:

@@ -18,15 +19,28 @@ def remove_overlapping(rectangles: Iterable[Rectangle]) -> List[Rectangle]:


def remove_included(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
-    keep = [rect for rect in rectangles if not rect.is_included(rectangles)]
-    return keep
+    rectangles_to_keep = [rect for rect in rectangles if not rect.is_included(rectangles)]
+    return rectangles_to_keep
+
+
+def remove_small(boxes: Iterable[Rectangle], page_width, page_height, min_percentage=0.13) -> List[Rectangle]:
+    min_width = page_width * min_percentage
+    min_height = page_height * min_percentage
+
+    def small(box: Rectangle):
+        return box.width < min_width or box.height < min_height
+
+    return lremove(small, boxes)


def __remove_isolated_unsorted(rectangles: Iterable[Rectangle]) -> List[Rectangle]:
    def is_connected(rect: Rectangle, rectangles: Iterable[Rectangle]):
        return any(rect.adjacent(rect2) for rect2 in rectangles if not rect == rect2)

-    rectangles = list(filter(partial(is_connected, rectangles=list(rectangles)), rectangles))
+    if not isinstance(rectangles, list):
+        rectangles = list(rectangles)
+
+    rectangles = list(filter(partial(is_connected, rectangles=rectangles), rectangles))
    return rectangles

@@ -43,9 +57,9 @@ def __remove_isolated_sorted(rectangles: Iterable[Rectangle]) -> List[Rectangle]
    return rectangles


-def remove_isolated(rectangles: Iterable[Rectangle], input_unsorted=True) -> List[Rectangle]:
+def remove_isolated(rectangles: Iterable[Rectangle], input_unsorted: bool = True) -> List[Rectangle]:
    return (__remove_isolated_unsorted if input_unsorted else __remove_isolated_sorted)(rectangles)


-def has_no_parent(hierarchy):
+def has_no_parent(hierarchy: Sequence[int]) -> bool:
    return hierarchy[-1] <= 0
@@ -1,5 +1,5 @@
-import cv2
from numpy import frombuffer, ndarray
+import cv2


def preprocess_page_array(page):

@@ -10,6 +10,7 @@ def preprocess_page_array(page):


def page2image(page):
+
    if type(page) == bytes:
        page = frombuffer(page)
    elif type(page) == ndarray:
cv_analysis/utils/rectangle.py (new file, 99 lines)
@@ -0,0 +1,99 @@
# See https://stackoverflow.com/a/33533514
from __future__ import annotations

from typing import Iterable, Union

from funcy import identity

from cv_analysis.utils.spacial import adjacent, contains, intersection, iou, area, is_contained, shift

Coord = Union[int, float]


class Rectangle:
    def __init__(self, x1, y1, x2, y2, discrete=True):
        """Creates a rectangle from two points."""
        nearest_valid = int if discrete else identity

        self.__x1 = nearest_valid(x1)
        self.__y1 = nearest_valid(y1)
        self.__x2 = nearest_valid(x2)
        self.__y2 = nearest_valid(y2)

    def __repr__(self):
        return f"Rectangle({self.x1}, {self.y1}, {self.x2}, {self.y2})"

    @property
    def x1(self):
        return self.__x1

    @property
    def x2(self):
        return self.__x2

    @property
    def y1(self):
        return self.__y1

    @property
    def y2(self):
        return self.__y2

    @property
    def width(self):
        return abs(self.x2 - self.x1)

    @property
    def height(self):
        return abs(self.y2 - self.y1)

    @property
    def coords(self):
        return [self.x1, self.y1, self.x2, self.y2]

    @property
    def size(self):
        return self.width, self.height

    def __hash__(self):
        return hash((self.x1, self.y1, self.x2, self.y2))

    def __iter__(self):
        yield self.x1
        yield self.y1
        yield self.width
        yield self.height

    def area(self):
        """Calculates the area of this rectangle."""
        return area(self)

    def intersection(self, other):
        """Calculates the intersection of this and the given other rectangle."""
        return intersection(self, other)

    def iou(self, other: Rectangle):
        """Calculates the intersection over union of this and the given other rectangle."""
        return iou(self, other)

    def includes(self, other: Rectangle, tol=3):
        """Checks if this rectangle contains the given other."""
        return contains(self, other, tol)

    def is_included(self, rectangles: Iterable[Rectangle]):
        """Checks if this rectangle is contained by any of the given rectangles."""
        return is_contained(self, rectangles)

    def adjacent(self, other: Rectangle, tolerance=7):
        """Checks if this rectangle is adjacent to the given other."""
        return adjacent(self, other, tolerance)

    def shift(self, dx, dy):
        """Shifts this rectangle by the given amount."""
        x1, y1, x2, y2 = shift(self, dx, dy)
        self.__x1 = x1
        self.__y1 = y1
        self.__x2 = x2
        self.__y2 = y2

        return self
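A short usage sketch of the value object above (coordinates illustrative; note that __iter__ yields x, y, width, height rather than the two corner points):

r = Rectangle(10.6, 20.2, 110, 220)  # discrete=True snaps the floats to ints
print(r)               # Rectangle(10, 20, 110, 220)
print(r.size)          # (100, 200)
x, y, w, h = r         # unpacks via __iter__
print(r.shift(5, -5))  # mutates in place -> Rectangle(15, 15, 115, 215)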
cv_analysis/utils/spacial.py (new file, 294 lines)
@@ -0,0 +1,294 @@
# See https://stackoverflow.com/a/39757388
from __future__ import annotations

from functools import lru_cache
from operator import attrgetter
from typing import TYPE_CHECKING, Iterable

from funcy import juxt, rpartial, compose, lflatten, first, second

from cv_analysis.utils import lift

if TYPE_CHECKING:
    from cv_analysis.utils.rectangle import Rectangle


def adjacent(alpha: Rectangle, beta: Rectangle, tolerance=7, strict=False):
    """Checks if the two rectangles are adjacent to each other.

    Args:
        alpha: The first rectangle.
        beta: The second rectangle.
        tolerance: The maximum distance between the two rectangles.
        strict: If True, the rectangles must be adjacent along one axis and contained within the other axis.
            Else, the rectangles must be adjacent along one axis and overlapping the other axis.

    Returns:
        True if the two rectangles are adjacent to each other, False otherwise.
    """
    select_strictness_variant = first if strict else second
    test_candidates = [
        # +---+
        # |   | +---+
        # | a | | b |
        # |   | +___+
        # +___+
        (right_left_aligned_and_vertically_contained, right_left_aligned_and_vertically_overlapping),
        #       +---+
        # +---+ |   |
        # | b | | a |
        # +___+ |   |
        #       +___+
        (left_right_aligned_and_vertically_contained, left_right_aligned_and_vertically_overlapping),
        # +-----------+
        # |     a     |
        # +___________+
        #   +-----+
        #   |  b  |
        #   +_____+
        (bottom_top_aligned_and_horizontally_contained, bottom_top_aligned_and_horizontally_overlapping),
        #   +-----+
        #   |  b  |
        #   +_____+
        # +-----------+
        # |     a     |
        # +___________+
        (top_bottom_aligned_and_horizontally_contained, top_bottom_aligned_and_horizontally_overlapping),
    ]

    tests = map(select_strictness_variant, test_candidates)
    return any(juxt(*tests)(alpha, beta, tolerance))


def right_left_aligned_and_vertically_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is left of the other within a tolerance and also overlaps the other's y range."""
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha.x2, beta.x1, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )


def left_right_aligned_and_vertically_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is right of the other within a tolerance and also overlaps the other's y range."""
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha.x1, beta.x2, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )


def bottom_top_aligned_and_horizontally_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is above the other within a tolerance and also overlaps the other's x range."""
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha.y2, beta.y1, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )


def top_bottom_aligned_and_horizontally_overlapping(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is below the other within a tolerance and also overlaps the other's x range."""
    return adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
        alpha.y1, beta.y2, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )


def right_left_aligned_and_vertically_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is left of the other within a tolerance and also contains the other's y range."""
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha.x2, beta.x1, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )


def left_right_aligned_and_vertically_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is right of the other within a tolerance and also contains the other's y range."""
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha.x1, beta.x2, beta.y1, beta.y2, alpha.y1, alpha.y2, tolerance=tol
    )


def bottom_top_aligned_and_horizontally_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is above the other within a tolerance and also contains the other's x range."""
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha.y2, beta.y1, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )


def top_bottom_aligned_and_horizontally_contained(alpha: Rectangle, beta: Rectangle, tol):
    """Checks if the first rectangle is below the other within a tolerance and also contains the other's x range."""
    return adjacent_along_one_axis_and_contained_within_perpendicular_axis(
        alpha.y1, beta.y2, beta.x1, beta.x2, alpha.x1, alpha.x2, tolerance=tol
    )


def adjacent_along_one_axis_and_overlapping_along_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
):
    """Checks if two points are adjacent along one axis and two other points overlap a range along the
    perpendicular axis.
    """
    return adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
        axis_0_point_1,
        axis_1_point_2,
        axis_1_contained_point_1,
        axis_1_contained_point_2,
        axis_1_lower_bound,
        axis_1_upper_bound,
        tolerance,
        mode="overlapping",
    )


def adjacent_along_one_axis_and_contained_within_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
):
    """Checks if two points are adjacent along one axis and two other points are contained in a range along the
    perpendicular axis.
    """
    return adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
        axis_0_point_1,
        axis_1_point_2,
        axis_1_contained_point_1,
        axis_1_contained_point_2,
        axis_1_lower_bound,
        axis_1_upper_bound,
        tolerance,
        mode="contained",
    )


def adjacent_along_one_axis_and_overlapping_or_contained_along_perpendicular_axis(
    axis_0_point_1,
    axis_1_point_2,
    axis_1_contained_point_1,
    axis_1_contained_point_2,
    axis_1_lower_bound,
    axis_1_upper_bound,
    tolerance,
    mode,
):
    """Checks if two points are adjacent along one axis and two other points overlap a range along the perpendicular
    axis or are contained in that range, depending on the mode specified.
    """
    assert mode in ["overlapping", "contained"]
    quantifier = any if mode == "overlapping" else all
    return all(
        [
            abs(axis_0_point_1 - axis_1_point_2) <= tolerance,
            quantifier(
                [
                    axis_1_lower_bound <= p <= axis_1_upper_bound
                    for p in [axis_1_contained_point_1, axis_1_contained_point_2]
                ]
            ),
        ]
    )


def contains(alpha: Rectangle, beta: Rectangle, tol=3):
    """Checks if the first rectangle contains the second rectangle."""
    return (
        beta.x1 + tol >= alpha.x1
        and beta.y1 + tol >= alpha.y1
        and beta.x2 - tol <= alpha.x2
        and beta.y2 - tol <= alpha.y2
    )


def is_contained(rectangle: Rectangle, rectangles: Iterable[Rectangle]):
    """Checks if the rectangle is contained within any of the other rectangles."""
    other_rectangles = filter(lambda r: r != rectangle, rectangles)
    return any(map(rpartial(contains, rectangle), other_rectangles))


def iou(alpha: Rectangle, beta: Rectangle):
    """Calculates the intersection area over the union area of two rectangles."""
    return intersection(alpha, beta) / union(alpha, beta)


def area(rectangle: Rectangle):
    """Calculates the area of a rectangle."""
    return abs((rectangle.x2 - rectangle.x1) * (rectangle.y2 - rectangle.y1))


def union(alpha: Rectangle, beta: Rectangle):
    """Calculates the union area of two rectangles."""
    return area(alpha) + area(beta) - intersection(alpha, beta)


@lru_cache(maxsize=1000)
def intersection(alpha, beta):
    """Calculates the intersection of two rectangles."""
    return intersection_along_x_axis(alpha, beta) * intersection_along_y_axis(alpha, beta)


def intersection_along_x_axis(alpha, beta):
    """Calculates the intersection along the x-axis."""
    return intersection_along_axis(alpha, beta, "x")


def intersection_along_y_axis(alpha, beta):
    """Calculates the intersection along the y-axis."""
    return intersection_along_axis(alpha, beta, "y")


def intersection_along_axis(alpha, beta, axis):
    """Calculates the intersection along the given axis.

    Cases:
           a        b
        [-----]  (---)     ==> [a1, b1, a2, b2] ==> max(0, (a2 - b1)) = 0
          b        a
        (---)  [-----]     ==> [b1, a1, b2, a2] ==> max(0, (b2 - a1)) = 0
            a      b
        [--(----]----)     ==> [a1, b1, a2, b2] ==> max(0, (a2 - b1)) = (a2 - b1)
           a     b
        (-[---]----)       ==> [b1, a1, a2, b2] ==> max(0, (a2 - a1)) = (a2 - a1)
           b     a
        [-(---)----]       ==> [a1, b1, b2, a2] ==> max(0, (b2 - b1)) = (b2 - b1)
            b       a
        (----[--)----]     ==> [b1, a1, b2, a2] ==> max(0, (b2 - a1)) = (b2 - a1)
    """
    assert axis in ["x", "y"]

    def get_component_accessor(component):
        """Returns a function that accesses the given component of a rectangle."""
        return attrgetter(f"{axis}{component}")

    def make_access_components_and_sort_fn(component):
        """Returns a function that accesses and sorts the given component of multiple rectangles."""
        assert component in [1, 2]
        return compose(sorted, lift(get_component_accessor(component)))

    sort_first_components, sort_second_components = map(make_access_components_and_sort_fn, [1, 2])

    min_c1, max_c1, min_c2, max_c2 = lflatten(juxt(sort_first_components, sort_second_components)((alpha, beta)))
    intersection = max(0, min_c2 - max_c1)
    return intersection


def related(alpha: Rectangle, beta: Rectangle):
    """Checks if two rectangles lie close by or overlap."""
    return close(alpha, beta) or overlap(alpha, beta)


def close(alpha: Rectangle, beta: Rectangle, max_gap=14):
    """Checks if two rectangles are close to each other."""
    # FIXME: Parameterize via factory
    return adjacent(alpha, beta, tolerance=max_gap, strict=True)


def overlap(alpha: Rectangle, beta: Rectangle):
    """Checks if two rectangles overlap."""
    return intersection(alpha, beta) > 0


def shift(rectangle: Rectangle, dx: int, dy: int):
    """Shifts a rectangle by the given amount."""
    return rectangle.x1 + dx, rectangle.y1 + dy, rectangle.x2 + dx, rectangle.y2 + dy
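Worked arithmetic for the area helpers above (rectangles made up): the overlap of [0, 10] x [0, 10] and [5, 15] x [5, 15] is 5 * 5 = 25, and the union is 100 + 100 - 25 = 175:

from cv_analysis.utils.rectangle import Rectangle

a, b = Rectangle(0, 0, 10, 10), Rectangle(5, 5, 15, 15)
print(intersection(a, b))   # -> 25
print(union(a, b))          # -> 175
print(round(iou(a, b), 3))  # -> 0.143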
cv_analysis/utils/utils.py (new file, 90 lines)
@@ -0,0 +1,90 @@
from __future__ import annotations

import itertools

import cv2
import numpy as np
from PIL import Image
from funcy import first, iterate, keep, lmap, repeatedly
from numpy import generic


def copy_and_normalize_channels(image):
    if isinstance(image, Image.Image):
        image = np.array(image)

    image = image.copy()
    try:
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    except cv2.error:
        pass

    return image


def npconvert(ob):
    if isinstance(ob, generic):
        return ob.item()
    raise TypeError


def lift(fn):
    def lifted(coll):
        yield from map(fn, coll)

    return lifted


def star(fn):
    def starred(args):
        return fn(*args)

    return starred


def lstarkeep(fn, coll):
    return list(starkeep(fn, coll))


def starkeep(fn, coll):
    yield from keep(star(fn), coll)


def until(cond, func, *args, **kwargs):
    return first(filter(cond, iterate(func, *args, **kwargs)))


def conj(x, xs):
    return [x, *xs]


def rconj(xs, x):
    return [*xs, x]


def make_merger_sentinel():
    def no_new_mergers(records):
        nonlocal number_of_records_so_far

        number_of_records_now = len(records)

        if number_of_records_now == number_of_records_so_far:
            return True

        else:
            number_of_records_so_far = number_of_records_now
            return False

    number_of_records_so_far = -1

    return no_new_mergers


def zipmap(fn, boxes, n=2):
    rets = lmap(list, zip(*map(fn, boxes)))
    yield from repeatedly(lambda: [], n) if len(rets) < n else rets


def every_nth(n, iterable):
    return itertools.islice(iterable, 0, None, n)
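A sketch of the fixpoint driver above, with a toy step function (made up) in place of merge_rectangles_once; iteration stops once the record count stops changing:

def drop_one(xs):  # toy step: shed one element per round, then stabilize
    return xs[:-1] if len(xs) > 3 else xs

print(until(make_merger_sentinel(), drop_one, [1, 2, 3, 4, 5, 6]))  # -> [1, 2, 3]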
@@ -1,11 +1,9 @@
import os

-from pyinfra.config.loader import load_settings  # type: ignore
-
from cv_analysis.config import get_config
from cv_analysis.utils.display import save_image

-settings = get_config()
+CV_CONFIG = get_config()


class VisualLogger:

@@ -41,4 +39,4 @@ class VisualLogger:
        return self.level == "ALL"


-vizlogger = VisualLogger(settings.logging.visual_logging_level, settings.logging.visual_logging_output_folder)
+vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)
(3 binary files changed, contents not shown; 1 file diff suppressed because it is too large)
@@ -1,30 +0,0 @@
#!/bin/bash
python_version=$1
gitlab_user=$2
gitlab_personal_access_token=$3

# cookiecutter https://gitlab.knecon.com/knecon/research/template-python-project.git --checkout master
# latest_dir=$(ls -td -- */ | head -n 1) # should be the dir cookiecutter just created

# cd $latest_dir

pyenv install $python_version
pyenv local $python_version
pyenv shell $python_version

pip install --upgrade pip
pip install poetry

poetry config installer.max-workers 10
# research package registry
poetry config repositories.gitlab-research https://gitlab.knecon.com/api/v4/groups/19/-/packages/pypi
poetry config http-basic.gitlab-research ${gitlab_user} ${gitlab_personal_access_token}
# redactmanager package registry
poetry config repositories.gitlab-red https://gitlab.knecon.com/api/v4/groups/12/-/packages/pypi
poetry config http-basic.gitlab-red ${gitlab_user} ${gitlab_personal_access_token}

poetry env use $(pyenv which python)
poetry install --with=dev
poetry update

source .venv/bin/activate
@@ -28,4 +28,4 @@ services:
    volumes:
      - /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
volumes:
-  mdata:
+  mdata:
docs/build/html/.buildinfo (vendored, 4 lines deleted)
@@ -1,4 +0,0 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 04e9c6c5d3e412413c2949e598da60dc
tags: 645f666f9bcd5a90fca523b33c5a78b7
docs/build/html/.doctrees/README.doctree (vendored binary, not shown)
docs/build/html/.doctrees/environment.pickle (vendored binary, not shown)
docs/build/html/.doctrees/index.doctree (vendored binary, not shown)
(further binary files not shown)
docs/build/html/.doctrees/modules/serve.doctree (vendored binary, not shown)
docs/build/html/README.html (vendored, 657 lines deleted)
@@ -1,657 +0,0 @@
[Generated Sphinx HTML removed: the built "cv-analysis - Visual (CV-Based) Document Parsing" page of the CV Analysis Service 2.5.2 documentation.]
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
<div class="sidebar-header-items__end">
|
||||
|
||||
<div class="navbar-item">
|
||||
|
||||
<script>
|
||||
document.write(`
|
||||
<button class="btn btn-sm navbar-btn theme-switch-button" title="light/dark" aria-label="light/dark" data-bs-placement="bottom" data-bs-toggle="tooltip">
|
||||
<span class="theme-switch nav-link" data-mode="light"><i class="fa-solid fa-sun fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="dark"><i class="fa-solid fa-moon fa-lg"></i></span>
|
||||
<span class="theme-switch nav-link" data-mode="auto"><i class="fa-solid fa-circle-half-stroke fa-lg"></i></span>
|
||||
</button>
|
||||
`);
|
||||
</script></div>
|
||||
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<div class="sidebar-primary-items__start sidebar-primary__section">
|
||||
<div class="sidebar-primary-item">
|
||||
<nav class="bd-docs-nav bd-links"
|
||||
aria-label="Section Navigation">
|
||||
<p class="bd-links__title" role="heading" aria-level="1">Section Navigation</p>
|
||||
<div class="bd-toc-item navbar-nav"></div>
|
||||
</nav></div>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="sidebar-primary-items__end sidebar-primary__section">
|
||||
</div>
|
||||
|
||||
<div id="rtd-footer-container"></div>
|
||||
|
||||
|
||||
</div>
|
||||
|
||||
<main id="main-content" class="bd-main">
|
||||
|
||||
|
||||
<div class="bd-content">
|
||||
<div class="bd-article-container">
|
||||
|
||||
<div class="bd-header-article">
|
||||
<div class="header-article-items header-article__inner">
|
||||
|
||||
<div class="header-article-items__start">
|
||||
|
||||
<div class="header-article-item">
|
||||
|
||||
|
||||
|
||||
<nav aria-label="Breadcrumb">
|
||||
<ul class="bd-breadcrumbs">
|
||||
|
||||
<li class="breadcrumb-item breadcrumb-home">
|
||||
<a href="index.html" class="nav-link" aria-label="Home">
|
||||
<i class="fa-solid fa-home"></i>
|
||||
</a>
|
||||
</li>
|
||||
<li class="breadcrumb-item active" aria-current="page">cv-analysis...</li>
|
||||
</ul>
|
||||
</nav>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
|
||||
|
||||
<div id="searchbox"></div>
|
||||
BIN docs/build/html/_images/figure_detection.png (vendored)
Binary file not shown. Before: 707 KiB
BIN docs/build/html/_images/layout_parsing.png (vendored)
Binary file not shown. Before: 568 KiB
BIN docs/build/html/_images/redaction_detection.png (vendored)
Binary file not shown. Before: 3.2 MiB
BIN docs/build/html/_images/table_parsing.png (vendored)
Binary file not shown. Before: 566 KiB
178 docs/build/html/_sources/README.md.txt (vendored)
@@ -1,178 +0,0 @@
# cv-analysis - Visual (CV-Based) Document Parsing

This repository implements computer-vision-based approaches for detecting and parsing visual features in documents, such as tables or previous redactions.

## API

Input message:

```json
{
  "targetFilePath": {
    "pdf": "absolute file path",
    "vlp_output": "absolute file path"
  },
  "responseFilePath": "absolute file path",
  "operation": "table_image_inference"
}
```
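
For illustration only, below is a minimal sketch of a client building and writing such a message. The concrete paths, the `request.json` file name, and the delivery mechanism are assumptions; the README does not specify how messages reach the service.

```python
import json

# Hypothetical absolute paths; the service expects paths it can read.
message = {
    "targetFilePath": {
        "pdf": "/data/input/test_pdf.pdf",
        "vlp_output": "/data/input/test_pdf_vlp.json",
    },
    "responseFilePath": "/data/output/response.json",
    "operation": "table_image_inference",
}

# Placeholder delivery: how the message actually reaches the service
# (file drop, message queue, HTTP) is not specified in this README.
with open("request.json", "w") as f:
    json.dump(message, f, indent=2)
```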

The response is uploaded to storage at the path specified in the `responseFilePath` field. The structure is as follows:

```json
{
  ...,
  "data": [
    {
      "pageNum": 0,
      "bbox": {
        "x1": 55.3407,
        "y1": 247.0246,
        "x2": 558.5602,
        "y2": 598.0585
      },
      "uuid": "2b10c1a2-393c-4fca-b9e3-0ad5b774ac84",
      "label": "table",
      "tableLines": [
        {
          "x1": 0,
          "y1": 16,
          "x2": 1399,
          "y2": 16
        },
        ...
      ],
      "imageInfo": {
        "height": 693,
        "width": 1414
      }
    },
    ...
  ]
}
```
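
A sketch of consuming this response follows. It assumes, since the README does not say so explicitly, that `bbox` is in page coordinates while `tableLines` are pixel coordinates within a rendered table image of the size given by `imageInfo`; under that assumption the lines can be projected back onto the page.

```python
import json

# Hypothetical path matching the "responseFilePath" used in the request.
with open("/data/output/response.json") as f:
    response = json.load(f)

for detection in response["data"]:
    bbox, info = detection["bbox"], detection["imageInfo"]
    # Scale factors from image pixels to the bbox coordinate system
    # (assumed interpretation, see above).
    sx = (bbox["x2"] - bbox["x1"]) / info["width"]
    sy = (bbox["y2"] - bbox["y1"]) / info["height"]
    for line in detection.get("tableLines", []):
        x1 = bbox["x1"] + line["x1"] * sx
        y1 = bbox["y1"] + line["y1"] * sy
        x2 = bbox["x1"] + line["x2"] * sx
        y2 = bbox["y1"] + line["y2"] * sy
        print(detection["label"], detection["pageNum"], (x1, y1, x2, y2))
```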

## Installation

```bash
git clone ssh://git@git.iqser.com:2222/rr/cv-analysis.git
cd cv-analysis

# Create and activate a virtual environment.
python -m venv env
source env/bin/activate

# Install the package in editable mode, then the remaining requirements.
pip install -e .
pip install -r requirements.txt

# Fetch DVC-tracked files (e.g. model weights and test data).
dvc pull
```
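
A quick way to verify the setup is to import the package. Whether `cv_analysis` exposes a `__version__` attribute is an assumption, so this snippet falls back gracefully:

```python
# Import check: fails loudly if the editable install did not work.
import cv_analysis

# __version__ may or may not be defined; fall back to a placeholder.
print(getattr(cv_analysis, "__version__", "installed (no __version__)"))
```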

## Usage

### As an API

The module provides functions for the individual tasks; each returns some collection of points whose exact form depends on the specific task.

#### Redaction Detection (API)

The snippet below shows how to find the outlines of previous redactions.

```python
from cv_analysis.redaction_detection import find_redactions
import pdf2image
import numpy as np

pdf_path = ...    # path of the PDF to analyse
page_index = ...  # page number (pdf2image's first_page/last_page are 1-based)

# Render only the requested page and convert it to a numpy array.
page = pdf2image.convert_from_path(pdf_path, first_page=page_index, last_page=page_index)[0]
page = np.array(page)

redaction_contours = find_redactions(page)
```
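
To inspect the result visually, one might draw the contours on the page image. This sketch assumes `find_redactions` returns OpenCV-style contours (a sequence of point arrays), which the README does not state:

```python
import cv2

# Assumes redaction_contours is a sequence of OpenCV-style point arrays;
# if it is a plain list of points, wrap it in numpy int32 arrays first.
annotated = page.copy()
cv2.drawContours(annotated, redaction_contours, -1, (0, 255, 0), 2)
cv2.imwrite("redactions_preview.png", annotated)
```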

### As a CLI Tool

Core API functionalities can also be used through a CLI.

#### Table Parsing

The table parsing utility detects tables and segments them into individual cells.

```bash
# Arguments: PDF path and page number (here: page 7).
python scripts/annotate.py data/test_pdf.pdf 7 --type table
```

The image below shows a parsed table, where each table cell has been detected individually.

![Table Parsing Demonstration](imgs/table_parsing.png)

#### Redaction Detection (CLI)

The redaction detection utility detects previously applied redactions (filled black rectangles) in PDFs.

```bash
python scripts/annotate.py data/test_pdf.pdf 2 --type redaction
```

The image below shows the detected redactions with green outlines.

![Redaction Detection Demonstration](imgs/redaction_detection.png)

#### Layout Parsing

The layout parsing utility detects elements such as paragraphs, tables, and figures.

```bash
python scripts/annotate.py data/test_pdf.pdf 7 --type layout
```

The image below shows the detected layout elements on a page.

![Layout Parsing Demonstration](imgs/layout_parsing.png)

#### Figure Detection

The figure detection utility detects figures specifically, since these can be missed by the generic layout parsing utility.

```bash
python scripts/annotate.py data/test_pdf.pdf 3 --type figure
```

The image below shows the detected figure on a page.

![Figure Detection Demonstration](imgs/figure_detection.png)

## Running as a service

### Building

Build the base image:

```bash
bash setup/docker.sh
```

Build the head image:

```bash
docker build -f Dockerfile -t cv-analysis . --build-arg BASE_ROOT=""
```

### Usage (service)

In shell 1, start the service:

```bash
docker run --rm --net=host cv-analysis
```

In shell 2, send a mock request:

```bash
python scripts/client_mock.py --pdf_path /path/to/a/pdf
```
37 docs/build/html/_sources/index.rst.txt (vendored)
@@ -1,37 +0,0 @@
.. Keyword Extraction Service documentation master file, created by
   sphinx-quickstart on Mon Sep 12 12:04:24 2022.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

=============================================
Welcome to CV Analysis Service documentation!
=============================================

.. note::

   If you'd like to change the looks of things 👉 https://www.sphinx-doc.org/en/master/usage/restructuredtext/basics.html

Table of Contents
-----------------

.. toctree::
   :maxdepth: 3
   :caption: README

   README.md

.. toctree::
   :maxdepth: 3
   :caption: Modules

   modules/cv_analysis
   modules/serve


Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
@@ -1,7 +0,0 @@
cv\_analysis.config module
==========================

.. automodule:: cv_analysis.config
   :members:
   :undoc-members:
   :show-inheritance:
Some files were not shown because too many files have changed in this diff.