Pull request #20: New pyinfra
Merge in RR/cv-analysis from new_pyinfra to master
Squashed commit of the following:
commit f7a01a90aad1c402ac537de5bdf15df628ad54df
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 27 10:40:59 2022 +0200
fix typo
commit ff4d549fac5b612c2d391ae85823c5eca1e91916
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 27 10:34:04 2022 +0200
adjust build scripts for new pyinfra
commit ecd70f60d46406d8b6cc7f36a1533d706c917ca8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jul 27 09:42:55 2022 +0200
simplify logging by using default configurations
commit 20193c14c940eed2b0a7a72058167e26064119d0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Tue Jul 26 17:16:57 2022 +0200
tidy-up, refactor config logic to not depend on external files
commit d8069cd4d404a570bb04a04278161669d1c83332
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date: Tue Jul 26 15:14:59 2022 +0200
update pyinfra
commit c3bc11037cca9baf016043ab997c566f5b4a2586
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date: Tue Jul 26 15:09:14 2022 +0200
repair tests
commit 6f4e4f2863ee16ae056c1d432f663858c5f10221
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date: Tue Jul 26 14:52:38 2022 +0200
updated server logic to work with new pyinfra; update scripts for pyinfra as submodule
commit 2a18dba81de5ee84d0bdf0e77f478693e8d8aef4
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date: Tue Jul 26 14:10:41 2022 +0200
formatting
commit d87ce9328de9aa2341228af9b24473d5e583504e
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date: Tue Jul 26 14:10:11 2022 +0200
make server logic compatible with new pyinfra
This commit is contained in:
parent
1618909d8e
commit
9d98945ff9
29
Dockerfile
29
Dockerfile
@ -1,20 +1,25 @@
|
|||||||
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
|
FROM python:3.10
|
||||||
ARG VERSION_TAG=latest
|
|
||||||
|
|
||||||
FROM ${BASE_ROOT}cv-analysis-base:${VERSION_TAG}
|
RUN python -m venv /app/venv
|
||||||
|
ENV PATH="/app/venv/bin:$PATH"
|
||||||
|
|
||||||
|
RUN python -m pip install --upgrade pip
|
||||||
|
|
||||||
WORKDIR /app/service
|
WORKDIR /app/service
|
||||||
|
|
||||||
|
COPY ./requirements.txt ./requirements.txt
|
||||||
|
RUN python3 -m pip install -r requirements.txt
|
||||||
|
|
||||||
|
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
|
||||||
|
RUN python -m pip install -r incl/pyinfra/requirements.txt
|
||||||
|
|
||||||
|
COPY ./incl ./incl
|
||||||
|
RUN python3 -m pip install -e incl/pyinfra
|
||||||
|
|
||||||
COPY ./src ./src
|
COPY ./src ./src
|
||||||
COPY cv_analysis ./cv_analysis
|
COPY ./cv_analysis ./cv_analysis
|
||||||
COPY config.yaml ./config.yaml
|
COPY ./setup.py ./setup.py
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip
|
|
||||||
RUN python3 -m pip install -e .
|
RUN python3 -m pip install -e .
|
||||||
|
|
||||||
WORKDIR /app/service
|
CMD ["python3", "-u", "src/serve.py"]
|
||||||
|
|
||||||
EXPOSE 5000
|
|
||||||
EXPOSE 8080
|
|
||||||
|
|
||||||
CMD ["python3", "src/serve.py"]
|
|
||||||
@ -1,31 +0,0 @@
|
|||||||
FROM python:3.10 as builder1
|
|
||||||
|
|
||||||
# Use a virtual environment.
|
|
||||||
RUN python -m venv /app/venv
|
|
||||||
ENV PATH="/app/venv/bin:$PATH"
|
|
||||||
|
|
||||||
# Upgrade pip.
|
|
||||||
RUN python -m pip install --upgrade pip
|
|
||||||
|
|
||||||
# Make a directory for the service files and copy the service repo into the container.
|
|
||||||
WORKDIR /app/service
|
|
||||||
COPY . ./
|
|
||||||
|
|
||||||
# Install dependencies.
|
|
||||||
RUN python3 -m pip install -r requirements.txt
|
|
||||||
RUN python3 -m pip install -r incl/pyinfra/requirements.txt
|
|
||||||
RUN python3 -m pip install -e incl/pyinfra
|
|
||||||
|
|
||||||
# Make a new container and copy all relevant files over to filter out temporary files
|
|
||||||
# produced during setup to reduce the final container's size.
|
|
||||||
FROM python:3.10
|
|
||||||
|
|
||||||
WORKDIR /app/
|
|
||||||
COPY --from=builder1 /app .
|
|
||||||
ENV PATH="/app/venv/bin:$PATH"
|
|
||||||
|
|
||||||
WORKDIR /app/service
|
|
||||||
|
|
||||||
RUN apt update
|
|
||||||
#RUN apt install python3-opencv-headless
|
|
||||||
RUN apt install poppler-utils --yes
|
|
||||||
@ -34,7 +34,6 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
|
|||||||
public class PlanSpec {
|
public class PlanSpec {
|
||||||
|
|
||||||
private static final String SERVICE_NAME = "cv-analysis";
|
private static final String SERVICE_NAME = "cv-analysis";
|
||||||
private static final String SERVICE_NAME_BASE = "cv-analysis-base";
|
|
||||||
|
|
||||||
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
|
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
|
||||||
|
|
||||||
@ -72,7 +71,7 @@ public class PlanSpec {
|
|||||||
return new Plan(
|
return new Plan(
|
||||||
project(),
|
project(),
|
||||||
SERVICE_NAME, new BambooKey(SERVICE_KEY))
|
SERVICE_NAME, new BambooKey(SERVICE_KEY))
|
||||||
.description("Docker build for cv-analysis.")
|
// .description("Docker build for cv-analysis.")
|
||||||
// .variables()
|
// .variables()
|
||||||
.stages(new Stage("Build Stage")
|
.stages(new Stage("Build Stage")
|
||||||
.jobs(
|
.jobs(
|
||||||
@ -84,9 +83,6 @@ public class PlanSpec {
|
|||||||
new VcsCheckoutTask()
|
new VcsCheckoutTask()
|
||||||
.description("Checkout default repository.")
|
.description("Checkout default repository.")
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout pyinfra research repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
|
|
||||||
new ScriptTask()
|
new ScriptTask()
|
||||||
.description("Set config and keys.")
|
.description("Set config and keys.")
|
||||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
.inlineBody("mkdir -p ~/.ssh\n" +
|
||||||
@ -98,10 +94,10 @@ public class PlanSpec {
|
|||||||
.description("Build Docker container.")
|
.description("Build Docker container.")
|
||||||
.location(Location.FILE)
|
.location(Location.FILE)
|
||||||
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
|
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
|
||||||
.argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
|
.argument(SERVICE_NAME))
|
||||||
.dockerConfiguration(
|
.dockerConfiguration(
|
||||||
new DockerConfiguration()
|
new DockerConfiguration()
|
||||||
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
|
.image("nexus.iqser.com:5001/infra/release_build:4.5.0")
|
||||||
.volume("/var/run/docker.sock", "/var/run/docker.sock")),
|
.volume("/var/run/docker.sock", "/var/run/docker.sock")),
|
||||||
new Job("Sonar Job", new BambooKey("SONAR"))
|
new Job("Sonar Job", new BambooKey("SONAR"))
|
||||||
.tasks(
|
.tasks(
|
||||||
@ -111,9 +107,6 @@ public class PlanSpec {
|
|||||||
new VcsCheckoutTask()
|
new VcsCheckoutTask()
|
||||||
.description("Checkout default repository.")
|
.description("Checkout default repository.")
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||||
new VcsCheckoutTask()
|
|
||||||
.description("Checkout pyinfra research repository.")
|
|
||||||
.checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
|
|
||||||
new ScriptTask()
|
new ScriptTask()
|
||||||
.description("Set config and keys.")
|
.description("Set config and keys.")
|
||||||
.inlineBody("mkdir -p ~/.ssh\n" +
|
.inlineBody("mkdir -p ~/.ssh\n" +
|
||||||
@ -134,6 +127,9 @@ public class PlanSpec {
|
|||||||
.jobs(
|
.jobs(
|
||||||
new Job("Git Tag Job", new BambooKey("GITTAG"))
|
new Job("Git Tag Job", new BambooKey("GITTAG"))
|
||||||
.tasks(
|
.tasks(
|
||||||
|
new CleanWorkingDirectoryTask()
|
||||||
|
.description("Clean working directory.")
|
||||||
|
.enabled(true),
|
||||||
new VcsCheckoutTask()
|
new VcsCheckoutTask()
|
||||||
.description("Checkout default repository.")
|
.description("Checkout default repository.")
|
||||||
.checkoutItems(new CheckoutItem().defaultRepository()),
|
.checkoutItems(new CheckoutItem().defaultRepository()),
|
||||||
@ -152,7 +148,7 @@ public class PlanSpec {
|
|||||||
.defaultRepository())
|
.defaultRepository())
|
||||||
.dockerConfiguration(
|
.dockerConfiguration(
|
||||||
new DockerConfiguration()
|
new DockerConfiguration()
|
||||||
.image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
|
.image("nexus.iqser.com:5001/infra/release_build:4.5.0")),
|
||||||
new Job("Licence Job", new BambooKey("LICENCE"))
|
new Job("Licence Job", new BambooKey("LICENCE"))
|
||||||
.enabled(false)
|
.enabled(false)
|
||||||
.tasks(
|
.tasks(
|
||||||
@ -169,7 +165,6 @@ public class PlanSpec {
|
|||||||
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
|
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
|
||||||
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
|
||||||
.linkedRepositories("RR / " + SERVICE_NAME)
|
.linkedRepositories("RR / " + SERVICE_NAME)
|
||||||
.linkedRepositories("RR / pyinfra")
|
|
||||||
.triggers(new BitbucketServerTrigger())
|
.triggers(new BitbucketServerTrigger())
|
||||||
.planBranchManagement(new PlanBranchManagement()
|
.planBranchManagement(new PlanBranchManagement()
|
||||||
.createForVcsBranch()
|
.createForVcsBranch()
|
||||||
|
|||||||
@ -2,8 +2,7 @@
|
|||||||
set -e
|
set -e
|
||||||
|
|
||||||
SERVICE_NAME=$1
|
SERVICE_NAME=$1
|
||||||
SERVICE_NAME_BASE=$2
|
|
||||||
# TODO version tag on master push
|
|
||||||
python3 -m venv build_venv
|
python3 -m venv build_venv
|
||||||
source build_venv/bin/activate
|
source build_venv/bin/activate
|
||||||
python3 -m pip install --upgrade pip
|
python3 -m pip install --upgrade pip
|
||||||
@ -13,7 +12,6 @@ pip install 'dvc[ssh]'
|
|||||||
dvc pull
|
dvc pull
|
||||||
|
|
||||||
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
|
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
|
||||||
docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
|
|
||||||
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
|
|
||||||
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
|
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
|
||||||
|
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
|
||||||
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
|
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}
|
||||||
|
|||||||
@ -7,17 +7,20 @@ python3 -m venv build_venv
|
|||||||
source build_venv/bin/activate
|
source build_venv/bin/activate
|
||||||
python3 -m pip install --upgrade pip
|
python3 -m pip install --upgrade pip
|
||||||
|
|
||||||
pip install -e .
|
echo "dev setup for unit test and coverage"
|
||||||
pip install -e incl/pyinfra
|
|
||||||
|
|
||||||
|
pip install -e incl/pyinfra
|
||||||
pip install -r incl/pyinfra/requirements.txt
|
pip install -r incl/pyinfra/requirements.txt
|
||||||
|
|
||||||
|
pip install -e .
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
|
|
||||||
|
|
||||||
echo "DVC pull step"
|
echo "DVC pull step"
|
||||||
dvc pull
|
dvc pull
|
||||||
|
|
||||||
echo "coverage calculation"
|
echo "coverage calculation"
|
||||||
coverage run -m pytest test
|
coverage run -m pytest
|
||||||
echo "coverage report generation"
|
echo "coverage report generation"
|
||||||
coverage report -m
|
coverage report -m
|
||||||
coverage xml
|
coverage xml
|
||||||
@ -28,7 +31,7 @@ echo "dependency-check:aggregate"
|
|||||||
mkdir -p reports
|
mkdir -p reports
|
||||||
dependency-check --enableExperimental -f JSON -f HTML -f XML \
|
dependency-check --enableExperimental -f JSON -f HTML -f XML \
|
||||||
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
|
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
|
||||||
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
|
--exclude "build_venv/**" --exclude "**/__pycache__/**"
|
||||||
|
|
||||||
if [[ -z "${bamboo_repository_pr_key}" ]]
|
if [[ -z "${bamboo_repository_pr_key}" ]]
|
||||||
then
|
then
|
||||||
|
|||||||
12
config.yaml
12
config.yaml
@ -1,12 +0,0 @@
|
|||||||
service:
|
|
||||||
logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for log file messages
|
|
||||||
monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not
|
|
||||||
logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log)
|
|
||||||
|
|
||||||
webserver:
|
|
||||||
host: $SERVER_HOST|"127.0.0.1" # webserver address
|
|
||||||
port: $SERVER_PORT|5000 # webserver port
|
|
||||||
|
|
||||||
visual_logging:
|
|
||||||
level: DISABLED # NOTHING > INFO > DEBUG > ALL
|
|
||||||
output_folder: /tmp/debug/
|
|
||||||
@ -1,39 +1,30 @@
|
|||||||
"""Implements a config object with dot-indexing syntax."""
|
import os
|
||||||
|
|
||||||
|
|
||||||
from envyaml import EnvYAML
|
def get_config():
|
||||||
from cv_analysis.locations import CONFIG_FILE
|
return Config()
|
||||||
|
|
||||||
|
|
||||||
def _get_item_and_maybe_make_dotindexable(container, item):
|
|
||||||
ret = container[item]
|
|
||||||
return DotIndexable(ret) if isinstance(ret, dict) else ret
|
|
||||||
|
|
||||||
|
|
||||||
class DotIndexable:
|
|
||||||
def __init__(self, x):
|
|
||||||
self.x = x
|
|
||||||
|
|
||||||
def __getattr__(self, item):
|
|
||||||
return _get_item_and_maybe_make_dotindexable(self.x, item)
|
|
||||||
|
|
||||||
def __setitem__(self, key, value):
|
|
||||||
self.x[key] = value
|
|
||||||
|
|
||||||
def __repr__(self):
|
|
||||||
return self.x.__repr__()
|
|
||||||
|
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
def __init__(self, config_path):
|
def __init__(self):
|
||||||
self.__config = EnvYAML(config_path)
|
self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
|
||||||
|
|
||||||
def __getattr__(self, item):
|
# visual_logging_level: NOTHING > INFO > DEBUG > ALL
|
||||||
if item in self.__config:
|
self.visual_logging_level = "DISABLED"
|
||||||
return _get_item_and_maybe_make_dotindexable(self.__config, item)
|
self.visual_logging_output_folder = "/tmp/debug"
|
||||||
|
|
||||||
def __getitem__(self, item):
|
# locations
|
||||||
return self.__getattr__(item)
|
# FIXME: is everything here necessary?
|
||||||
|
root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||||
|
self.dvc_data_dir = os.path.join(root, "data")
|
||||||
|
self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
|
||||||
|
self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
|
||||||
|
self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
|
||||||
|
self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
|
||||||
|
self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
|
||||||
|
self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
|
||||||
|
self.test_dir = os.path.join(root, "test")
|
||||||
|
self.test_data_dir = os.path.join(self.test_dir, "test_data")
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
CONFIG = Config(CONFIG_FILE)
|
return self.__getattribute__(key)
|
||||||
|
|||||||
@ -1,22 +0,0 @@
|
|||||||
"""Defines constant paths relative to the module root path."""
|
|
||||||
|
|
||||||
|
|
||||||
from os import path
|
|
||||||
|
|
||||||
MODULE_DIR = path.dirname(path.abspath(__file__))
|
|
||||||
PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
|
|
||||||
|
|
||||||
CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
|
|
||||||
LOG_FILE = "/tmp/log.log"
|
|
||||||
|
|
||||||
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
|
|
||||||
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
|
|
||||||
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
|
|
||||||
PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected")
|
|
||||||
PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp")
|
|
||||||
HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed")
|
|
||||||
METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv")
|
|
||||||
|
|
||||||
|
|
||||||
TEST_DIR = path.join(PACKAGE_ROOT_DIR, "test")
|
|
||||||
TEST_DATA_DIR = path.join(TEST_DIR, "test_data")
|
|
||||||
@ -2,7 +2,6 @@ from functools import partial
|
|||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pdf2image
|
|
||||||
from iteration_utilities import starfilter, first
|
from iteration_utilities import starfilter, first
|
||||||
|
|
||||||
from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
|
from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy
|
||||||
|
|||||||
@ -7,23 +7,20 @@ from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_d
|
|||||||
from cv_analysis.layout_parsing import parse_layout
|
from cv_analysis.layout_parsing import parse_layout
|
||||||
from cv_analysis.server.rotate import rotate_rectangle
|
from cv_analysis.server.rotate import rotate_rectangle
|
||||||
from cv_analysis.table_parsing import parse_tables
|
from cv_analysis.table_parsing import parse_tables
|
||||||
from cv_analysis.utils.logging import get_logger
|
|
||||||
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
|
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
|
||||||
from cv_analysis.utils.structures import Rectangle
|
from cv_analysis.utils.structures import Rectangle
|
||||||
|
|
||||||
logger = get_logger()
|
|
||||||
|
|
||||||
|
|
||||||
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
|
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
|
||||||
"""Make end-to-end pipeline to analyse a PDF with given analysis function.
|
"""Make end-to-end pipeline to analyse a PDF with given analysis function.
|
||||||
The pipeline returns a Generator of dicts containing page information and the analysis results.
|
The pipeline returns a Generator of dicts containing page information and the analysis results.
|
||||||
|
|
||||||
Steps:
|
Steps:
|
||||||
Convert PDF to Arrays and page information
|
Convert PDF to pairs of image and page information
|
||||||
Analise pages, get list of bboxes per page (e.g. table cells)
|
Analyse pages, get list of bounding boxes per page (e.g. table cells)
|
||||||
Convert pixel values to inches
|
Convert pixel values to inches
|
||||||
Rotate results if page is rotated
|
Rotate results if page is rotated
|
||||||
Format results to stream of dictionaries
|
Format results to stream of dictionaries with page information and analysis results
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def pipeline(pdf: bytes, index=None):
|
def pipeline(pdf: bytes, index=None):
|
||||||
|
|||||||
@ -1,16 +1,15 @@
|
|||||||
from functools import partial
|
from functools import partial
|
||||||
from itertools import chain, starmap
|
from itertools import chain, starmap
|
||||||
from operator import attrgetter
|
from operator import attrgetter
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
from funcy import lmap
|
from funcy import lmap
|
||||||
|
|
||||||
|
from cv_analysis.layout_parsing import parse_layout
|
||||||
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
|
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
|
||||||
|
|
||||||
from cv_analysis.utils.structures import Rectangle
|
from cv_analysis.utils.structures import Rectangle
|
||||||
from cv_analysis.utils.visual_logging import vizlogger
|
from cv_analysis.utils.visual_logging import vizlogger
|
||||||
from cv_analysis.layout_parsing import parse_layout
|
|
||||||
|
|
||||||
|
|
||||||
def add_external_contours(image, image_h_w_lines_only):
|
def add_external_contours(image, image_h_w_lines_only):
|
||||||
|
|||||||
@ -1,4 +1,3 @@
|
|||||||
from numpy import resize
|
|
||||||
import cv2
|
import cv2
|
||||||
from matplotlib import pyplot as plt
|
from matplotlib import pyplot as plt
|
||||||
|
|
||||||
|
|||||||
@ -1,26 +0,0 @@
|
|||||||
"""Defines the default logger for the service."""
|
|
||||||
import sys
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from cv_analysis.config import CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
def make_logger_getter():
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
logger.setLevel(logging.getLevelName(CONFIG.service.logging_level))
|
|
||||||
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S")
|
|
||||||
|
|
||||||
ch = logging.StreamHandler(sys.stdout)
|
|
||||||
ch.setLevel(logging.getLevelName(CONFIG.service.logging_level))
|
|
||||||
ch.setFormatter(formatter)
|
|
||||||
|
|
||||||
logger.addHandler(ch)
|
|
||||||
logger.propagate = False
|
|
||||||
|
|
||||||
def get_logger():
|
|
||||||
return logger
|
|
||||||
|
|
||||||
return get_logger
|
|
||||||
|
|
||||||
|
|
||||||
get_logger = make_logger_getter()
|
|
||||||
@ -1,6 +1,6 @@
|
|||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Iterator, Tuple
|
from typing import Iterator
|
||||||
|
|
||||||
import fitz
|
import fitz
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@ -14,7 +14,8 @@ class ImageMetadataPair:
|
|||||||
|
|
||||||
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
|
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
|
||||||
"""Streams PDF as pairs of image (matrix) and metadata.
|
"""Streams PDF as pairs of image (matrix) and metadata.
|
||||||
Note: If Index is not given or evaluates to None, the whole PDF will be processed."""
|
Note: If Index is not given or evaluates to None, the whole PDF will be processed.
|
||||||
|
"""
|
||||||
convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
|
convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
|
||||||
yield from map(convert_fn, stream_pages(pdf, index))
|
yield from map(convert_fn, stream_pages(pdf, index))
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,10 @@
|
|||||||
import os
|
import os
|
||||||
from cv_analysis.config import CONFIG
|
|
||||||
|
from cv_analysis.config import get_config
|
||||||
from cv_analysis.utils.display import save_image
|
from cv_analysis.utils.display import save_image
|
||||||
|
|
||||||
|
CV_CONFIG = get_config()
|
||||||
|
|
||||||
|
|
||||||
class VisualLogger:
|
class VisualLogger:
|
||||||
def __init__(self, level, output_folder):
|
def __init__(self, level, output_folder):
|
||||||
@ -36,4 +39,4 @@ class VisualLogger:
|
|||||||
return self.level == "ALL"
|
return self.level == "ALL"
|
||||||
|
|
||||||
|
|
||||||
vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder)
|
vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)
|
||||||
|
|||||||
31
docker-compose.yaml
Normal file
31
docker-compose.yaml
Normal file
@ -0,0 +1,31 @@
|
|||||||
|
version: '2'
|
||||||
|
services:
|
||||||
|
minio:
|
||||||
|
image: minio/minio
|
||||||
|
ports:
|
||||||
|
- "9000:9000"
|
||||||
|
environment:
|
||||||
|
- MINIO_ROOT_PASSWORD=password
|
||||||
|
- MINIO_ROOT_USER=root
|
||||||
|
volumes:
|
||||||
|
- ./data/minio_store:/data
|
||||||
|
command: server /data
|
||||||
|
network_mode: "bridge"
|
||||||
|
rabbitmq:
|
||||||
|
image: docker.io/bitnami/rabbitmq:3.9
|
||||||
|
ports:
|
||||||
|
- '4369:4369'
|
||||||
|
- '5551:5551'
|
||||||
|
- '5552:5552'
|
||||||
|
- '5672:5672'
|
||||||
|
- '25672:25672'
|
||||||
|
- '15672:15672'
|
||||||
|
environment:
|
||||||
|
- RABBITMQ_SECURE_PASSWORD=yes
|
||||||
|
- RABBITMQ_VM_MEMORY_HIGH_WATERMARK=100%
|
||||||
|
- RABBITMQ_DISK_FREE_ABSOLUTE_LIMIT=20Gi
|
||||||
|
network_mode: "bridge"
|
||||||
|
volumes:
|
||||||
|
- /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
|
||||||
|
volumes:
|
||||||
|
mdata:
|
||||||
@ -1 +1 @@
|
|||||||
Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864
|
Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9
|
||||||
@ -1,2 +1,4 @@
|
|||||||
[pytest]
|
[pytest]
|
||||||
norecursedirs = incl
|
norecursedirs = incl
|
||||||
|
testpaths = test
|
||||||
|
addopts = --ignore=data
|
||||||
|
|||||||
@ -1,11 +1,15 @@
|
|||||||
import hashlib
|
import hashlib
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
from itertools import chain
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from pdf2image import convert_from_path
|
from pdf2image import convert_from_path
|
||||||
from itertools import chain
|
|
||||||
import json
|
from cv_analysis.config import get_config
|
||||||
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING
|
|
||||||
|
CV_CONFIG = get_config()
|
||||||
|
|
||||||
|
|
||||||
def read_json(path):
|
def read_json(path):
|
||||||
@ -22,7 +26,7 @@ def collect_metadata(example_pages, save=False):
|
|||||||
metadata = list(chain.from_iterable(metadata))
|
metadata = list(chain.from_iterable(metadata))
|
||||||
if save:
|
if save:
|
||||||
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||||
df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
|
df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv"))
|
||||||
else:
|
else:
|
||||||
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||||
|
|
||||||
@ -46,7 +50,7 @@ def make_metadata_entry_maker():
|
|||||||
|
|
||||||
|
|
||||||
def split_pdf(example_pages):
|
def split_pdf(example_pages):
|
||||||
dir_path = PDF_FOR_TESTING
|
dir_path = CV_CONFIG.pdf_for_testing
|
||||||
i = 0
|
i = 0
|
||||||
for name, document_sections in example_pages.items():
|
for name, document_sections in example_pages.items():
|
||||||
for pages in document_sections:
|
for pages in document_sections:
|
||||||
@ -54,7 +58,7 @@ def split_pdf(example_pages):
|
|||||||
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
|
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
|
||||||
)
|
)
|
||||||
for image in images:
|
for image in images:
|
||||||
fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png")
|
fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png")
|
||||||
image.save(fp=fp, dpi=(300, 300))
|
image.save(fp=fp, dpi=(300, 300))
|
||||||
i += 1
|
i += 1
|
||||||
|
|
||||||
@ -74,7 +78,7 @@ def find_hash(file_path):
|
|||||||
|
|
||||||
def rename_files_with_hash(example_pages):
|
def rename_files_with_hash(example_pages):
|
||||||
files_to_rename = list(example_pages.keys())
|
files_to_rename = list(example_pages.keys())
|
||||||
folder = HASHED_PDFS_FOR_TESTING
|
folder = CV_CONFIG.hashed_pdfs_for_testing
|
||||||
|
|
||||||
# Iterate through the folder
|
# Iterate through the folder
|
||||||
for file in os.listdir(folder):
|
for file in os.listdir(folder):
|
||||||
@ -99,7 +103,7 @@ def rename_files_with_hash(example_pages):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
|
examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json"))
|
||||||
rename_files_with_hash(examples_pages)
|
rename_files_with_hash(examples_pages)
|
||||||
# collect_metadata(examples_pages, save=True)
|
# collect_metadata(examples_pages, save=True)
|
||||||
# split_pdf(examples_pages)
|
# split_pdf(examples_pages)
|
||||||
|
|||||||
68
scripts/manage_minio.py
Normal file
68
scripts/manage_minio.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
from pyinfra.config import get_config
|
||||||
|
from pyinfra.storage.storage import get_s3_storage
|
||||||
|
|
||||||
|
CONFIG = get_config()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line of the MinIO management script.

    Two sub-commands are defined; the chosen one is stored under ``command``:

    * ``add <dossier_id> (--file/-f PATH | --directory/-d PATH)``
    * ``purge``

    Returns:
        The ``argparse.Namespace`` obtained from parsing ``sys.argv``.
    """
    cli = argparse.ArgumentParser()
    commands = cli.add_subparsers(help="sub-command help", dest="command")

    add_cmd = commands.add_parser("add", help="Add file(s) to the MinIO store")
    add_cmd.add_argument("dossier_id")
    # Exactly one of --file / --directory has to be supplied.
    source = add_cmd.add_mutually_exclusive_group(required=True)
    source.add_argument("--file", "-f")
    source.add_argument("--directory", "-d")

    commands.add_parser("purge", help="Delete all files and buckets in the MinIO store")

    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def combine_dossier_id_and_file_id_and_extension(dossier_id, file_id, extension):
    """Build the object name ``<dossier_id>/<file_id><extension>``.

    The extension is appended verbatim, so it must already carry its leading dot.
    """
    return "{}/{}{}".format(dossier_id, file_id, extension)
|
||||||
|
|
||||||
|
|
||||||
|
def add_file_compressed(storage, bucket_name, dossier_id, path) -> None:
    """Gzip a local file and upload it to the object store.

    The object name is ``<dossier_id>/<file stem><suffix>``, where the suffix is
    picked from the file extension: ``.pdf`` -> ``.ORIGIN.pdf.gz`` and
    ``.json`` -> ``.TEXT.json.gz``.

    Args:
        storage: Storage client; only ``put_object(bucket, name, data)`` is called on it.
        bucket_name: Name of the target bucket.
        dossier_id: Dossier identifier used as prefix of the object name.
        path: Location of the local file (``str`` or ``pathlib.Path``).

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.json``.
    """
    suffix_by_extension = {".pdf": ".ORIGIN.pdf.gz", ".json": ".TEXT.json.gz"}

    extension = Path(path).suffix
    if extension not in suffix_by_extension:
        # Previously this fell through with ``suffix_gz`` unassigned and failed
        # with an opaque UnboundLocalError; fail early with a clear message instead.
        raise ValueError(
            f"Unsupported file type {extension!r} for {str(path)!r}; "
            f"expected one of {sorted(suffix_by_extension)}"
        )

    path_gz = combine_dossier_id_and_file_id_and_extension(
        dossier_id, Path(path).stem, suffix_by_extension[extension]
    )

    with open(path, "rb") as f:
        data = gzip.compress(f.read())
    storage.put_object(bucket_name, path_gz, data)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Parse the command line first so that ``--help`` and usage errors exit
    # before any connection to the object store is attempted.
    args = parse_args()

    storage = get_s3_storage(CONFIG)
    bucket_name = CONFIG.storage_bucket

    # Ensure the configured bucket exists before adding to or purging it.
    if not storage.has_bucket(bucket_name):
        storage.make_bucket(bucket_name)

    if args.command == "add":

        if args.file:
            add_file_compressed(storage, bucket_name, args.dossier_id, args.file)

        elif args.directory:
            # Upload every entry of the directory under the same dossier id.
            for fname in tqdm(os.listdir(args.directory), desc="Adding files"):
                path = Path(args.directory) / fname
                add_file_compressed(storage, bucket_name, args.dossier_id, path)

    elif args.command == "purge":
        storage.clear_bucket(bucket_name)
|
||||||
84
scripts/publish_requests.py
Normal file
84
scripts/publish_requests.py
Normal file
@ -0,0 +1,84 @@
|
|||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
|
||||||
|
import pika
|
||||||
|
|
||||||
|
from pyinfra.config import get_config
|
||||||
|
from pyinfra.storage.storage import get_s3_storage
|
||||||
|
|
||||||
|
CONFIG = get_config()
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line.

    ``--operation`` / ``-o`` is mandatory and must be one of ``table``, ``layout`` or ``figure``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--operation", "-o", choices=["table", "layout", "figure"], required=True)
    return cli.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def read_connection_params():
    """Assemble the pika connection parameters from the RabbitMQ settings in ``CONFIG``."""
    login = pika.PlainCredentials(CONFIG.rabbitmq_username, CONFIG.rabbitmq_password)
    settings = {
        "host": CONFIG.rabbitmq_host,
        "port": CONFIG.rabbitmq_port,
        # the configured heartbeat is cast to int before it is handed to pika
        "heartbeat": int(CONFIG.rabbitmq_heartbeat),
        "credentials": login,
    }
    return pika.ConnectionParameters(**settings)
|
||||||
|
|
||||||
|
|
||||||
|
def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel:
    """Open a channel on ``connection`` and set ``prefetch_count=1`` on it via ``basic_qos``."""
    opened = connection.channel()
    opened.basic_qos(prefetch_count=1)
    return opened
|
||||||
|
|
||||||
|
|
||||||
|
def declare_queue(channel, queue: str):
    """Declare ``queue`` as durable, non-auto-deleting, dead-lettering to ``CONFIG.dead_letter_queue``.

    Returns whatever ``channel.queue_declare`` returns.
    """
    dead_lettering = {
        "x-dead-letter-exchange": "",
        "x-dead-letter-routing-key": CONFIG.dead_letter_queue,
    }
    return channel.queue_declare(
        queue=queue,
        durable=True,
        auto_delete=False,
        arguments=dead_lettering,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def make_connection() -> pika.BlockingConnection:
    """Open a blocking pika connection using the parameters from ``read_connection_params``."""
    return pika.BlockingConnection(read_connection_params())
|
||||||
|
|
||||||
|
|
||||||
|
def build_message_bodies(operation, bucket_name):
    """Yield one JSON-encoded request body per uploaded PDF found in ``bucket_name``.

    Only objects named ``<dossier_id>/<file_id>.ORIGIN.pdf.gz`` are considered, because every
    message announces ``ORIGIN.pdf.gz`` as its target file extension. Other objects are skipped.

    Args:
        operation: operation name put into the message; its upper-cased form is also used for
            the response file extension ``<OPERATION>.json.gz``.
        bucket_name: bucket whose object names are listed.

    Yields:
        bytes: UTF-8 encoded JSON with the keys ``dossierId``, ``fileId``,
        ``targetFileExtension``, ``responseFileExtension`` and ``operation``.
    """
    target_extension = "ORIGIN.pdf.gz"
    object_suffix = f".{target_extension}"

    storage = get_s3_storage(CONFIG)
    # The listing yields (bucket, object name) pairs; the bucket part is not needed and must
    # not overwrite the ``bucket_name`` argument.
    for _, object_name in storage.get_all_object_names(bucket_name):
        if not object_name.endswith(object_suffix):
            continue

        # Strip the known suffix instead of cutting at the first dot, so ids containing dots
        # survive, and split at the last slash so unexpected nesting cannot raise.
        dossier_id, separator, file_id = object_name[: -len(object_suffix)].rpartition("/")
        if not (separator and dossier_id and file_id):
            continue

        message_dict = {
            "dossierId": dossier_id,
            "fileId": file_id,
            "targetFileExtension": target_extension,
            "responseFileExtension": f"{operation.upper()}.json.gz",
            "operation": operation,
        }
        yield json.dumps(message_dict).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
    """Publish one request per stored PDF, then print and acknowledge the responses.

    Declares the request and response queues, publishes every body produced by
    ``build_message_bodies`` to the request queue, and then consumes the response queue
    until ``consume`` (called with ``inactivity_timeout=1``) hands back an empty body.

    Args:
        args: parsed command line; only ``args.operation`` is read.
    """
    connection = make_connection()
    try:
        channel = make_channel(connection)
        declare_queue(channel, CONFIG.request_queue)
        declare_queue(channel, CONFIG.response_queue)

        for body in build_message_bodies(args.operation, CONFIG.storage_bucket):
            channel.basic_publish("", CONFIG.request_queue, body)
            print(f"Put {body} on {CONFIG.request_queue}")

        for method_frame, _, body in channel.consume(queue=CONFIG.response_queue, inactivity_timeout=1):
            if not body:
                break
            print(f"Received {json.loads(body)}")
            channel.basic_ack(method_frame.delivery_tag)
        channel.close()
    finally:
        # Release the broker connection even when publishing or consuming fails.
        if connection.is_open:
            connection.close()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: parse the command line, then publish the requests.
    cli_args = parse_args()
    main(cli_args)
|
||||||
@ -1,64 +0,0 @@
|
|||||||
import argparse
|
|
||||||
import gzip
|
|
||||||
from operator import itemgetter
|
|
||||||
from typing import List
|
|
||||||
|
|
||||||
import fitz
|
|
||||||
import pdf2image
|
|
||||||
from funcy import lmap, compose, pluck
|
|
||||||
|
|
||||||
from pyinfra.default_objects import get_component_factory
|
|
||||||
|
|
||||||
from cv_analysis.config import CONFIG
|
|
||||||
from incl.pyinfra.test.utils.image import image_to_bytes
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
    """Parse the command line.

    All three options are mandatory: ``--pdf_path``/``-p``, ``--operation``/``-o``
    (``figure_detection`` or ``table_parsing``) and ``--result_path``/``-r``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--pdf_path", "-p", required=True)
    cli.add_argument("--operation", "-o", choices=["figure_detection", "table_parsing"], required=True)
    cli.add_argument("--result_path", "-r", required=True)
    return cli.parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def request_metadatas(dpi, n_metadata):
    """Return ``n_metadata`` separate metadata dicts, each of the form ``{"dpi": dpi}``.

    Args:
        dpi: value stored under the ``"dpi"`` key of every dict.
        n_metadata: number of dicts to create.
    """
    # range(n_metadata), not range(1, n_metadata): the latter produced one entry too few.
    return [{"dpi": dpi} for _ in range(n_metadata)]
|
|
||||||
|
|
||||||
|
|
||||||
def draw_cells_on_page(cells: List[dict], page):
    """Draw one rectangle on ``page`` for every cell.

    Each cell is read through its ``x``, ``y``, ``width`` and ``height`` keys and converted
    from (x, y, w, h) to the corner form (x0, y0, x1, y1) before ``page.draw_rect`` is called.
    """
    for cell in cells:
        left, top = cell["x"], cell["y"]
        right, bottom = left + cell["width"], top + cell["height"]
        page.draw_rect((left, top, right, bottom), color=(0.3, 0.7, 0.1), width=2, overlay=True)
|
|
||||||
|
|
||||||
|
|
||||||
def annotate_results_on_pdf(results, pdf_path, result_path):
    """Draw the cells of each result onto the matching PDF page and save a copy.

    Pages of the document at ``pdf_path`` are paired in order with the ``"metadata"`` entries
    of ``results``; for every non-empty metadata its ``"cells"`` are drawn on the page. The
    annotated document is written to ``result_path``.

    Args:
        results: iterable of dicts, each with a ``"metadata"`` key.
        pdf_path: PDF to open.
        result_path: where the annotated PDF is saved.
    """
    opened_pdf = fitz.open(pdf_path)
    try:
        metadata_per_page = pluck("metadata", results)

        for page, metadata in zip(opened_pdf, metadata_per_page):
            if metadata:
                draw_cells_on_page(metadata["cells"], page)
        opened_pdf.save(result_path)
    finally:
        # Release the document handle whether or not drawing/saving succeeded.
        opened_pdf.close()
|
|
||||||
|
|
||||||
|
|
||||||
def main(args):
    """Render the PDF to images, run them through the analysis pipeline and annotate the PDF.

    The pages of ``args.pdf_path`` are rendered at 200 dpi, converted to bytes and gzipped,
    then sent to the pipeline obtained for the ``args.operation`` endpoint of the configured
    webserver. The results are drawn onto the PDF, which is saved to ``args.result_path``.
    """
    dpi = 200

    pages = pdf2image.convert_from_path(args.pdf_path, dpi=dpi)
    images = [gzip.compress(image_to_bytes(page)) for page in pages]

    host, port = CONFIG.webserver.host, CONFIG.webserver.port
    submit_endpoint = f"http://{host}:{port}/{args.operation}"
    pipeline = get_component_factory(CONFIG).get_pipeline(submit_endpoint)

    metadata = request_metadatas(dpi, len(images))
    results = list(pipeline(data=images, metadata=metadata))

    annotate_results_on_pdf(results, args.pdf_path, args.result_path)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: parse the command line, then annotate the PDF.
    cli_args = parse_args()
    main(cli_args)
|
|
||||||
24
scripts/show_compressed_json.py
Normal file
24
scripts/show_compressed_json.py
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
import argparse
|
||||||
|
import gzip
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line: a single positional ``compressed_json_path``."""
    cli = argparse.ArgumentParser()
    cli.add_argument("compressed_json_path", help="Path to compressed JSON file")
    parsed = cli.parse_args()
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def main(fp):
    """Print the gzip-compressed JSON file at ``fp`` as JSON indented by two spaces."""
    with gzip.open(fp, "rb") as archive:
        raw_json = archive.read()

    document = json.loads(raw_json)
    pretty = json.dumps(document, indent=2)
    print(pretty)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: show the file named on the command line.
    main(parse_args().compressed_json_path)
|
||||||
@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
# Creates and activates a local build venv, upgrades pip, then builds the
# cv-analysis-base image followed by the cv-analysis image.
set -o errexit

python3 -m venv build_venv
. build_venv/bin/activate
python3 -m pip install --upgrade pip

# The DVC steps are disabled:
#pip install dvc
#pip install 'dvc[ssh]'
#dvc pull

docker build --file Dockerfile_base --tag cv-analysis-base .
docker build --file Dockerfile --tag cv-analysis .
|
|
||||||
51
src/serve.py
51
src/serve.py
@ -1,35 +1,42 @@
|
|||||||
|
import gzip
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from operator import itemgetter
|
||||||
|
|
||||||
from waitress import serve
|
from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
|
||||||
|
|
||||||
from cv_analysis.config import CONFIG
|
|
||||||
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
|
|
||||||
from cv_analysis.server.stream import make_streamable_analysis_fn
|
|
||||||
from cv_analysis.table_parsing import parse_tables
|
|
||||||
from cv_analysis.utils.banner import make_art
|
from cv_analysis.utils.banner import make_art
|
||||||
from cv_analysis.utils.logging import get_logger
|
from pyinfra import config as pyinfra_config
|
||||||
from incl.pyinfra.pyinfra.server.server import set_up_processing_server
|
from pyinfra.queue.queue_manager import QueueManager
|
||||||
|
from pyinfra.storage.storage import get_storage
|
||||||
|
|
||||||
|
PYINFRA_CONFIG = pyinfra_config.get_config()
|
||||||
|
|
||||||
|
logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def analysis_callback(queue_message: dict):
|
||||||
logger.info(make_art())
|
|
||||||
|
|
||||||
operation2function = {"table_parsing": parse_tables, "figure_detection": make_figure_detection_pipeline()}
|
dossier_id, file_id, target_file_ext, response_file_ext, operation = itemgetter(
|
||||||
operation2streamable_function = {op: make_streamable_analysis_fn(fn) for op, fn in operation2function.items()}
|
"dossierId", "fileId", "targetFileExtension", "responseFileExtension", "operation"
|
||||||
|
)(queue_message)
|
||||||
|
logging.info(f"Processing {dossier_id=}/{file_id=}, {operation=}.")
|
||||||
|
storage = get_storage(PYINFRA_CONFIG)
|
||||||
|
object_name = f"{dossier_id}/{file_id}.{target_file_ext}"
|
||||||
|
object_bytes = gzip.decompress(storage.get_object(PYINFRA_CONFIG.storage_bucket, object_name))
|
||||||
|
analysis_fn = make_analysis_pipeline(get_analysis_fn(operation))
|
||||||
|
|
||||||
server = set_up_processing_server(operation2streamable_function)
|
results = analysis_fn(object_bytes)
|
||||||
|
response = {**queue_message, "data": list(results)}
|
||||||
|
response = gzip.compress(json.dumps(response).encode())
|
||||||
|
response_name = f"{dossier_id}/{file_id}.{response_file_ext}"
|
||||||
|
|
||||||
serve(server, host=CONFIG.webserver.host, port=CONFIG.webserver.port, _quiet=False)
|
storage.put_object(PYINFRA_CONFIG.storage_bucket, response_name, response)
|
||||||
|
return {"dossierId": dossier_id, "fileId": file_id}
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
logging.basicConfig(level=CONFIG.service.logging_level)
|
|
||||||
|
|
||||||
logging.getLogger("pillow").setLevel(logging.ERROR)
|
logging.info(make_art())
|
||||||
logging.getLogger("PIL").setLevel(logging.ERROR)
|
|
||||||
logging.getLogger("flask").setLevel(logging.ERROR)
|
|
||||||
logging.getLogger("urllib3").setLevel(logging.ERROR)
|
|
||||||
|
|
||||||
logger = get_logger()
|
queue_manager = QueueManager(PYINFRA_CONFIG)
|
||||||
|
queue_manager.start_consuming(analysis_callback)
|
||||||
main()
|
|
||||||
|
|||||||
8
test/fixtures/figure_detection.py
vendored
8
test/fixtures/figure_detection.py
vendored
@ -8,15 +8,11 @@ from lorem_text import lorem
|
|||||||
from cv_analysis.figure_detection.figure_detection_pipeline import (
|
from cv_analysis.figure_detection.figure_detection_pipeline import (
|
||||||
make_figure_detection_pipeline,
|
make_figure_detection_pipeline,
|
||||||
)
|
)
|
||||||
from cv_analysis.utils.display import show_image
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def page_with_images(random_image, n_images, background):
|
def page_with_images(random_image, background):
|
||||||
# page_image = Image.fromarray(background.astype("uint8")).convert("RGB")
|
page_image = paste_image(background, random_image, (200, 200))
|
||||||
page_image = paste_image(page_image, random_image, (200, 200))
|
|
||||||
if n_images == 2:
|
|
||||||
page_image = paste_image(page_image, random_image, (1000, 2600))
|
|
||||||
return np.array(page_image)
|
return np.array(page_image)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
3
test/fixtures/server.py
vendored
3
test/fixtures/server.py
vendored
@ -7,7 +7,6 @@ import pytest
|
|||||||
from funcy import first
|
from funcy import first
|
||||||
|
|
||||||
from cv_analysis.utils.structures import Rectangle
|
from cv_analysis.utils.structures import Rectangle
|
||||||
from incl.pyinfra.pyinfra.server.packing import bytes_to_string
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -20,7 +19,7 @@ def random_image_as_bytes_and_compressed(random_image):
|
|||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def random_image_metadata_package(random_image_as_bytes_and_compressed):
|
def random_image_metadata_package(random_image_as_bytes_and_compressed):
|
||||||
data = bytes_to_string(random_image_as_bytes_and_compressed)
|
data = random_image_as_bytes_and_compressed.decode()
|
||||||
return [
|
return [
|
||||||
{
|
{
|
||||||
"data": data,
|
"data": data,
|
||||||
|
|||||||
9
test/fixtures/table_parsing.py
vendored
9
test/fixtures/table_parsing.py
vendored
@ -1,24 +1,27 @@
|
|||||||
import json
|
import json
|
||||||
from os.path import join
|
from os.path import join
|
||||||
|
|
||||||
import cv2
|
import cv2
|
||||||
import pytest
|
import pytest
|
||||||
from funcy import first
|
from funcy import first
|
||||||
|
|
||||||
from cv_analysis.locations import TEST_DATA_DIR
|
from cv_analysis.config import get_config
|
||||||
from cv_analysis.utils.draw import draw_rectangles
|
from cv_analysis.utils.draw import draw_rectangles
|
||||||
from cv_analysis.utils.open_pdf import open_pdf
|
from cv_analysis.utils.open_pdf import open_pdf
|
||||||
from test.fixtures.figure_detection import paste_text
|
from test.fixtures.figure_detection import paste_text
|
||||||
|
|
||||||
|
CV_CONFIG = get_config()
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def client_page_with_table(test_file_index):
|
def client_page_with_table(test_file_index):
|
||||||
img_path = join(TEST_DATA_DIR, f"test{test_file_index}.png")
|
img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png")
|
||||||
return first(open_pdf(img_path))
|
return first(open_pdf(img_path))
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def expected_table_annotation(test_file_index):
|
def expected_table_annotation(test_file_index):
|
||||||
json_path = join(TEST_DATA_DIR, f"test{test_file_index}.json")
|
json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
|
||||||
with open(json_path) as f:
|
with open(json_path) as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|||||||
@ -14,7 +14,6 @@ class TestFindPrimaryTextRegions:
|
|||||||
assert not list(results)
|
assert not list(results)
|
||||||
|
|
||||||
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
||||||
@pytest.mark.parametrize("n_images", [1])
|
|
||||||
def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size):
|
def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size):
|
||||||
results = figure_detection_pipeline(page_with_images)
|
results = figure_detection_pipeline(page_with_images)
|
||||||
result_figures_size = map(lambda x: (x.w, x.h), results)
|
result_figures_size = map(lambda x: (x.w, x.h), results)
|
||||||
@ -35,7 +34,6 @@ class TestFindPrimaryTextRegions:
|
|||||||
assert error <= error_tolerance
|
assert error <= error_tolerance
|
||||||
|
|
||||||
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
||||||
@pytest.mark.parametrize("n_images", [1, 2])
|
|
||||||
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
|
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
|
||||||
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
|
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
|
||||||
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
|
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
|
||||||
@ -45,13 +43,12 @@ class TestFindPrimaryTextRegions:
|
|||||||
figure_detection_pipeline,
|
figure_detection_pipeline,
|
||||||
page_with_images_and_text,
|
page_with_images_and_text,
|
||||||
image_size,
|
image_size,
|
||||||
n_images,
|
|
||||||
error_tolerance,
|
error_tolerance,
|
||||||
):
|
):
|
||||||
results = list(figure_detection_pipeline(page_with_images_and_text))
|
results = list(figure_detection_pipeline(page_with_images_and_text))
|
||||||
|
|
||||||
result_figures_area = sum(map(lambda x: (x.w * x.h), results))
|
result_figures_area = sum(map(lambda x: (x.w * x.h), results))
|
||||||
expected_figure_area = n_images * prod(image_size)
|
expected_figure_area = prod(image_size)
|
||||||
|
|
||||||
error = abs(result_figures_area - expected_figure_area) / expected_figure_area
|
error = abs(result_figures_area - expected_figure_area) / expected_figure_area
|
||||||
|
|
||||||
|
|||||||
@ -6,7 +6,6 @@ from cv_analysis.figure_detection.text import (
|
|||||||
remove_primary_text_regions,
|
remove_primary_text_regions,
|
||||||
apply_threshold_to_image,
|
apply_threshold_to_image,
|
||||||
)
|
)
|
||||||
from cv_analysis.utils.display import show_image
|
|
||||||
from test.utils.utils import powerset
|
from test.utils.utils import powerset
|
||||||
|
|
||||||
|
|
||||||
@ -19,7 +18,6 @@ class TestFindPrimaryTextRegions:
|
|||||||
np.testing.assert_equal(result_page, apply_threshold_to_image(background))
|
np.testing.assert_equal(result_page, apply_threshold_to_image(background))
|
||||||
|
|
||||||
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
||||||
@pytest.mark.parametrize("n_images", [1, 2])
|
|
||||||
def test_page_without_text_keeps_images(self, page_with_images, error_tolerance):
|
def test_page_without_text_keeps_images(self, page_with_images, error_tolerance):
|
||||||
result_page = remove_primary_text_regions(page_with_images)
|
result_page = remove_primary_text_regions(page_with_images)
|
||||||
np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images))
|
np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images))
|
||||||
@ -33,7 +31,6 @@ class TestFindPrimaryTextRegions:
|
|||||||
assert relative_error <= error_tolerance
|
assert relative_error <= error_tolerance
|
||||||
|
|
||||||
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
|
||||||
@pytest.mark.parametrize("n_images", [1, 2])
|
|
||||||
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
|
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
|
||||||
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
|
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
|
||||||
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
|
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
|
||||||
|
|||||||
@ -1,6 +0,0 @@
|
|||||||
from cv_analysis.config import CONFIG
|
|
||||||
|
|
||||||
|
|
||||||
def test_config():
|
|
||||||
assert CONFIG.service
|
|
||||||
assert CONFIG.webserver
|
|
||||||
Loading…
x
Reference in New Issue
Block a user