Pull request #20: New pyinfra

Merge in RR/cv-analysis from new_pyinfra to master

Squashed commit of the following:

commit f7a01a90aad1c402ac537de5bdf15df628ad54df
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jul 27 10:40:59 2022 +0200

    fix typo

commit ff4d549fac5b612c2d391ae85823c5eca1e91916
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jul 27 10:34:04 2022 +0200

    adjust build scripts for new pyinfra

commit ecd70f60d46406d8b6cc7f36a1533d706c917ca8
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jul 27 09:42:55 2022 +0200

    simplify logging by using default configurations

commit 20193c14c940eed2b0a7a72058167e26064119d0
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Tue Jul 26 17:16:57 2022 +0200

    tidy-up, refactor config logic to not depend on external files

commit d8069cd4d404a570bb04a04278161669d1c83332
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Tue Jul 26 15:14:59 2022 +0200

    update pyinfra

commit c3bc11037cca9baf016043ab997c566f5b4a2586
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Tue Jul 26 15:09:14 2022 +0200

    repair tests

commit 6f4e4f2863ee16ae056c1d432f663858c5f10221
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Tue Jul 26 14:52:38 2022 +0200

    updated server logic to work with new pyinfra; update scripts for pyinfra as submodule

commit 2a18dba81de5ee84d0bdf0e77f478693e8d8aef4
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Tue Jul 26 14:10:41 2022 +0200

    formatting

commit d87ce9328de9aa2341228af9b24473d5e583504e
Author: Isaac Riley <Isaac.Riley@iqser.com>
Date:   Tue Jul 26 14:10:11 2022 +0200

    make server logic compatible with new pyinfra
This commit is contained in:
Isaac Riley 2022-07-27 10:50:10 +02:00 committed by Julius Unverfehrt
parent 1618909d8e
commit 9d98945ff9
31 changed files with 329 additions and 301 deletions

View File

@ -1,20 +1,25 @@
ARG BASE_ROOT="nexus.iqser.com:5001/red/"
ARG VERSION_TAG=latest
FROM python:3.10
FROM ${BASE_ROOT}cv-analysis-base:${VERSION_TAG}
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
RUN python -m pip install --upgrade pip
WORKDIR /app/service
COPY ./requirements.txt ./requirements.txt
RUN python3 -m pip install -r requirements.txt
COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt
RUN python -m pip install -r incl/pyinfra/requirements.txt
COPY ./incl ./incl
RUN python3 -m pip install -e incl/pyinfra
COPY ./src ./src
COPY cv_analysis ./cv_analysis
COPY config.yaml ./config.yaml
COPY ./cv_analysis ./cv_analysis
COPY ./setup.py ./setup.py
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install -e .
WORKDIR /app/service
EXPOSE 5000
EXPOSE 8080
CMD ["python3", "src/serve.py"]
CMD ["python3", "-u", "src/serve.py"]

View File

@ -1,31 +0,0 @@
FROM python:3.10 as builder1
# Use a virtual environment.
RUN python -m venv /app/venv
ENV PATH="/app/venv/bin:$PATH"
# Upgrade pip.
RUN python -m pip install --upgrade pip
# Make a directory for the service files and copy the service repo into the container.
WORKDIR /app/service
COPY . ./
# Install dependencies.
RUN python3 -m pip install -r requirements.txt
RUN python3 -m pip install -r incl/pyinfra/requirements.txt
RUN python3 -m pip install -e incl/pyinfra
# Make a new container and copy all relevant files over to filter out temporary files
# produced during setup to reduce the final container's size.
FROM python:3.10
WORKDIR /app/
COPY --from=builder1 /app .
ENV PATH="/app/venv/bin:$PATH"
WORKDIR /app/service
RUN apt update
#RUN apt install python3-opencv-headless
RUN apt install poppler-utils --yes

View File

@ -34,7 +34,6 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location;
public class PlanSpec {
private static final String SERVICE_NAME = "cv-analysis";
private static final String SERVICE_NAME_BASE = "cv-analysis-base";
private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_","");
@ -72,7 +71,7 @@ public class PlanSpec {
return new Plan(
project(),
SERVICE_NAME, new BambooKey(SERVICE_KEY))
.description("Docker build for cv-analysis.")
// .description("Docker build for cv-analysis.")
// .variables()
.stages(new Stage("Build Stage")
.jobs(
@ -84,9 +83,6 @@ public class PlanSpec {
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new VcsCheckoutTask()
.description("Checkout pyinfra research repository.")
.checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
@ -98,10 +94,10 @@ public class PlanSpec {
.description("Build Docker container.")
.location(Location.FILE)
.fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh")
.argument(SERVICE_NAME + " " + SERVICE_NAME_BASE))
.argument(SERVICE_NAME))
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.2.0")
.image("nexus.iqser.com:5001/infra/release_build:4.5.0")
.volume("/var/run/docker.sock", "/var/run/docker.sock")),
new Job("Sonar Job", new BambooKey("SONAR"))
.tasks(
@ -111,9 +107,6 @@ public class PlanSpec {
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
new VcsCheckoutTask()
.description("Checkout pyinfra research repository.")
.checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")),
new ScriptTask()
.description("Set config and keys.")
.inlineBody("mkdir -p ~/.ssh\n" +
@ -134,6 +127,9 @@ public class PlanSpec {
.jobs(
new Job("Git Tag Job", new BambooKey("GITTAG"))
.tasks(
new CleanWorkingDirectoryTask()
.description("Clean working directory.")
.enabled(true),
new VcsCheckoutTask()
.description("Checkout default repository.")
.checkoutItems(new CheckoutItem().defaultRepository()),
@ -152,7 +148,7 @@ public class PlanSpec {
.defaultRepository())
.dockerConfiguration(
new DockerConfiguration()
.image("nexus.iqser.com:5001/infra/release_build:4.4.1")),
.image("nexus.iqser.com:5001/infra/release_build:4.5.0")),
new Job("Licence Job", new BambooKey("LICENCE"))
.enabled(false)
.tasks(
@ -169,7 +165,6 @@ public class PlanSpec {
.volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml")
.volume("/var/run/docker.sock", "/var/run/docker.sock"))))
.linkedRepositories("RR / " + SERVICE_NAME)
.linkedRepositories("RR / pyinfra")
.triggers(new BitbucketServerTrigger())
.planBranchManagement(new PlanBranchManagement()
.createForVcsBranch()

View File

@ -2,8 +2,7 @@
set -e
SERVICE_NAME=$1
SERVICE_NAME_BASE=$2
# TODO version tag on master push
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
@ -13,7 +12,6 @@ pip install 'dvc[ssh]'
dvc pull
echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf
docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} .
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001
docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} .
docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag}

View File

@ -7,17 +7,20 @@ python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
pip install -e .
pip install -e incl/pyinfra
echo "dev setup for unit test and coverage"
pip install -e incl/pyinfra
pip install -r incl/pyinfra/requirements.txt
pip install -e .
pip install -r requirements.txt
echo "DVC pull step"
dvc pull
echo "coverage calculation"
coverage run -m pytest test
coverage run -m pytest
echo "coverage report generation"
coverage report -m
coverage xml
@ -28,7 +31,7 @@ echo "dependency-check:aggregate"
mkdir -p reports
dependency-check --enableExperimental -f JSON -f HTML -f XML \
--disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \
--exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**"
--exclude "build_venv/**" --exclude "**/__pycache__/**"
if [[ -z "${bamboo_repository_pr_key}" ]]
then

View File

@ -1,12 +0,0 @@
service:
logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for log file messages
monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not
logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log)
webserver:
host: $SERVER_HOST|"127.0.0.1" # webserver address
port: $SERVER_PORT|5000 # webserver port
visual_logging:
level: DISABLED # NOTHING > INFO > DEBUG > ALL
output_folder: /tmp/debug/

View File

@ -1,39 +1,30 @@
"""Implements a config object with dot-indexing syntax."""
import os
from envyaml import EnvYAML
from cv_analysis.locations import CONFIG_FILE
def _get_item_and_maybe_make_dotindexable(container, item):
ret = container[item]
return DotIndexable(ret) if isinstance(ret, dict) else ret
class DotIndexable:
def __init__(self, x):
self.x = x
def __getattr__(self, item):
return _get_item_and_maybe_make_dotindexable(self.x, item)
def __setitem__(self, key, value):
self.x[key] = value
def __repr__(self):
return self.x.__repr__()
def get_config():
return Config()
class Config:
def __init__(self, config_path):
self.__config = EnvYAML(config_path)
def __init__(self):
self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
def __getattr__(self, item):
if item in self.__config:
return _get_item_and_maybe_make_dotindexable(self.__config, item)
# visual_logging_level: NOTHING > INFO > DEBUG > ALL
self.visual_logging_level = "DISABLED"
self.visual_logging_output_folder = "/tmp/debug"
def __getitem__(self, item):
return self.__getattr__(item)
# locations
# FIXME: is everything here necessary?
root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
self.dvc_data_dir = os.path.join(root, "data")
self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
self.test_dir = os.path.join(root, "test")
self.test_data_dir = os.path.join(self.test_dir, "test_data")
CONFIG = Config(CONFIG_FILE)
def __getitem__(self, key):
return self.__getattribute__(key)

View File

@ -1,22 +0,0 @@
"""Defines constant paths relative to the module root path."""
from os import path
MODULE_DIR = path.dirname(path.abspath(__file__))
PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR)
CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml")
LOG_FILE = "/tmp/log.log"
DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data")
PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing")
PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing")
PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected")
PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp")
HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed")
METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv")
TEST_DIR = path.join(PACKAGE_ROOT_DIR, "test")
TEST_DATA_DIR = path.join(TEST_DIR, "test_data")

View File

@ -2,7 +2,6 @@ from functools import partial
import cv2
import numpy as np
import pdf2image
from iteration_utilities import starfilter, first
from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy

View File

@ -7,23 +7,20 @@ from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_d
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.server.rotate import rotate_rectangle
from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.logging import get_logger
from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs
from cv_analysis.utils.structures import Rectangle
logger = get_logger()
def make_analysis_pipeline(analysis_fn: Callable, dpi=200):
"""Make end-to-end pipeline to analyse a PDF with given analysis function.
The pipeline returns a Generator of dicts containing page information and the analysis results.
Steps:
Convert PDF to Arrays and page information
Analise pages, get list of bboxes per page (e.g. table cells)
Convert PDF to pairs of image and page information
Analyse pages, get list of bounding boxes per page (e.g. table cells)
Convert pixel values to inches
Rotate results if page is rotated
Format results to stream of dictionaries
Format results to stream of dictionaries with page information and analysis results
"""
def pipeline(pdf: bytes, index=None):

View File

@ -1,16 +1,15 @@
from functools import partial
from itertools import chain, starmap
from operator import attrgetter
import cv2
import numpy as np
from funcy import lmap
from cv_analysis.layout_parsing import parse_layout
from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d
from cv_analysis.utils.structures import Rectangle
from cv_analysis.utils.visual_logging import vizlogger
from cv_analysis.layout_parsing import parse_layout
def add_external_contours(image, image_h_w_lines_only):

View File

@ -1,4 +1,3 @@
from numpy import resize
import cv2
from matplotlib import pyplot as plt

View File

@ -1,26 +0,0 @@
"""Defines the default logger for the service."""
import sys
import logging
from cv_analysis.config import CONFIG
def make_logger_getter():
logger = logging.getLogger(__name__)
logger.setLevel(logging.getLevelName(CONFIG.service.logging_level))
formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S")
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.getLevelName(CONFIG.service.logging_level))
ch.setFormatter(formatter)
logger.addHandler(ch)
logger.propagate = False
def get_logger():
return logger
return get_logger
get_logger = make_logger_getter()

View File

@ -1,6 +1,6 @@
from dataclasses import dataclass
from functools import partial
from typing import Iterator, Tuple
from typing import Iterator
import fitz
import numpy as np
@ -14,7 +14,8 @@ class ImageMetadataPair:
def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]:
"""Streams PDF as pairs of image (matrix) and metadata.
Note: If Index is not given or evaluates to None, the whole PDF will be processed."""
Note: If Index is not given or evaluates to None, the whole PDF will be processed.
"""
convert_fn = partial(page_to_image_metadata_pair, dpi=dpi)
yield from map(convert_fn, stream_pages(pdf, index))

View File

@ -1,7 +1,10 @@
import os
from cv_analysis.config import CONFIG
from cv_analysis.config import get_config
from cv_analysis.utils.display import save_image
CV_CONFIG = get_config()
class VisualLogger:
def __init__(self, level, output_folder):
@ -36,4 +39,4 @@ class VisualLogger:
return self.level == "ALL"
vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder)
vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)

31
docker-compose.yaml Normal file
View File

@ -0,0 +1,31 @@
version: '2'
services:
minio:
image: minio/minio
ports:
- "9000:9000"
environment:
- MINIO_ROOT_PASSWORD=password
- MINIO_ROOT_USER=root
volumes:
- ./data/minio_store:/data
command: server /data
network_mode: "bridge"
rabbitmq:
image: docker.io/bitnami/rabbitmq:3.9
ports:
- '4369:4369'
- '5551:5551'
- '5552:5552'
- '5672:5672'
- '25672:25672'
- '15672:15672'
environment:
- RABBITMQ_SECURE_PASSWORD=yes
- RABBITMQ_VM_MEMORY_HIGH_WATERMARK=100%
- RABBITMQ_DISK_FREE_ABSOLUTE_LIMIT=20Gi
network_mode: "bridge"
volumes:
- /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami
volumes:
mdata:

@ -1 +1 @@
Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864
Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9

View File

@ -1,2 +1,4 @@
[pytest]
norecursedirs = incl
testpaths = test
addopts = --ignore=data

View File

@ -1,11 +1,15 @@
import hashlib
import json
import os
from itertools import chain
from os import path
import pandas as pd
from pdf2image import convert_from_path
from itertools import chain
import json
from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING
from cv_analysis.config import get_config
CV_CONFIG = get_config()
def read_json(path):
@ -22,7 +26,7 @@ def collect_metadata(example_pages, save=False):
metadata = list(chain.from_iterable(metadata))
if save:
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv"))
df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv"))
else:
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
@ -46,7 +50,7 @@ def make_metadata_entry_maker():
def split_pdf(example_pages):
dir_path = PDF_FOR_TESTING
dir_path = CV_CONFIG.pdf_for_testing
i = 0
for name, document_sections in example_pages.items():
for pages in document_sections:
@ -54,7 +58,7 @@ def split_pdf(example_pages):
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
)
for image in images:
fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png")
fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png")
image.save(fp=fp, dpi=(300, 300))
i += 1
@ -74,7 +78,7 @@ def find_hash(file_path):
def rename_files_with_hash(example_pages):
files_to_rename = list(example_pages.keys())
folder = HASHED_PDFS_FOR_TESTING
folder = CV_CONFIG.hashed_pdfs_for_testing
# Iterate through the folder
for file in os.listdir(folder):
@ -99,7 +103,7 @@ def rename_files_with_hash(example_pages):
def main():
examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json"))
examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json"))
rename_files_with_hash(examples_pages)
# collect_metadata(examples_pages, save=True)
# split_pdf(examples_pages)

68
scripts/manage_minio.py Normal file
View File

@ -0,0 +1,68 @@
import argparse
import gzip
import os
from pathlib import Path
from tqdm import tqdm
from pyinfra.config import get_config
from pyinfra.storage.storage import get_s3_storage
CONFIG = get_config()
def parse_args():
parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers(help="sub-command help", dest="command")
parser_add = subparsers.add_parser("add", help="Add file(s) to the MinIO store")
parser_add.add_argument("dossier_id")
add_group = parser_add.add_mutually_exclusive_group(required=True)
add_group.add_argument("--file", "-f")
add_group.add_argument("--directory", "-d")
subparsers.add_parser("purge", help="Delete all files and buckets in the MinIO store")
args = parser.parse_args()
return args
def combine_dossier_id_and_file_id_and_extension(dossier_id, file_id, extension):
return f"{dossier_id}/{file_id}{extension}"
def add_file_compressed(storage, bucket_name, dossier_id, path) -> None:
    """Gzip a local PDF or JSON file and upload it under the dossier's prefix.

    The object name is built from ``dossier_id``, the file's stem and a suffix
    chosen by extension: ``.ORIGIN.pdf.gz`` for ``.pdf`` files and
    ``.TEXT.json.gz`` for ``.json`` files.

    Args:
        storage: Storage client; only ``put_object(bucket, name, data)`` is called.
        bucket_name: Name of the bucket the object is written to.
        dossier_id: Prefix under which the object is stored.
        path: Path (``str`` or ``pathlib.Path``) of the local file to upload.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor ``.json``.
    """
    suffix_by_extension = {".pdf": ".ORIGIN.pdf.gz", ".json": ".TEXT.json.gz"}
    extension = Path(path).suffix
    if extension not in suffix_by_extension:
        # Without this guard the suffix variable was never assigned and the
        # call died with an UnboundLocalError that did not name the file.
        raise ValueError(f"Unsupported file type {extension!r} for {path}: expected .pdf or .json")

    path_gz = combine_dossier_id_and_file_id_and_extension(
        dossier_id, Path(path).stem, suffix_by_extension[extension]
    )

    with open(path, "rb") as f:
        data = gzip.compress(f.read())
    storage.put_object(bucket_name, path_gz, data)
if __name__ == "__main__":
storage = get_s3_storage(CONFIG)
bucket_name = CONFIG.storage_bucket
if not storage.has_bucket(bucket_name):
storage.make_bucket(bucket_name)
args = parse_args()
if args.command == "add":
if args.file:
add_file_compressed(storage, bucket_name, args.dossier_id, args.file)
elif args.directory:
for fname in tqdm([*os.listdir(args.directory)], desc="Adding files"):
path = Path(args.directory) / fname
add_file_compressed(storage, bucket_name, args.dossier_id, path)
elif args.command == "purge":
storage.clear_bucket(bucket_name)

View File

@ -0,0 +1,84 @@
import argparse
import json
import pika
from pyinfra.config import get_config
from pyinfra.storage.storage import get_s3_storage
CONFIG = get_config()
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--operation", "-o", choices=["table", "layout", "figure"], required=True)
args = parser.parse_args()
return args
def read_connection_params():
credentials = pika.PlainCredentials(CONFIG.rabbitmq_username, CONFIG.rabbitmq_password)
parameters = pika.ConnectionParameters(
host=CONFIG.rabbitmq_host,
port=CONFIG.rabbitmq_port,
heartbeat=int(CONFIG.rabbitmq_heartbeat),
credentials=credentials,
)
return parameters
def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel:
channel = connection.channel()
channel.basic_qos(prefetch_count=1)
return channel
def declare_queue(channel, queue: str):
args = {"x-dead-letter-exchange": "", "x-dead-letter-routing-key": CONFIG.dead_letter_queue}
return channel.queue_declare(queue=queue, auto_delete=False, durable=True, arguments=args)
def make_connection() -> pika.BlockingConnection:
parameters = read_connection_params()
connection = pika.BlockingConnection(parameters)
return connection
def build_message_bodies(operation, bucket_name):
    """Yield one JSON-encoded request message per original PDF object in the bucket.

    Only objects whose name ends in ``.ORIGIN.pdf.gz`` are considered, because
    that is the ``targetFileExtension`` every message advertises. The part
    before that suffix is split on its last ``/`` into dossier id and file id,
    so file ids containing dots are kept intact.

    Args:
        operation: Operation name; copied into the message and upper-cased to
            form the ``responseFileExtension`` (``<OPERATION>.json.gz``).
        bucket_name: Bucket whose object names are listed.

    Yields:
        bytes: UTF-8 encoded JSON with the keys ``dossierId``, ``fileId``,
        ``targetFileExtension``, ``responseFileExtension`` and ``operation``.
    """
    target_file_extension = "ORIGIN.pdf.gz"
    target_suffix = f".{target_file_extension}"

    storage = get_s3_storage(CONFIG)
    # The listing yields pairs; only the second element (the object name) is
    # needed. The first element is ignored so it no longer shadows the parameter.
    for _, object_name in storage.get_all_object_names(bucket_name):
        # A plain '"pdf" in name' test also matched non-PDF objects whose file
        # id merely contained "pdf"; match the exact suffix instead.
        if not object_name.endswith(target_suffix):
            continue
        dossier_id, separator, file_id = object_name[: -len(target_suffix)].rpartition("/")
        if not separator:
            # No dossier prefix, so no valid request can be built for this object.
            continue
        message_dict = {
            "dossierId": dossier_id,
            "fileId": file_id,
            "targetFileExtension": target_file_extension,
            "responseFileExtension": f"{operation.upper()}.json.gz",
            "operation": operation,
        }
        yield json.dumps(message_dict).encode()
def main(args):
    """Publish one request per stored PDF, then print and acknowledge the responses.

    Declares the request and response queues, publishes every body produced by
    ``build_message_bodies`` to the request queue, and then consumes the
    response queue until a poll comes back without a body.

    Args:
        args: Parsed command-line arguments; only ``args.operation`` is read.
    """
    connection = make_connection()
    try:
        channel = make_channel(connection)
        declare_queue(channel, CONFIG.request_queue)
        declare_queue(channel, CONFIG.response_queue)

        for body in build_message_bodies(args.operation, CONFIG.storage_bucket):
            channel.basic_publish("", CONFIG.request_queue, body)
            print(f"Put {body} on {CONFIG.request_queue}")

        # With inactivity_timeout set, consume() hands back an empty body once
        # the queue has been idle for that long; that is the stop signal here.
        for method_frame, _, body in channel.consume(queue=CONFIG.response_queue, inactivity_timeout=1):
            if not body:
                break
            print(f"Received {json.loads(body)}")
            channel.basic_ack(method_frame.delivery_tag)
        channel.close()
    finally:
        # The connection used to be left open, also when an error occurred.
        # Only close it while it is still open so a broker-side disconnect
        # does not mask the original exception.
        if connection.is_open:
            connection.close()
if __name__ == "__main__":
main(parse_args())

View File

@ -1,64 +0,0 @@
import argparse
import gzip
from operator import itemgetter
from typing import List
import fitz
import pdf2image
from funcy import lmap, compose, pluck
from pyinfra.default_objects import get_component_factory
from cv_analysis.config import CONFIG
from incl.pyinfra.test.utils.image import image_to_bytes
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--pdf_path", "-p", required=True)
parser.add_argument("--operation", "-o", choices=["figure_detection", "table_parsing"], required=True)
parser.add_argument("--result_path", "-r", required=True)
args = parser.parse_args()
return args
def request_metadatas(dpi, n_metadata):
    """Build ``n_metadata`` separate ``{"dpi": dpi}`` metadata dictionaries.

    Args:
        dpi: Resolution value stored under the ``"dpi"`` key of every entry.
        n_metadata: Number of entries to create; the caller passes the number
            of page images.

    Returns:
        list[dict]: ``n_metadata`` independent dictionaries.
    """
    # range(1, n_metadata) produced one entry too few, so the last image had
    # no metadata to pair with.
    return [{"dpi": dpi} for _ in range(n_metadata)]
def draw_cells_on_page(cells: List[dict], page):
def format_xywh_to_x0y0x1y1(rect):
x, y, w, h = rect
return x, y, x + w, y + h
rects = map(itemgetter("x", "y", "width", "height"), cells)
rects = map(format_xywh_to_x0y0x1y1, rects)
for rect in rects:
page.draw_rect(rect, color=(0.3, 0.7, 0.1), width=2, overlay=True)
def annotate_results_on_pdf(results, pdf_path, result_path):
opened_pdf = fitz.open(pdf_path)
metadata_per_page = pluck("metadata", results)
for page, metadata in zip(opened_pdf, metadata_per_page):
if metadata:
draw_cells_on_page(metadata["cells"], page)
opened_pdf.save(result_path)
def main(args):
dpi = 200
images = lmap(compose(gzip.compress, image_to_bytes), pdf2image.convert_from_path(args.pdf_path, dpi=dpi))
submit_endpoint = f"http://{CONFIG.webserver.host}:{CONFIG.webserver.port}/{args.operation}"
pipeline = get_component_factory(CONFIG).get_pipeline(submit_endpoint)
results = list(pipeline(data=images, metadata=request_metadatas(dpi, len(images))))
annotate_results_on_pdf(results, args.pdf_path, args.result_path)
if __name__ == "__main__":
main(parse_args())

View File

@ -0,0 +1,24 @@
import argparse
import gzip
import json
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("compressed_json_path", help="Path to compressed JSON file")
return parser.parse_args()
def main(fp):
    """Print the gzip-compressed JSON document stored at ``fp`` as indented JSON.

    Args:
        fp: Path of the gzip-compressed JSON file to read.
    """
    # gzip.open decompresses while reading, so there is no intermediate bytes
    # object (the old one was misleadingly named like a path).
    with gzip.open(fp, "rb") as f:
        parsed = json.load(f)
    print(json.dumps(parsed, indent=2))
if __name__ == "__main__":
args = parse_args()
main(args.compressed_json_path)

View File

@ -1,13 +0,0 @@
#!/bin/bash
set -e
python3 -m venv build_venv
source build_venv/bin/activate
python3 -m pip install --upgrade pip
#pip install dvc
#pip install 'dvc[ssh]'
#dvc pull
docker build -f Dockerfile_base -t cv-analysis-base .
docker build -f Dockerfile -t cv-analysis .

View File

@ -1,35 +1,42 @@
import gzip
import json
import logging
from operator import itemgetter
from waitress import serve
from cv_analysis.config import CONFIG
from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline
from cv_analysis.server.stream import make_streamable_analysis_fn
from cv_analysis.table_parsing import parse_tables
from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline
from cv_analysis.utils.banner import make_art
from cv_analysis.utils.logging import get_logger
from incl.pyinfra.pyinfra.server.server import set_up_processing_server
from pyinfra import config as pyinfra_config
from pyinfra.queue.queue_manager import QueueManager
from pyinfra.storage.storage import get_storage
PYINFRA_CONFIG = pyinfra_config.get_config()
logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root)
def main():
logger.info(make_art())
def analysis_callback(queue_message: dict):
operation2function = {"table_parsing": parse_tables, "figure_detection": make_figure_detection_pipeline()}
operation2streamable_function = {op: make_streamable_analysis_fn(fn) for op, fn in operation2function.items()}
dossier_id, file_id, target_file_ext, response_file_ext, operation = itemgetter(
"dossierId", "fileId", "targetFileExtension", "responseFileExtension", "operation"
)(queue_message)
logging.info(f"Processing {dossier_id=}/{file_id=}, {operation=}.")
storage = get_storage(PYINFRA_CONFIG)
object_name = f"{dossier_id}/{file_id}.{target_file_ext}"
object_bytes = gzip.decompress(storage.get_object(PYINFRA_CONFIG.storage_bucket, object_name))
analysis_fn = make_analysis_pipeline(get_analysis_fn(operation))
server = set_up_processing_server(operation2streamable_function)
results = analysis_fn(object_bytes)
response = {**queue_message, "data": list(results)}
response = gzip.compress(json.dumps(response).encode())
response_name = f"{dossier_id}/{file_id}.{response_file_ext}"
serve(server, host=CONFIG.webserver.host, port=CONFIG.webserver.port, _quiet=False)
storage.put_object(PYINFRA_CONFIG.storage_bucket, response_name, response)
return {"dossierId": dossier_id, "fileId": file_id}
if __name__ == "__main__":
logging.basicConfig(level=CONFIG.service.logging_level)
logging.getLogger("pillow").setLevel(logging.ERROR)
logging.getLogger("PIL").setLevel(logging.ERROR)
logging.getLogger("flask").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
logging.info(make_art())
logger = get_logger()
main()
queue_manager = QueueManager(PYINFRA_CONFIG)
queue_manager.start_consuming(analysis_callback)

View File

@ -8,15 +8,11 @@ from lorem_text import lorem
from cv_analysis.figure_detection.figure_detection_pipeline import (
make_figure_detection_pipeline,
)
from cv_analysis.utils.display import show_image
@pytest.fixture
def page_with_images(random_image, n_images, background):
# page_image = Image.fromarray(background.astype("uint8")).convert("RGB")
page_image = paste_image(page_image, random_image, (200, 200))
if n_images == 2:
page_image = paste_image(page_image, random_image, (1000, 2600))
def page_with_images(random_image, background):
page_image = paste_image(background, random_image, (200, 200))
return np.array(page_image)

View File

@ -7,7 +7,6 @@ import pytest
from funcy import first
from cv_analysis.utils.structures import Rectangle
from incl.pyinfra.pyinfra.server.packing import bytes_to_string
@pytest.fixture
@ -20,7 +19,7 @@ def random_image_as_bytes_and_compressed(random_image):
@pytest.fixture
def random_image_metadata_package(random_image_as_bytes_and_compressed):
data = bytes_to_string(random_image_as_bytes_and_compressed)
data = random_image_as_bytes_and_compressed.decode()
return [
{
"data": data,

View File

@ -1,24 +1,27 @@
import json
from os.path import join
import cv2
import pytest
from funcy import first
from cv_analysis.locations import TEST_DATA_DIR
from cv_analysis.config import get_config
from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.open_pdf import open_pdf
from test.fixtures.figure_detection import paste_text
CV_CONFIG = get_config()
@pytest.fixture
def client_page_with_table(test_file_index):
img_path = join(TEST_DATA_DIR, f"test{test_file_index}.png")
img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png")
return first(open_pdf(img_path))
@pytest.fixture
def expected_table_annotation(test_file_index):
json_path = join(TEST_DATA_DIR, f"test{test_file_index}.json")
json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
with open(json_path) as f:
return json.load(f)

View File

@ -14,7 +14,6 @@ class TestFindPrimaryTextRegions:
assert not list(results)
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
@pytest.mark.parametrize("n_images", [1])
def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size):
results = figure_detection_pipeline(page_with_images)
result_figures_size = map(lambda x: (x.w, x.h), results)
@ -35,7 +34,6 @@ class TestFindPrimaryTextRegions:
assert error <= error_tolerance
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
@pytest.mark.parametrize("n_images", [1, 2])
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))
@ -45,13 +43,12 @@ class TestFindPrimaryTextRegions:
figure_detection_pipeline,
page_with_images_and_text,
image_size,
n_images,
error_tolerance,
):
results = list(figure_detection_pipeline(page_with_images_and_text))
result_figures_area = sum(map(lambda x: (x.w * x.h), results))
expected_figure_area = n_images * prod(image_size)
expected_figure_area = prod(image_size)
error = abs(result_figures_area - expected_figure_area) / expected_figure_area

View File

@ -6,7 +6,6 @@ from cv_analysis.figure_detection.text import (
remove_primary_text_regions,
apply_threshold_to_image,
)
from cv_analysis.utils.display import show_image
from test.utils.utils import powerset
@ -19,7 +18,6 @@ class TestFindPrimaryTextRegions:
np.testing.assert_equal(result_page, apply_threshold_to_image(background))
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
@pytest.mark.parametrize("n_images", [1, 2])
def test_page_without_text_keeps_images(self, page_with_images, error_tolerance):
result_page = remove_primary_text_regions(page_with_images)
np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images))
@ -33,7 +31,6 @@ class TestFindPrimaryTextRegions:
assert relative_error <= error_tolerance
@pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)])
@pytest.mark.parametrize("n_images", [1, 2])
@pytest.mark.parametrize("font_scale", [1, 1.5, 2])
@pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX])
@pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"]))

View File

@ -1,6 +0,0 @@
from cv_analysis.config import CONFIG
def test_config():
assert CONFIG.service
assert CONFIG.webserver