From 9d98945ff973a552c12214240f4d49e91d1204d3 Mon Sep 17 00:00:00 2001 From: Isaac Riley Date: Wed, 27 Jul 2022 10:50:10 +0200 Subject: [PATCH] Pull request #20: New pyinfra Merge in RR/cv-analysis from new_pyinfra to master Squashed commit of the following: commit f7a01a90aad1c402ac537de5bdf15df628ad54df Author: Julius Unverfehrt Date: Wed Jul 27 10:40:59 2022 +0200 fix typo commit ff4d549fac5b612c2d391ae85823c5eca1e91916 Author: Julius Unverfehrt Date: Wed Jul 27 10:34:04 2022 +0200 adjust build scripts for new pyinfra commit ecd70f60d46406d8b6cc7f36a1533d706c917ca8 Author: Julius Unverfehrt Date: Wed Jul 27 09:42:55 2022 +0200 simplify logging by using default configurations commit 20193c14c940eed2b0a7a72058167e26064119d0 Author: Julius Unverfehrt Date: Tue Jul 26 17:16:57 2022 +0200 tidy-up, refactor config logic to not dependent on external files commit d8069cd4d404a570bb04a04278161669d1c83332 Author: Isaac Riley Date: Tue Jul 26 15:14:59 2022 +0200 update pyinfra commit c3bc11037cca9baf016043ab997c566f5b4a2586 Author: Isaac Riley Date: Tue Jul 26 15:09:14 2022 +0200 repair tests commit 6f4e4f2863ee16ae056c1d432f663858c5f10221 Author: Isaac Riley Date: Tue Jul 26 14:52:38 2022 +0200 updated server logic to work with new pyinfra; update scripts for pyinfra as submodule commit 2a18dba81de5ee84d0bdf0e77f478693e8d8aef4 Author: Isaac Riley Date: Tue Jul 26 14:10:41 2022 +0200 formatting commit d87ce9328de9aa2341228af9b24473d5e583504e Author: Isaac Riley Date: Tue Jul 26 14:10:11 2022 +0200 make server logic compatible with new pyinfra --- Dockerfile | 29 ++++--- Dockerfile_base | 31 ------- .../src/main/java/buildjob/PlanSpec.java | 19 ++--- .../main/resources/scripts/docker-build.sh | 6 +- .../src/main/resources/scripts/sonar-scan.sh | 11 ++- config.yaml | 12 --- cv_analysis/config.py | 53 +++++------- cv_analysis/locations.py | 22 ----- cv_analysis/redaction_detection.py | 1 - cv_analysis/server/pipeline.py | 9 +- cv_analysis/table_parsing.py | 5 +- cv_analysis/utils/display.py | 1 - cv_analysis/utils/logging.py | 26 ------ cv_analysis/utils/pdf2image.py | 5 +- cv_analysis/utils/visual_logging.py | 7 +- docker-compose.yaml | 31 +++++++ incl/pyinfra | 2 +- pytest.ini | 2 + scripts/export_example_pages.py | 20 +++-- scripts/manage_minio.py | 68 +++++++++++++++ scripts/publish_requests.py | 84 +++++++++++++++++++ scripts/pyinfra_mock.py | 64 -------------- scripts/show_compressed_json.py | 24 ++++++ setup/docker.sh | 13 --- src/serve.py | 51 ++++++----- test/fixtures/figure_detection.py | 8 +- test/fixtures/server.py | 3 +- test/fixtures/table_parsing.py | 9 +- .../figure_detection_pipeline_test.py | 5 +- test/unit_tests/figure_detection/text_test.py | 3 - test/unit_tests/parse_configuration_test.py | 6 -- 31 files changed, 329 insertions(+), 301 deletions(-) delete mode 100644 Dockerfile_base delete mode 100644 config.yaml delete mode 100644 cv_analysis/locations.py delete mode 100644 cv_analysis/utils/logging.py create mode 100644 docker-compose.yaml create mode 100644 scripts/manage_minio.py create mode 100644 scripts/publish_requests.py delete mode 100644 scripts/pyinfra_mock.py create mode 100644 scripts/show_compressed_json.py delete mode 100644 setup/docker.sh delete mode 100644 test/unit_tests/parse_configuration_test.py diff --git a/Dockerfile b/Dockerfile index 4fa9003..349b83d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,20 +1,25 @@ -ARG BASE_ROOT="nexus.iqser.com:5001/red/" -ARG VERSION_TAG=latest +FROM python:3.10 -FROM ${BASE_ROOT}cv-analysis-base:${VERSION_TAG} +RUN python -m venv /app/venv +ENV PATH="/app/venv/bin:$PATH" + +RUN python -m pip install --upgrade pip WORKDIR /app/service +COPY ./requirements.txt ./requirements.txt +RUN python3 -m pip install -r requirements.txt + +COPY ./incl/pyinfra/requirements.txt ./incl/pyinfra/requirements.txt +RUN python -m pip install -r incl/pyinfra/requirements.txt + +COPY ./incl ./incl +RUN python3 -m pip install -e incl/pyinfra + COPY ./src ./src -COPY cv_analysis ./cv_analysis -COPY config.yaml ./config.yaml +COPY ./cv_analysis ./cv_analysis +COPY ./setup.py ./setup.py -RUN python3 -m pip install --upgrade pip RUN python3 -m pip install -e . -WORKDIR /app/service - -EXPOSE 5000 -EXPOSE 8080 - -CMD ["python3", "src/serve.py"] \ No newline at end of file +CMD ["python3", "-u", "src/serve.py"] \ No newline at end of file diff --git a/Dockerfile_base b/Dockerfile_base deleted file mode 100644 index 0838921..0000000 --- a/Dockerfile_base +++ /dev/null @@ -1,31 +0,0 @@ -FROM python:3.10 as builder1 - -# Use a virtual environment. -RUN python -m venv /app/venv -ENV PATH="/app/venv/bin:$PATH" - -# Upgrade pip. -RUN python -m pip install --upgrade pip - -# Make a directory for the service files and copy the service repo into the container. -WORKDIR /app/service -COPY . ./ - -# Install dependencies. -RUN python3 -m pip install -r requirements.txt -RUN python3 -m pip install -r incl/pyinfra/requirements.txt -RUN python3 -m pip install -e incl/pyinfra - -# Make a new container and copy all relevant files over to filter out temporary files -# produced during setup to reduce the final container's size. -FROM python:3.10 - -WORKDIR /app/ -COPY --from=builder1 /app . -ENV PATH="/app/venv/bin:$PATH" - -WORKDIR /app/service - -RUN apt update -#RUN apt install python3-opencv-headless -RUN apt install poppler-utils --yes \ No newline at end of file diff --git a/bamboo-specs/src/main/java/buildjob/PlanSpec.java b/bamboo-specs/src/main/java/buildjob/PlanSpec.java index 1dd7270..cb0e5c1 100644 --- a/bamboo-specs/src/main/java/buildjob/PlanSpec.java +++ b/bamboo-specs/src/main/java/buildjob/PlanSpec.java @@ -34,7 +34,6 @@ import com.atlassian.bamboo.specs.model.task.ScriptTaskProperties.Location; public class PlanSpec { private static final String SERVICE_NAME = "cv-analysis"; - private static final String SERVICE_NAME_BASE = "cv-analysis-base"; private static final String SERVICE_KEY = SERVICE_NAME.toUpperCase().replaceAll("-","").replaceAll("_",""); @@ -72,7 +71,7 @@ public class PlanSpec { return new Plan( project(), SERVICE_NAME, new BambooKey(SERVICE_KEY)) - .description("Docker build for cv-analysis.") +// .description("Docker build for cv-analysis.") // .variables() .stages(new Stage("Build Stage") .jobs( @@ -84,9 +83,6 @@ public class PlanSpec { new VcsCheckoutTask() .description("Checkout default repository.") .checkoutItems(new CheckoutItem().defaultRepository()), - new VcsCheckoutTask() - .description("Checkout pyinfra research repository.") - .checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")), new ScriptTask() .description("Set config and keys.") .inlineBody("mkdir -p ~/.ssh\n" + @@ -98,10 +94,10 @@ public class PlanSpec { .description("Build Docker container.") .location(Location.FILE) .fileFromPath("bamboo-specs/src/main/resources/scripts/docker-build.sh") - .argument(SERVICE_NAME + " " + SERVICE_NAME_BASE)) + .argument(SERVICE_NAME)) .dockerConfiguration( new DockerConfiguration() - .image("nexus.iqser.com:5001/infra/release_build:4.2.0") + .image("nexus.iqser.com:5001/infra/release_build:4.5.0") .volume("/var/run/docker.sock", "/var/run/docker.sock")), new Job("Sonar Job", new BambooKey("SONAR")) .tasks( @@ -111,9 +107,6 @@ public class PlanSpec { new VcsCheckoutTask() .description("Checkout default repository.") .checkoutItems(new CheckoutItem().defaultRepository()), - new VcsCheckoutTask() - .description("Checkout pyinfra research repository.") - .checkoutItems(new CheckoutItem().repository("RR / pyinfra").path("pyinfra")), new ScriptTask() .description("Set config and keys.") .inlineBody("mkdir -p ~/.ssh\n" + @@ -134,6 +127,9 @@ public class PlanSpec { .jobs( new Job("Git Tag Job", new BambooKey("GITTAG")) .tasks( + new CleanWorkingDirectoryTask() + .description("Clean working directory.") + .enabled(true), new VcsCheckoutTask() .description("Checkout default repository.") .checkoutItems(new CheckoutItem().defaultRepository()), @@ -152,7 +148,7 @@ public class PlanSpec { .defaultRepository()) .dockerConfiguration( new DockerConfiguration() - .image("nexus.iqser.com:5001/infra/release_build:4.4.1")), + .image("nexus.iqser.com:5001/infra/release_build:4.5.0")), new Job("Licence Job", new BambooKey("LICENCE")) .enabled(false) .tasks( @@ -169,7 +165,6 @@ public class PlanSpec { .volume("/etc/maven/settings.xml", "/usr/share/maven/ref/settings.xml") .volume("/var/run/docker.sock", "/var/run/docker.sock")))) .linkedRepositories("RR / " + SERVICE_NAME) - .linkedRepositories("RR / pyinfra") .triggers(new BitbucketServerTrigger()) .planBranchManagement(new PlanBranchManagement() .createForVcsBranch() diff --git a/bamboo-specs/src/main/resources/scripts/docker-build.sh b/bamboo-specs/src/main/resources/scripts/docker-build.sh index 42874f6..8b6ab98 100755 --- a/bamboo-specs/src/main/resources/scripts/docker-build.sh +++ b/bamboo-specs/src/main/resources/scripts/docker-build.sh @@ -2,8 +2,7 @@ set -e SERVICE_NAME=$1 -SERVICE_NAME_BASE=$2 -# TODO version tag on master push + python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip @@ -13,7 +12,6 @@ pip install 'dvc[ssh]' dvc pull echo "index-url = https://${bamboo_nexus_user}:${bamboo_nexus_password}@nexus.iqser.com/repository/python-combind/simple" >> pip.conf -docker build -f Dockerfile_base -t nexus.iqser.com:5001/red/$SERVICE_NAME_BASE:${bamboo_version_tag} . -docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} . echo "${bamboo_nexus_password}" | docker login --username "${bamboo_nexus_user}" --password-stdin nexus.iqser.com:5001 +docker build -f Dockerfile -t nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} --build-arg VERSION_TAG=${bamboo_version_tag} . docker push nexus.iqser.com:5001/red/$SERVICE_NAME:${bamboo_version_tag} diff --git a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh index fb7a59d..7879f5a 100755 --- a/bamboo-specs/src/main/resources/scripts/sonar-scan.sh +++ b/bamboo-specs/src/main/resources/scripts/sonar-scan.sh @@ -7,17 +7,20 @@ python3 -m venv build_venv source build_venv/bin/activate python3 -m pip install --upgrade pip -pip install -e . -pip install -e incl/pyinfra +echo "dev setup for unit test and coverage" +pip install -e incl/pyinfra pip install -r incl/pyinfra/requirements.txt + +pip install -e . pip install -r requirements.txt + echo "DVC pull step" dvc pull echo "coverage calculation" -coverage run -m pytest test +coverage run -m pytest echo "coverage report generation" coverage report -m coverage xml @@ -28,7 +31,7 @@ echo "dependency-check:aggregate" mkdir -p reports dependency-check --enableExperimental -f JSON -f HTML -f XML \ --disableAssembly -s . -o reports --project $SERVICE_NAME --exclude ".git/**" --exclude "venv/**" \ - --exclude "build_venv/**" --exclude "**/__pycache__/**" --exclude "bamboo-specs/**" + --exclude "build_venv/**" --exclude "**/__pycache__/**" if [[ -z "${bamboo_repository_pr_key}" ]] then diff --git a/config.yaml b/config.yaml deleted file mode 100644 index 356aca8..0000000 --- a/config.yaml +++ /dev/null @@ -1,12 +0,0 @@ -service: - logging_level: $LOGGING_LEVEL_ROOT|DEBUG # Logging level for log file messages - monitoring_enabled: $MONITORING_ENABLED|True # if app is doing monitoring or not - logfile_path: $LOGFILE_PATH|null # Overwrites the default path for the service logfile (image_service/log.log) - -webserver: - host: $SERVER_HOST|"127.0.0.1" # webserver address - port: $SERVER_PORT|5000 # webserver port - -visual_logging: - level: DISABLED # NOTHING > INFO > DEBUG > ALL - output_folder: /tmp/debug/ \ No newline at end of file diff --git a/cv_analysis/config.py b/cv_analysis/config.py index bdb1ebf..550c2ee 100644 --- a/cv_analysis/config.py +++ b/cv_analysis/config.py @@ -1,39 +1,30 @@ -"""Implements a config object with dot-indexing syntax.""" +import os -from envyaml import EnvYAML -from cv_analysis.locations import CONFIG_FILE - - -def _get_item_and_maybe_make_dotindexable(container, item): - ret = container[item] - return DotIndexable(ret) if isinstance(ret, dict) else ret - - -class DotIndexable: - def __init__(self, x): - self.x = x - - def __getattr__(self, item): - return _get_item_and_maybe_make_dotindexable(self.x, item) - - def __setitem__(self, key, value): - self.x[key] = value - - def __repr__(self): - return self.x.__repr__() +def get_config(): + return Config() class Config: - def __init__(self, config_path): - self.__config = EnvYAML(config_path) + def __init__(self): + self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO") - def __getattr__(self, item): - if item in self.__config: - return _get_item_and_maybe_make_dotindexable(self.__config, item) + # visual_logging_level: NOTHING > INFO > DEBUG > ALL + self.visual_logging_level = "DISABLED" + self.visual_logging_output_folder = "/tmp/debug" - def __getitem__(self, item): - return self.__getattr__(item) + # locations + # FIXME: is everything here necessary? + root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + self.dvc_data_dir = os.path.join(root, "data") + self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing") + self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing") + self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected") + self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp") + self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed") + self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv") + self.test_dir = os.path.join(root, "test") + self.test_data_dir = os.path.join(self.test_dir, "test_data") - -CONFIG = Config(CONFIG_FILE) + def __getitem__(self, key): + return self.__getattribute__(key) diff --git a/cv_analysis/locations.py b/cv_analysis/locations.py deleted file mode 100644 index 5e7bf76..0000000 --- a/cv_analysis/locations.py +++ /dev/null @@ -1,22 +0,0 @@ -"""Defines constant paths relative to the module root path.""" - - -from os import path - -MODULE_DIR = path.dirname(path.abspath(__file__)) -PACKAGE_ROOT_DIR = path.dirname(MODULE_DIR) - -CONFIG_FILE = path.join(PACKAGE_ROOT_DIR, "config.yaml") -LOG_FILE = "/tmp/log.log" - -DVC_DATA_DIR = path.join(PACKAGE_ROOT_DIR, "data") -PDF_FOR_TESTING = path.join(DVC_DATA_DIR, "pdfs_for_testing") -PNG_FOR_TESTING = path.join(DVC_DATA_DIR, "pngs_for_testing") -PNG_FIGURES_DETECTED = path.join(PNG_FOR_TESTING, "figures_detected") -PNG_TABLES_DETECTED = path.join(PNG_FOR_TESTING, "tables_detected_by_tp") -HASHED_PDFS_FOR_TESTING = path.join(PDF_FOR_TESTING, "hashed") -METADATA_TESTFILES = path.join(DVC_DATA_DIR, "metadata_testing_files.csv") - - -TEST_DIR = path.join(PACKAGE_ROOT_DIR, "test") -TEST_DATA_DIR = path.join(TEST_DIR, "test_data") diff --git a/cv_analysis/redaction_detection.py b/cv_analysis/redaction_detection.py index b9d40d8..82e8c1f 100644 --- a/cv_analysis/redaction_detection.py +++ b/cv_analysis/redaction_detection.py @@ -2,7 +2,6 @@ from functools import partial import cv2 import numpy as np -import pdf2image from iteration_utilities import starfilter, first from cv_analysis.utils.filters import is_large_enough, is_filled, is_boxy diff --git a/cv_analysis/server/pipeline.py b/cv_analysis/server/pipeline.py index ef890f0..44c346e 100644 --- a/cv_analysis/server/pipeline.py +++ b/cv_analysis/server/pipeline.py @@ -7,23 +7,20 @@ from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_d from cv_analysis.layout_parsing import parse_layout from cv_analysis.server.rotate import rotate_rectangle from cv_analysis.table_parsing import parse_tables -from cv_analysis.utils.logging import get_logger from cv_analysis.utils.pdf2image import pdf_to_image_metadata_pairs from cv_analysis.utils.structures import Rectangle -logger = get_logger() - def make_analysis_pipeline(analysis_fn: Callable, dpi=200): """Make end-to-end pipeline to analyse a PDF with given analysis function. The pipeline returns a Generator of dicts containing page information and the analysis results. Steps: - Convert PDF to Arrays and page information - Analise pages, get list of bboxes per page (e.g. table cells) + Convert PDF to pairs of image and page information + Analyse pages, get list of bounding boxes per page (e.g. table cells) Convert pixel values to inches Rotate results if page is rotated - Format results to stream of dictionaries + Format results to stream of dictionaries with page information and analysis results """ def pipeline(pdf: bytes, index=None): diff --git a/cv_analysis/table_parsing.py b/cv_analysis/table_parsing.py index 9375a0f..b601742 100644 --- a/cv_analysis/table_parsing.py +++ b/cv_analysis/table_parsing.py @@ -1,16 +1,15 @@ from functools import partial from itertools import chain, starmap from operator import attrgetter + import cv2 import numpy as np - from funcy import lmap +from cv_analysis.layout_parsing import parse_layout from cv_analysis.utils.post_processing import xywh_to_vecs, xywh_to_vec_rect, adjacent1d - from cv_analysis.utils.structures import Rectangle from cv_analysis.utils.visual_logging import vizlogger -from cv_analysis.layout_parsing import parse_layout def add_external_contours(image, image_h_w_lines_only): diff --git a/cv_analysis/utils/display.py b/cv_analysis/utils/display.py index f5d9285..0d3f2a6 100644 --- a/cv_analysis/utils/display.py +++ b/cv_analysis/utils/display.py @@ -1,4 +1,3 @@ -from numpy import resize import cv2 from matplotlib import pyplot as plt diff --git a/cv_analysis/utils/logging.py b/cv_analysis/utils/logging.py deleted file mode 100644 index 51be0fb..0000000 --- a/cv_analysis/utils/logging.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Defines the default logger for the service.""" -import sys -import logging - -from cv_analysis.config import CONFIG - - -def make_logger_getter(): - logger = logging.getLogger(__name__) - logger.setLevel(logging.getLevelName(CONFIG.service.logging_level)) - formatter = logging.Formatter(fmt="%(asctime)s %(levelname)s: %(message)s", datefmt="%d.%m.%Y - %H:%M:%S") - - ch = logging.StreamHandler(sys.stdout) - ch.setLevel(logging.getLevelName(CONFIG.service.logging_level)) - ch.setFormatter(formatter) - - logger.addHandler(ch) - logger.propagate = False - - def get_logger(): - return logger - - return get_logger - - -get_logger = make_logger_getter() diff --git a/cv_analysis/utils/pdf2image.py b/cv_analysis/utils/pdf2image.py index b5da78d..a26b003 100644 --- a/cv_analysis/utils/pdf2image.py +++ b/cv_analysis/utils/pdf2image.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from functools import partial -from typing import Iterator, Tuple +from typing import Iterator import fitz import numpy as np @@ -14,7 +14,8 @@ class ImageMetadataPair: def pdf_to_image_metadata_pairs(pdf: bytes, index=None, dpi=200) -> Iterator[ImageMetadataPair]: """Streams PDF as pairs of image (matrix) and metadata. - Note: If Index is not given or evaluates to None, the whole PDF will be processed.""" + Note: If Index is not given or evaluates to None, the whole PDF will be processed. + """ convert_fn = partial(page_to_image_metadata_pair, dpi=dpi) yield from map(convert_fn, stream_pages(pdf, index)) diff --git a/cv_analysis/utils/visual_logging.py b/cv_analysis/utils/visual_logging.py index e088dbe..f6e2fa1 100644 --- a/cv_analysis/utils/visual_logging.py +++ b/cv_analysis/utils/visual_logging.py @@ -1,7 +1,10 @@ import os -from cv_analysis.config import CONFIG + +from cv_analysis.config import get_config from cv_analysis.utils.display import save_image +CV_CONFIG = get_config() + class VisualLogger: def __init__(self, level, output_folder): @@ -36,4 +39,4 @@ class VisualLogger: return self.level == "ALL" -vizlogger = VisualLogger(CONFIG.visual_logging.level, CONFIG.visual_logging.output_folder) +vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder) diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..7155e61 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,31 @@ +version: '2' +services: + minio: + image: minio/minio + ports: + - "9000:9000" + environment: + - MINIO_ROOT_PASSWORD=password + - MINIO_ROOT_USER=root + volumes: + - ./data/minio_store:/data + command: server /data + network_mode: "bridge" + rabbitmq: + image: docker.io/bitnami/rabbitmq:3.9 + ports: + - '4369:4369' + - '5551:5551' + - '5552:5552' + - '5672:5672' + - '25672:25672' + - '15672:15672' + environment: + - RABBITMQ_SECURE_PASSWORD=yes + - RABBITMQ_VM_MEMORY_HIGH_WATERMARK=100% + - RABBITMQ_DISK_FREE_ABSOLUTE_LIMIT=20Gi + network_mode: "bridge" + volumes: + - /opt/bitnami/rabbitmq/.rabbitmq/:/data/bitnami +volumes: + mdata: \ No newline at end of file diff --git a/incl/pyinfra b/incl/pyinfra index 7e948a4..6c26528 160000 --- a/incl/pyinfra +++ b/incl/pyinfra @@ -1 +1 @@ -Subproject commit 7e948a4cf05a3ef59fcc7e8719fcf910adc73864 +Subproject commit 6c2652837a17a29476b11b1acbc35ba8825c2cd9 diff --git a/pytest.ini b/pytest.ini index cd0d17d..3d95845 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,2 +1,4 @@ [pytest] norecursedirs = incl +testpaths = test +addopts = --ignore=data diff --git a/scripts/export_example_pages.py b/scripts/export_example_pages.py index 4fcba37..e0a9472 100644 --- a/scripts/export_example_pages.py +++ b/scripts/export_example_pages.py @@ -1,11 +1,15 @@ import hashlib +import json import os +from itertools import chain from os import path + import pandas as pd from pdf2image import convert_from_path -from itertools import chain -import json -from cv_analysis.locations import PDF_FOR_TESTING, TEST_DATA_DIR, PNG_FOR_TESTING, DVC_DATA_DIR, HASHED_PDFS_FOR_TESTING + +from cv_analysis.config import get_config + +CV_CONFIG = get_config() def read_json(path): @@ -22,7 +26,7 @@ def collect_metadata(example_pages, save=False): metadata = list(chain.from_iterable(metadata)) if save: df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) - df.to_csv(path.join(DVC_DATA_DIR, "metadata_testing_files.csv")) + df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv")) else: return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"]) @@ -46,7 +50,7 @@ def make_metadata_entry_maker(): def split_pdf(example_pages): - dir_path = PDF_FOR_TESTING + dir_path = CV_CONFIG.pdf_for_testing i = 0 for name, document_sections in example_pages.items(): for pages in document_sections: @@ -54,7 +58,7 @@ def split_pdf(example_pages): pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1] ) for image in images: - fp = path.join(PNG_FOR_TESTING, f"fig_table{i:0>3}.png") + fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png") image.save(fp=fp, dpi=(300, 300)) i += 1 @@ -74,7 +78,7 @@ def find_hash(file_path): def rename_files_with_hash(example_pages): files_to_rename = list(example_pages.keys()) - folder = HASHED_PDFS_FOR_TESTING + folder = CV_CONFIG.hashed_pdfs_for_testing # Iterate through the folder for file in os.listdir(folder): @@ -99,7 +103,7 @@ def rename_files_with_hash(example_pages): def main(): - examples_pages = read_json(path.join(TEST_DATA_DIR, "example_pages.json")) + examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json")) rename_files_with_hash(examples_pages) # collect_metadata(examples_pages, save=True) # split_pdf(examples_pages) diff --git a/scripts/manage_minio.py b/scripts/manage_minio.py new file mode 100644 index 0000000..d101cff --- /dev/null +++ b/scripts/manage_minio.py @@ -0,0 +1,68 @@ +import argparse +import gzip +import os +from pathlib import Path + +from tqdm import tqdm + +from pyinfra.config import get_config +from pyinfra.storage.storage import get_s3_storage + +CONFIG = get_config() + + +def parse_args(): + parser = argparse.ArgumentParser() + + subparsers = parser.add_subparsers(help="sub-command help", dest="command") + + parser_add = subparsers.add_parser("add", help="Add file(s) to the MinIO store") + parser_add.add_argument("dossier_id") + add_group = parser_add.add_mutually_exclusive_group(required=True) + add_group.add_argument("--file", "-f") + add_group.add_argument("--directory", "-d") + + subparsers.add_parser("purge", help="Delete all files and buckets in the MinIO store") + + args = parser.parse_args() + return args + + +def combine_dossier_id_and_file_id_and_extension(dossier_id, file_id, extension): + return f"{dossier_id}/{file_id}{extension}" + + +def add_file_compressed(storage, bucket_name, dossier_id, path) -> None: + if Path(path).suffix == ".pdf": + suffix_gz = ".ORIGIN.pdf.gz" + if Path(path).suffix == ".json": + suffix_gz = ".TEXT.json.gz" + path_gz = combine_dossier_id_and_file_id_and_extension(dossier_id, Path(path).stem, suffix_gz) + + with open(path, "rb") as f: + data = gzip.compress(f.read()) + storage.put_object(bucket_name, path_gz, data) + + +if __name__ == "__main__": + + storage = get_s3_storage(CONFIG) + bucket_name = CONFIG.storage_bucket + + if not storage.has_bucket(bucket_name): + storage.make_bucket(bucket_name) + + args = parse_args() + + if args.command == "add": + + if args.file: + add_file_compressed(storage, bucket_name, args.dossier_id, args.file) + + elif args.directory: + for fname in tqdm([*os.listdir(args.directory)], desc="Adding files"): + path = Path(args.directory) / fname + add_file_compressed(storage, bucket_name, args.dossier_id, path) + + elif args.command == "purge": + storage.clear_bucket(bucket_name) diff --git a/scripts/publish_requests.py b/scripts/publish_requests.py new file mode 100644 index 0000000..08f4af7 --- /dev/null +++ b/scripts/publish_requests.py @@ -0,0 +1,84 @@ +import argparse +import json + +import pika + +from pyinfra.config import get_config +from pyinfra.storage.storage import get_s3_storage + +CONFIG = get_config() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--operation", "-o", choices=["table", "layout", "figure"], required=True) + args = parser.parse_args() + return args + + +def read_connection_params(): + credentials = pika.PlainCredentials(CONFIG.rabbitmq_username, CONFIG.rabbitmq_password) + parameters = pika.ConnectionParameters( + host=CONFIG.rabbitmq_host, + port=CONFIG.rabbitmq_port, + heartbeat=int(CONFIG.rabbitmq_heartbeat), + credentials=credentials, + ) + return parameters + + +def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel: + channel = connection.channel() + channel.basic_qos(prefetch_count=1) + return channel + + +def declare_queue(channel, queue: str): + args = {"x-dead-letter-exchange": "", "x-dead-letter-routing-key": CONFIG.dead_letter_queue} + return channel.queue_declare(queue=queue, auto_delete=False, durable=True, arguments=args) + + +def make_connection() -> pika.BlockingConnection: + parameters = read_connection_params() + connection = pika.BlockingConnection(parameters) + return connection + + +def build_message_bodies(operation, bucket_name): + + storage = get_s3_storage(CONFIG) + for bucket_name, pdf_name in storage.get_all_object_names(bucket_name): + if "pdf" not in pdf_name: + continue + file_id = pdf_name.split(".")[0] + dossier_id, file_id = file_id.split("/") + message_dict = { + "dossierId": dossier_id, + "fileId": file_id, + "targetFileExtension": "ORIGIN.pdf.gz", + "responseFileExtension": f"{operation.upper()}.json.gz", + "operation": operation, + } + yield json.dumps(message_dict).encode() + + +def main(args): + connection = make_connection() + channel = make_channel(connection) + declare_queue(channel, CONFIG.request_queue) + declare_queue(channel, CONFIG.response_queue) + + for body in build_message_bodies(args.operation, CONFIG.storage_bucket): + channel.basic_publish("", CONFIG.request_queue, body) + print(f"Put {body} on {CONFIG.request_queue}") + + for method_frame, _, body in channel.consume(queue=CONFIG.response_queue, inactivity_timeout=1): + if not body: + break + print(f"Received {json.loads(body)}") + channel.basic_ack(method_frame.delivery_tag) + channel.close() + + +if __name__ == "__main__": + main(parse_args()) diff --git a/scripts/pyinfra_mock.py b/scripts/pyinfra_mock.py deleted file mode 100644 index 6d45b4d..0000000 --- a/scripts/pyinfra_mock.py +++ /dev/null @@ -1,64 +0,0 @@ -import argparse -import gzip -from operator import itemgetter -from typing import List - -import fitz -import pdf2image -from funcy import lmap, compose, pluck - -from pyinfra.default_objects import get_component_factory - -from cv_analysis.config import CONFIG -from incl.pyinfra.test.utils.image import image_to_bytes - - -def parse_args(): - parser = argparse.ArgumentParser() - parser.add_argument("--pdf_path", "-p", required=True) - parser.add_argument("--operation", "-o", choices=["figure_detection", "table_parsing"], required=True) - parser.add_argument("--result_path", "-r", required=True) - args = parser.parse_args() - - return args - - -def request_metadatas(dpi, n_metadata): - return [{"dpi": dpi} for _ in range(1, n_metadata)] - - -def draw_cells_on_page(cells: List[dict], page): - def format_xywh_to_x0y0x1y1(rect): - x, y, w, h = rect - return x, y, x + w, y + h - - rects = map(itemgetter("x", "y", "width", "height"), cells) - rects = map(format_xywh_to_x0y0x1y1, rects) - - for rect in rects: - page.draw_rect(rect, color=(0.3, 0.7, 0.1), width=2, overlay=True) - - -def annotate_results_on_pdf(results, pdf_path, result_path): - opened_pdf = fitz.open(pdf_path) - metadata_per_page = pluck("metadata", results) - - for page, metadata in zip(opened_pdf, metadata_per_page): - if metadata: - draw_cells_on_page(metadata["cells"], page) - opened_pdf.save(result_path) - - -def main(args): - dpi = 200 - images = lmap(compose(gzip.compress, image_to_bytes), pdf2image.convert_from_path(args.pdf_path, dpi=dpi)) - - submit_endpoint = f"http://{CONFIG.webserver.host}:{CONFIG.webserver.port}/{args.operation}" - pipeline = get_component_factory(CONFIG).get_pipeline(submit_endpoint) - results = list(pipeline(data=images, metadata=request_metadatas(dpi, len(images)))) - - annotate_results_on_pdf(results, args.pdf_path, args.result_path) - - -if __name__ == "__main__": - main(parse_args()) diff --git a/scripts/show_compressed_json.py b/scripts/show_compressed_json.py new file mode 100644 index 0000000..73debd7 --- /dev/null +++ b/scripts/show_compressed_json.py @@ -0,0 +1,24 @@ +import argparse +import gzip +import json + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("compressed_json_path", help="Path to compressed JSON file") + return parser.parse_args() + + +def main(fp): + with open(fp, "rb") as f: + compressed_json_path = f.read() + + json_str = gzip.decompress(compressed_json_path) + parsed = json.loads(json_str) + + print(json.dumps(parsed, indent=2)) + + +if __name__ == "__main__": + args = parse_args() + main(args.compressed_json_path) diff --git a/setup/docker.sh b/setup/docker.sh deleted file mode 100644 index b7da059..0000000 --- a/setup/docker.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -set -e - -python3 -m venv build_venv -source build_venv/bin/activate -python3 -m pip install --upgrade pip - -#pip install dvc -#pip install 'dvc[ssh]' -#dvc pull - -docker build -f Dockerfile_base -t cv-analysis-base . -docker build -f Dockerfile -t cv-analysis . \ No newline at end of file diff --git a/src/serve.py b/src/serve.py index 1f55df1..812202f 100644 --- a/src/serve.py +++ b/src/serve.py @@ -1,35 +1,42 @@ +import gzip +import json import logging +from operator import itemgetter -from waitress import serve - -from cv_analysis.config import CONFIG -from cv_analysis.figure_detection.figure_detection_pipeline import make_figure_detection_pipeline -from cv_analysis.server.stream import make_streamable_analysis_fn -from cv_analysis.table_parsing import parse_tables +from cv_analysis.server.pipeline import get_analysis_fn, make_analysis_pipeline from cv_analysis.utils.banner import make_art -from cv_analysis.utils.logging import get_logger -from incl.pyinfra.pyinfra.server.server import set_up_processing_server +from pyinfra import config as pyinfra_config +from pyinfra.queue.queue_manager import QueueManager +from pyinfra.storage.storage import get_storage + +PYINFRA_CONFIG = pyinfra_config.get_config() + +logging.basicConfig(level=PYINFRA_CONFIG.logging_level_root) -def main(): - logger.info(make_art()) +def analysis_callback(queue_message: dict): - operation2function = {"table_parsing": parse_tables, "figure_detection": make_figure_detection_pipeline()} - operation2streamable_function = {op: make_streamable_analysis_fn(fn) for op, fn in operation2function.items()} + dossier_id, file_id, target_file_ext, response_file_ext, operation = itemgetter( + "dossierId", "fileId", "targetFileExtension", "responseFileExtension", "operation" + )(queue_message) + logging.info(f"Processing {dossier_id=}/{file_id=}, {operation=}.") + storage = get_storage(PYINFRA_CONFIG) + object_name = f"{dossier_id}/{file_id}.{target_file_ext}" + object_bytes = gzip.decompress(storage.get_object(PYINFRA_CONFIG.storage_bucket, object_name)) + analysis_fn = make_analysis_pipeline(get_analysis_fn(operation)) - server = set_up_processing_server(operation2streamable_function) + results = analysis_fn(object_bytes) + response = {**queue_message, "data": list(results)} + response = gzip.compress(json.dumps(response).encode()) + response_name = f"{dossier_id}/{file_id}.{response_file_ext}" - serve(server, host=CONFIG.webserver.host, port=CONFIG.webserver.port, _quiet=False) + storage.put_object(PYINFRA_CONFIG.storage_bucket, response_name, response) + return {"dossierId": dossier_id, "fileId": file_id} if __name__ == "__main__": - logging.basicConfig(level=CONFIG.service.logging_level) - logging.getLogger("pillow").setLevel(logging.ERROR) - logging.getLogger("PIL").setLevel(logging.ERROR) - logging.getLogger("flask").setLevel(logging.ERROR) - logging.getLogger("urllib3").setLevel(logging.ERROR) + logging.info(make_art()) - logger = get_logger() - - main() + queue_manager = QueueManager(PYINFRA_CONFIG) + queue_manager.start_consuming(analysis_callback) diff --git a/test/fixtures/figure_detection.py b/test/fixtures/figure_detection.py index 866dfc0..3b6b341 100644 --- a/test/fixtures/figure_detection.py +++ b/test/fixtures/figure_detection.py @@ -8,15 +8,11 @@ from lorem_text import lorem from cv_analysis.figure_detection.figure_detection_pipeline import ( make_figure_detection_pipeline, ) -from cv_analysis.utils.display import show_image @pytest.fixture -def page_with_images(random_image, n_images, background): - # page_image = Image.fromarray(background.astype("uint8")).convert("RGB") - page_image = paste_image(page_image, random_image, (200, 200)) - if n_images == 2: - page_image = paste_image(page_image, random_image, (1000, 2600)) +def page_with_images(random_image, background): + page_image = paste_image(background, random_image, (200, 200)) return np.array(page_image) diff --git a/test/fixtures/server.py b/test/fixtures/server.py index 0ecec7e..99825e4 100644 --- a/test/fixtures/server.py +++ b/test/fixtures/server.py @@ -7,7 +7,6 @@ import pytest from funcy import first from cv_analysis.utils.structures import Rectangle -from incl.pyinfra.pyinfra.server.packing import bytes_to_string @pytest.fixture @@ -20,7 +19,7 @@ def random_image_as_bytes_and_compressed(random_image): @pytest.fixture def random_image_metadata_package(random_image_as_bytes_and_compressed): - data = bytes_to_string(random_image_as_bytes_and_compressed) + data = random_image_as_bytes_and_compressed.decode() return [ { "data": data, diff --git a/test/fixtures/table_parsing.py b/test/fixtures/table_parsing.py index ccd5207..eed65f4 100644 --- a/test/fixtures/table_parsing.py +++ b/test/fixtures/table_parsing.py @@ -1,24 +1,27 @@ import json from os.path import join + import cv2 import pytest from funcy import first -from cv_analysis.locations import TEST_DATA_DIR +from cv_analysis.config import get_config from cv_analysis.utils.draw import draw_rectangles from cv_analysis.utils.open_pdf import open_pdf from test.fixtures.figure_detection import paste_text +CV_CONFIG = get_config() + @pytest.fixture def client_page_with_table(test_file_index): - img_path = join(TEST_DATA_DIR, f"test{test_file_index}.png") + img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png") return first(open_pdf(img_path)) @pytest.fixture def expected_table_annotation(test_file_index): - json_path = join(TEST_DATA_DIR, f"test{test_file_index}.json") + json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json") with open(json_path) as f: return json.load(f) diff --git a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py index 95747ad..af0fbab 100644 --- a/test/unit_tests/figure_detection/figure_detection_pipeline_test.py +++ b/test/unit_tests/figure_detection/figure_detection_pipeline_test.py @@ -14,7 +14,6 @@ class TestFindPrimaryTextRegions: assert not list(results) @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1]) def test_page_without_text_yields_figures(self, figure_detection_pipeline, page_with_images, image_size): results = figure_detection_pipeline(page_with_images) result_figures_size = map(lambda x: (x.w, x.h), results) @@ -35,7 +34,6 @@ class TestFindPrimaryTextRegions: assert error <= error_tolerance @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1, 2]) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) @@ -45,13 +43,12 @@ class TestFindPrimaryTextRegions: figure_detection_pipeline, page_with_images_and_text, image_size, - n_images, error_tolerance, ): results = list(figure_detection_pipeline(page_with_images_and_text)) result_figures_area = sum(map(lambda x: (x.w * x.h), results)) - expected_figure_area = n_images * prod(image_size) + expected_figure_area = prod(image_size) error = abs(result_figures_area - expected_figure_area) / expected_figure_area diff --git a/test/unit_tests/figure_detection/text_test.py b/test/unit_tests/figure_detection/text_test.py index 794763b..edca7f4 100644 --- a/test/unit_tests/figure_detection/text_test.py +++ b/test/unit_tests/figure_detection/text_test.py @@ -6,7 +6,6 @@ from cv_analysis.figure_detection.text import ( remove_primary_text_regions, apply_threshold_to_image, ) -from cv_analysis.utils.display import show_image from test.utils.utils import powerset @@ -19,7 +18,6 @@ class TestFindPrimaryTextRegions: np.testing.assert_equal(result_page, apply_threshold_to_image(background)) @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1, 2]) def test_page_without_text_keeps_images(self, page_with_images, error_tolerance): result_page = remove_primary_text_regions(page_with_images) np.testing.assert_equal(result_page, apply_threshold_to_image(page_with_images)) @@ -33,7 +31,6 @@ class TestFindPrimaryTextRegions: assert relative_error <= error_tolerance @pytest.mark.parametrize("image_size", [(200, 200), (500, 500), (800, 800)]) - @pytest.mark.parametrize("n_images", [1, 2]) @pytest.mark.parametrize("font_scale", [1, 1.5, 2]) @pytest.mark.parametrize("font_style", [cv2.FONT_HERSHEY_SIMPLEX, cv2.FONT_HERSHEY_COMPLEX]) @pytest.mark.parametrize("text_types", powerset(["body", "header", "caption"])) diff --git a/test/unit_tests/parse_configuration_test.py b/test/unit_tests/parse_configuration_test.py deleted file mode 100644 index 049d4f7..0000000 --- a/test/unit_tests/parse_configuration_test.py +++ /dev/null @@ -1,6 +0,0 @@ -from cv_analysis.config import CONFIG - - -def test_config(): - assert CONFIG.service - assert CONFIG.webserver