Merge branch 'RES-535-update-pyinfra' into 'master'

feat(opentel,dynaconf): adapt new pyinfra

Closes RES-535

See merge request redactmanager/cv-analysis-service!8
This commit is contained in:
Julius Unverfehrt 2024-02-08 12:33:05 +01:00
commit 688217f3cd
11 changed files with 1770 additions and 1050 deletions

44
config/pyinfra.toml Normal file
View File

@@ -0,0 +1,44 @@
[metrics.prometheus]
enabled = true
prefix = "redactmanager_cv_analysis_service"
[tracing.opentelemetry]
enabled = true
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
service_name = "redactmanager_cv_analysis_service"
exporter = "otlp"
[webserver]
host = "0.0.0.0"
port = 8080
[rabbitmq]
host = "localhost"
port = 5672
username = ""
password = ""
heartbeat = 60
# Has to be a divisor of heartbeat, and shouldn't be too large, since queue interactions (like receiving new messages) only happen at these intervals
# This is also the minimum time the service needs to process a message
connection_sleep = 5
input_queue = "request_queue"
output_queue = "response_queue"
dead_letter_queue = "dead_letter_queue"
[storage]
backend = "s3"
[storage.s3]
bucket = "redaction"
endpoint = "http://127.0.0.1:9000"
key = ""
secret = ""
region = "eu-central-1"
[storage.azure]
container = "redaction"
connection_string = ""
[storage.tenant_server]
public_key = ""
endpoint = "http://tenant-user-management:8081/internal-api/tenants"

19
config/settings.toml Normal file
View File

@@ -0,0 +1,19 @@
[logging]
level = "INFO"
visual_logging_level = "DISABLED"
visual_logging_output_folder = "/tmp/debug"
[table_parsing]
skip_pages_without_images = true
[paths]
root = "@format {env[ROOT_PATH]}"
dvc_data_dir = "${paths.root}/data"
pdf_for_testing = "${paths.dvc_data_dir}/pdfs_for_testing"
png_for_testing = "${paths.dvc_data_dir}/pngs_for_testing"
png_figures_detected = "${paths.png_for_testing}/figures_detected"
png_tables_detected = "${paths.png_for_testing}/tables_detected_by_tp"
hashed_pdfs_for_testing = "${paths.pdf_for_testing}/hashed"
metadata_test_files = "${paths.dvc_data_dir}/metadata_testing_files.csv"
test_dir = "${paths.dvc_data_dir}/test"
test_data_dir = "${paths.dvc_data_dir}/test/test_data"

View File

@@ -1,31 +1,9 @@
import os
from pathlib import Path
from pyinfra.config.loader import load_settings
def get_config():
return Config()
class Config:
def __init__(self):
self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
self.table_parsing_skip_pages_without_images = os.environ.get("TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True)
# visual_logging_level: NOTHING > INFO > DEBUG > ALL
self.visual_logging_level = "DISABLED"
self.visual_logging_output_folder = "/tmp/debug"
# locations
# FIXME: is everything here necessary?
root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
self.dvc_data_dir = os.path.join(root, "data")
self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
self.test_dir = os.path.join(root, "test")
self.test_data_dir = os.path.join(self.test_dir, "test_data")
def __getitem__(self, key):
return self.__getattribute__(key)
local_root_path = Path(__file__).parents[1]
settings = load_settings(root_path=local_root_path, settings_path="config")
return settings

View File

@@ -1,3 +1,4 @@
import sys
from dataclasses import asdict
from operator import truth
@@ -11,7 +12,7 @@ from pdf2img.default_objects.image import ImagePlus, ImageInfo
from pdf2img.default_objects.rectangle import RectanglePlus
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images):
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=True):
if operation == "table":
return make_analysis_pipeline(
parse_tables,

View File

@@ -1,9 +1,10 @@
import os
from pyinfra.config.loader import load_settings
from cv_analysis.config import get_config
from cv_analysis.utils.display import save_image
CV_CONFIG = get_config()
settings = get_config()
class VisualLogger:
@@ -39,4 +40,4 @@ class VisualLogger:
return self.level == "ALL"
vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)
vizlogger = VisualLogger(settings.logging.visual_logging_level, settings.logging.visual_logging_output_folder)

2655
poetry.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -1,6 +1,6 @@
[tool.poetry]
name = "cv-analysis-service"
version = "1.23.0"
version = "2.0.0"
description = ""
authors = []
readme = "README.md"
@@ -25,7 +25,7 @@ coverage = "^5.5"
dependency-check = "^0.6.0"
lorem-text = "^2.1"
PyMuPDF = "^1.19.6"
pyinfra = { version = "1.10.0", source = "gitlab-research" }
pyinfra = { version = "^2.0.0", source = "gitlab-research" }
kn-utils = { version = "0.2.7", source = "gitlab-research" }
pdf2img = { version = "0.7.0", source = "gitlab-red" }
dvc-azure = "^2.21.2"
@@ -34,6 +34,10 @@ dvc-azure = "^2.21.2"
pytest = "^7.0.1"
pylint = "^2.17.4"
[tool.poetry.group.dev.dependencies]
ipython = "^8.21.0"
[tool.pytest.ini_options]
testpaths = ["test"]
addopts = "--ignore=data"

View File

@@ -9,7 +9,7 @@ from pdf2image import convert_from_path
from cv_analysis.config import get_config
CV_CONFIG = get_config()
settings = get_config()
def read_json(path):
@@ -26,7 +26,7 @@ def collect_metadata(example_pages, save=False):
metadata = list(chain.from_iterable(metadata))
if save:
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv"))
df.to_csv(path.join(settings.paths.dvc_data_dir, "metadata_testing_files.csv"))
else:
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
@@ -50,7 +50,7 @@ def make_metadata_entry_maker():
def split_pdf(example_pages):
dir_path = CV_CONFIG.pdf_for_testing
dir_path = settings.paths.pdf_for_testing
i = 0
for name, document_sections in example_pages.items():
for pages in document_sections:
@@ -58,7 +58,7 @@ def split_pdf(example_pages):
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
)
for image in images:
fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png")
fp = path.join(settings.paths.png_for_testing, f"fig_table{i:0>3}.png")
image.save(fp=fp, dpi=(300, 300))
i += 1
@@ -78,7 +78,7 @@ def find_hash(file_path):
def rename_files_with_hash(example_pages):
files_to_rename = list(example_pages.keys())
folder = CV_CONFIG.hashed_pdfs_for_testing
folder = settings.paths.hashed_pdfs_for_testing
# Iterate through the folder
for file in os.listdir(folder):
@@ -103,7 +103,7 @@ def rename_files_with_hash(example_pages):
def main():
examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json"))
examples_pages = read_json(path.join(settings.paths.test_data_dir, "example_pages.json"))
rename_files_with_hash(examples_pages)
# collect_metadata(examples_pages, save=True)
# split_pdf(examples_pages)

View File

@@ -1,25 +1,26 @@
import logging
from sys import stdout
from pyinfra.payload_processing.processor import make_payload_processor
import IPython
from kn_utils.logging import logger
from pyinfra.examples import start_standard_queue_consumer
from pyinfra.queue.callback import make_download_process_upload_callback
from cv_analysis.config import get_config
from cv_analysis.server.pipeline import get_analysis_pipeline
from pyinfra import config as pyinfra_config
from pyinfra.queue.queue_manager import QueueManager
from cv_analysis.utils.banner import make_art
PYINFRA_CONFIG = pyinfra_config.get_config()
CV_CONFIG = get_config()
settings = get_config()
logger = logging.getLogger()
logger.setLevel(PYINFRA_CONFIG.logging_level_root)
logger.remove()
logger.add(sink=stdout, level=settings.logging.level)
def make_dispatched_data_analysis(config):
skip_pages_without_images = config.table_parsing_skip_pages_without_images
skip_pages_without_images = config.table_parsing.skip_pages_without_images
def inner(data: bytes, operation) -> list:
def inner(data: bytes, message: dict) -> list:
operation = message["operation"]
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
return list(analyse(data))
@@ -29,11 +30,9 @@ def make_dispatched_data_analysis(config):
def main():
logger.info(make_art())
process_data = make_dispatched_data_analysis(config=CV_CONFIG)
process_payload = make_payload_processor(process_data, config=PYINFRA_CONFIG)
queue_manager = QueueManager(PYINFRA_CONFIG)
queue_manager.start_consuming(process_payload)
process = make_dispatched_data_analysis(settings)
callback = make_download_process_upload_callback(process, settings)
start_standard_queue_consumer(callback, settings)
if __name__ == "__main__":

View File

@@ -5,7 +5,7 @@ import cv2
import pytest
from dvc.repo import Repo
from funcy import first
from loguru import logger
from kn_utils.logging import logger
from cv_analysis.config import get_config
from cv_analysis.locations import REPO_ROOT_PATH, TEST_DATA_DVC
@@ -13,12 +13,12 @@ from cv_analysis.utils.draw import draw_rectangles
from cv_analysis.utils.open_pdf import open_pdf
from test.fixtures.figure_detection import paste_text
CV_CONFIG = get_config()
settings = get_config()
@pytest.fixture
def client_page_with_table(test_file_index, dvc_test_data):
img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png")
img_path = join(settings.paths.test_data_dir, f"test{test_file_index}.png")
return first(open_pdf(img_path))
@@ -32,7 +32,7 @@ def dvc_test_data():
@pytest.fixture
def expected_table_annotation(test_file_index):
json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
json_path = join(settings.paths.test_data_dir, f"test{test_file_index}.json")
with open(json_path) as f:
return json.load(f)

View File

@@ -7,6 +7,7 @@ from cv_analysis.table_parsing import parse_tables
from cv_analysis.utils.test_metrics import compute_document_score
@pytest.mark.xfail(reason="Azure Connection String is not set and cannot be found. Where is it hiding?")
@pytest.mark.parametrize("score_threshold", [0.95])
@pytest.mark.parametrize("test_file_index", range(1, 11))
def test_table_parsing_on_client_pages(