Merge branch 'RES-535-update-pyinfra' into 'master'
feat(opentel,dynaconf): adapt new pyinfra. Closes RES-535. See merge request redactmanager/cv-analysis-service!8
This commit is contained in:
commit
688217f3cd
44
config/pyinfra.toml
Normal file
44
config/pyinfra.toml
Normal file
@ -0,0 +1,44 @@
|
||||
[metrics.prometheus]
|
||||
enabled = true
|
||||
prefix = "redactmanager_cv_analysis_service"
|
||||
|
||||
[tracing.opentelemetry]
|
||||
enabled = true
|
||||
endpoint = "http://otel-collector-opentelemetry-collector.otel-collector:4318/v1/traces"
|
||||
service_name = "redactmanager_cv_analysis_service"
|
||||
exporter = "otlp"
|
||||
|
||||
[webserver]
|
||||
host = "0.0.0.0"
|
||||
port = 8080
|
||||
|
||||
[rabbitmq]
|
||||
host = "localhost"
|
||||
port = 5672
|
||||
username = ""
|
||||
password = ""
|
||||
heartbeat = 60
|
||||
# Has to be a divisor of heartbeat, and shouldn't be too big, since queue interactions (like receiving new messages) only happen at these intervals
|
||||
# This is also the minimum time the service needs to process a message
|
||||
connection_sleep = 5
|
||||
input_queue = "request_queue"
|
||||
output_queue = "response_queue"
|
||||
dead_letter_queue = "dead_letter_queue"
|
||||
|
||||
[storage]
|
||||
backend = "s3"
|
||||
|
||||
[storage.s3]
|
||||
bucket = "redaction"
|
||||
endpoint = "http://127.0.0.1:9000"
|
||||
key = ""
|
||||
secret = ""
|
||||
region = "eu-central-1"
|
||||
|
||||
[storage.azure]
|
||||
container = "redaction"
|
||||
connection_string = ""
|
||||
|
||||
[storage.tenant_server]
|
||||
public_key = ""
|
||||
endpoint = "http://tenant-user-management:8081/internal-api/tenants"
|
||||
19
config/settings.toml
Normal file
19
config/settings.toml
Normal file
@ -0,0 +1,19 @@
|
||||
[logging]
|
||||
level = "INFO"
|
||||
visual_logging_level = "DISABLED"
|
||||
visual_logging_output_folder = "/tmp/debug"
|
||||
|
||||
[table_parsing]
|
||||
skip_pages_without_images = true
|
||||
|
||||
[paths]
|
||||
root = "@format {env[ROOT_PATH]}"
|
||||
dvc_data_dir = "${paths.root}/data"
|
||||
pdf_for_testing = "${paths.dvc_data_dir}/pdfs_for_testing"
|
||||
png_for_testing = "${paths.dvc_data_dir}/pngs_for_testing"
|
||||
png_figures_detected = "${paths.png_for_testing}/figures_detected"
|
||||
png_tables_detected = "${paths.png_for_testing}/tables_detected_by_tp"
|
||||
hashed_pdfs_for_testing = "${paths.pdf_for_testing}/hashed"
|
||||
metadata_test_files = "${paths.dvc_data_dir}/metadata_testing_files.csv"
|
||||
test_dir = "${paths.dvc_data_dir}/test"
|
||||
test_data_dir = "${paths.dvc_data_dir}/test/test_data"
|
||||
@ -1,31 +1,9 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from pyinfra.config.loader import load_settings
|
||||
|
||||
|
||||
def get_config():
|
||||
return Config()
|
||||
|
||||
|
||||
class Config:
|
||||
def __init__(self):
|
||||
self.logging_level_root = os.environ.get("LOGGING_LEVEL_ROOT", "INFO")
|
||||
self.table_parsing_skip_pages_without_images = os.environ.get("TABLE_PARSING_SKIP_PAGES_WITHOUT_IMAGES", True)
|
||||
|
||||
# visual_logging_level: NOTHING > INFO > DEBUG > ALL
|
||||
self.visual_logging_level = "DISABLED"
|
||||
self.visual_logging_output_folder = "/tmp/debug"
|
||||
|
||||
# locations
|
||||
# FIXME: is everything here necessary?
|
||||
root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
self.dvc_data_dir = os.path.join(root, "data")
|
||||
self.pdf_for_testing = os.path.join(self.dvc_data_dir, "pdfs_for_testing")
|
||||
self.png_for_testing = os.path.join(self.dvc_data_dir, "pngs_for_testing")
|
||||
self.png_figures_detected = os.path.join(self.png_for_testing, "figures_detected")
|
||||
self.png_tables_detected = os.path.join(self.png_for_testing, "tables_detected_by_tp")
|
||||
self.hashed_pdfs_for_testing = os.path.join(self.pdf_for_testing, "hashed")
|
||||
self.metadata_test_files = os.path.join(self.dvc_data_dir, "metadata_testing_files.csv")
|
||||
self.test_dir = os.path.join(root, "test")
|
||||
self.test_data_dir = os.path.join(self.test_dir, "test_data")
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.__getattribute__(key)
|
||||
local_root_path = Path(__file__).parents[1]
|
||||
settings = load_settings(root_path=local_root_path, settings_path="config")
|
||||
return settings
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
import sys
|
||||
from dataclasses import asdict
|
||||
from operator import truth
|
||||
|
||||
@ -11,7 +12,7 @@ from pdf2img.default_objects.image import ImagePlus, ImageInfo
|
||||
from pdf2img.default_objects.rectangle import RectanglePlus
|
||||
|
||||
|
||||
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images):
|
||||
def get_analysis_pipeline(operation, table_parsing_skip_pages_without_images=True):
|
||||
if operation == "table":
|
||||
return make_analysis_pipeline(
|
||||
parse_tables,
|
||||
|
||||
@ -1,9 +1,10 @@
|
||||
import os
|
||||
from pyinfra.config.loader import load_settings
|
||||
|
||||
from cv_analysis.config import get_config
|
||||
from cv_analysis.utils.display import save_image
|
||||
|
||||
CV_CONFIG = get_config()
|
||||
settings = get_config()
|
||||
|
||||
|
||||
class VisualLogger:
|
||||
@ -39,4 +40,4 @@ class VisualLogger:
|
||||
return self.level == "ALL"
|
||||
|
||||
|
||||
vizlogger = VisualLogger(CV_CONFIG.visual_logging_level, CV_CONFIG.visual_logging_output_folder)
|
||||
vizlogger = VisualLogger(settings.logging.visual_logging_level, settings.logging.visual_logging_output_folder)
|
||||
|
||||
2655
poetry.lock
generated
2655
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
@ -1,6 +1,6 @@
|
||||
[tool.poetry]
|
||||
name = "cv-analysis-service"
|
||||
version = "1.23.0"
|
||||
version = "2.0.0"
|
||||
description = ""
|
||||
authors = []
|
||||
readme = "README.md"
|
||||
@ -25,7 +25,7 @@ coverage = "^5.5"
|
||||
dependency-check = "^0.6.0"
|
||||
lorem-text = "^2.1"
|
||||
PyMuPDF = "^1.19.6"
|
||||
pyinfra = { version = "1.10.0", source = "gitlab-research" }
|
||||
pyinfra = { version = "^2.0.0", source = "gitlab-research" }
|
||||
kn-utils = { version = "0.2.7", source = "gitlab-research" }
|
||||
pdf2img = { version = "0.7.0", source = "gitlab-red" }
|
||||
dvc-azure = "^2.21.2"
|
||||
@ -34,6 +34,10 @@ dvc-azure = "^2.21.2"
|
||||
pytest = "^7.0.1"
|
||||
pylint = "^2.17.4"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
ipython = "^8.21.0"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["test"]
|
||||
addopts = "--ignore=data"
|
||||
|
||||
@ -9,7 +9,7 @@ from pdf2image import convert_from_path
|
||||
|
||||
from cv_analysis.config import get_config
|
||||
|
||||
CV_CONFIG = get_config()
|
||||
settings = get_config()
|
||||
|
||||
|
||||
def read_json(path):
|
||||
@ -26,7 +26,7 @@ def collect_metadata(example_pages, save=False):
|
||||
metadata = list(chain.from_iterable(metadata))
|
||||
if save:
|
||||
df = pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||
df.to_csv(path.join(CV_CONFIG.dvc_data_dir, "metadata_testing_files.csv"))
|
||||
df.to_csv(path.join(settings.paths.dvc_data_dir, "metadata_testing_files.csv"))
|
||||
else:
|
||||
return pd.DataFrame(data=metadata, columns=["image_name", "pdf_name", "page"])
|
||||
|
||||
@ -50,7 +50,7 @@ def make_metadata_entry_maker():
|
||||
|
||||
|
||||
def split_pdf(example_pages):
|
||||
dir_path = CV_CONFIG.pdf_for_testing
|
||||
dir_path = settings.paths.pdf_for_testing
|
||||
i = 0
|
||||
for name, document_sections in example_pages.items():
|
||||
for pages in document_sections:
|
||||
@ -58,7 +58,7 @@ def split_pdf(example_pages):
|
||||
pdf_path=path.join(dir_path, name), dpi=300, first_page=pages[0], last_page=pages[1]
|
||||
)
|
||||
for image in images:
|
||||
fp = path.join(CV_CONFIG.png_for_testing, f"fig_table{i:0>3}.png")
|
||||
fp = path.join(settings.paths.png_for_testing, f"fig_table{i:0>3}.png")
|
||||
image.save(fp=fp, dpi=(300, 300))
|
||||
i += 1
|
||||
|
||||
@ -78,7 +78,7 @@ def find_hash(file_path):
|
||||
|
||||
def rename_files_with_hash(example_pages):
|
||||
files_to_rename = list(example_pages.keys())
|
||||
folder = CV_CONFIG.hashed_pdfs_for_testing
|
||||
folder = settings.paths.hashed_pdfs_for_testing
|
||||
|
||||
# Iterate through the folder
|
||||
for file in os.listdir(folder):
|
||||
@ -103,7 +103,7 @@ def rename_files_with_hash(example_pages):
|
||||
|
||||
|
||||
def main():
|
||||
examples_pages = read_json(path.join(CV_CONFIG.test_data_dir, "example_pages.json"))
|
||||
examples_pages = read_json(path.join(settings.paths.test_data_dir, "example_pages.json"))
|
||||
rename_files_with_hash(examples_pages)
|
||||
# collect_metadata(examples_pages, save=True)
|
||||
# split_pdf(examples_pages)
|
||||
|
||||
31
src/serve.py
31
src/serve.py
@ -1,25 +1,26 @@
|
||||
import logging
|
||||
from sys import stdout
|
||||
|
||||
from pyinfra.payload_processing.processor import make_payload_processor
|
||||
import IPython
|
||||
from kn_utils.logging import logger
|
||||
from pyinfra.examples import start_standard_queue_consumer
|
||||
from pyinfra.queue.callback import make_download_process_upload_callback
|
||||
|
||||
from cv_analysis.config import get_config
|
||||
from cv_analysis.server.pipeline import get_analysis_pipeline
|
||||
from pyinfra import config as pyinfra_config
|
||||
from pyinfra.queue.queue_manager import QueueManager
|
||||
|
||||
from cv_analysis.utils.banner import make_art
|
||||
|
||||
PYINFRA_CONFIG = pyinfra_config.get_config()
|
||||
CV_CONFIG = get_config()
|
||||
settings = get_config()
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(PYINFRA_CONFIG.logging_level_root)
|
||||
|
||||
logger.remove()
|
||||
logger.add(sink=stdout, level=settings.logging.level)
|
||||
|
||||
|
||||
def make_dispatched_data_analysis(config):
|
||||
skip_pages_without_images = config.table_parsing_skip_pages_without_images
|
||||
skip_pages_without_images = config.table_parsing.skip_pages_without_images
|
||||
|
||||
def inner(data: bytes, operation) -> list:
|
||||
def inner(data: bytes, message: dict) -> list:
|
||||
operation = message["operation"]
|
||||
analyse = get_analysis_pipeline(operation, skip_pages_without_images)
|
||||
return list(analyse(data))
|
||||
|
||||
@ -29,11 +30,9 @@ def make_dispatched_data_analysis(config):
|
||||
def main():
|
||||
logger.info(make_art())
|
||||
|
||||
process_data = make_dispatched_data_analysis(config=CV_CONFIG)
|
||||
process_payload = make_payload_processor(process_data, config=PYINFRA_CONFIG)
|
||||
|
||||
queue_manager = QueueManager(PYINFRA_CONFIG)
|
||||
queue_manager.start_consuming(process_payload)
|
||||
process = make_dispatched_data_analysis(settings)
|
||||
callback = make_download_process_upload_callback(process, settings)
|
||||
start_standard_queue_consumer(callback, settings)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
8
test/fixtures/table_parsing.py
vendored
8
test/fixtures/table_parsing.py
vendored
@ -5,7 +5,7 @@ import cv2
|
||||
import pytest
|
||||
from dvc.repo import Repo
|
||||
from funcy import first
|
||||
from loguru import logger
|
||||
from kn_utils.logging import logger
|
||||
|
||||
from cv_analysis.config import get_config
|
||||
from cv_analysis.locations import REPO_ROOT_PATH, TEST_DATA_DVC
|
||||
@ -13,12 +13,12 @@ from cv_analysis.utils.draw import draw_rectangles
|
||||
from cv_analysis.utils.open_pdf import open_pdf
|
||||
from test.fixtures.figure_detection import paste_text
|
||||
|
||||
CV_CONFIG = get_config()
|
||||
settings = get_config()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def client_page_with_table(test_file_index, dvc_test_data):
|
||||
img_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.png")
|
||||
img_path = join(settings.paths.test_data_dir, f"test{test_file_index}.png")
|
||||
return first(open_pdf(img_path))
|
||||
|
||||
|
||||
@ -32,7 +32,7 @@ def dvc_test_data():
|
||||
|
||||
@pytest.fixture
|
||||
def expected_table_annotation(test_file_index):
|
||||
json_path = join(CV_CONFIG.test_data_dir, f"test{test_file_index}.json")
|
||||
json_path = join(settings.paths.test_data_dir, f"test{test_file_index}.json")
|
||||
with open(json_path) as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@ from cv_analysis.table_parsing import parse_tables
|
||||
from cv_analysis.utils.test_metrics import compute_document_score
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Azure Connection String is not set and cannot be found. Where is it hiding?")
|
||||
@pytest.mark.parametrize("score_threshold", [0.95])
|
||||
@pytest.mark.parametrize("test_file_index", range(1, 11))
|
||||
def test_table_parsing_on_client_pages(
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user