Matthias Bisping 94254e1681 Pull request #38: 2.0.0 input output file pattern for download strategy
Merge in RR/pyinfra from 2.0.0-input-output-file-pattern-for-download-strategy to 2.0.0

Squashed commit of the following:

commit c7ce79ebbeace6a8cb7925ed69eda2d7cd2a4783
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Fri Jun 24 12:35:29 2022 +0200

    refactor

commit 80f04e544962760adb2dc60c9dd03ccca22167d6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Fri Jun 24 11:06:10 2022 +0200

    refactoring of component factory, callback and client-pipeline getter

commit 6c024e1a789e1d55f0739c6846e5c02e8b7c943d
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 20:04:10 2022 +0200

    operations section in config cleaned up; added upload formatter

commit c85800aefc224967cea591c1ec4cf1aaa3ac8215
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 19:22:51 2022 +0200

    refactoring; removed obsolete config entries and code

commit 4be125952d82dc868935c8c73ad87fd8f0bd1d6c
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 19:14:47 2022 +0200

    removed obsolete code

commit ac69a5c8e3f1e2fd7e828a17eeab97984f4f9746
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 18:58:41 2022 +0200

    refactoring: rm dl strat module

commit efd36d0fc4f8f36d267bfa9d35415811fe723ccc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 18:33:51 2022 +0200

    refactoring: multi dl strat -> downloader, rm single dl strat

commit afffdeb993500a6abdb6fe85a549e3d6e97e9ee7
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 16:39:22 2022 +0200

    operations section in config cleaned up

commit 671129af3e343490e0fb277a2b0329aa3027fd73
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Thu Jun 23 16:09:16 2022 +0200

    rename prometheus metric name to include service name

commit 932a3e314b382315492aecab95b1f02f2916f8a6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 14:43:23 2022 +0200

    cleaned up file descr mngr

commit 79350b4ce71fcd095ed6a5e1d3a598ea246fae53
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 12:26:15 2022 +0200

    refactoring WIP: moving response strategy logic into storage strategy (needs to be refactored as well, later) and file descr mngr. Here the moved code needs to be cleaned up.

commit 7e48c66f0c378b25a433a4034eefdc8a0957e775
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 12:00:48 2022 +0200

    refactoring; removed operation / response folder from output path

commit 8e6cbdaf23c48f6eeb52512b7f382d5727e206d6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 11:08:37 2022 +0200

    refactoring; added operation -> file pattern mapping to file descr mngr (mainly for self-documentation purposes)

commit 2c80d7cec0cc171e099e5b13aadd2ae0f9bf4f02
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 10:59:57 2022 +0200

    refactoring: introduced input- and output-file specific methods to file descr mngr

commit ecced37150eaac3008cc1b01b235e5f7135e504b
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 10:43:26 2022 +0200

    refactoring

commit 3828341e98861ff8d63035ee983309ad5064bb30
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Thu Jun 23 10:42:46 2022 +0200

    refactoring

commit 9a7c412523d467af40feb6924823ca89e28aadfe
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jun 22 17:04:54 2022 +0200

    add prometheus metric name for default operation

commit d207b2e274ba53b2a21a18c367bb130fb05ee1cd
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jun 22 17:02:55 2022 +0200

    Merge config

commit d3fdf36b12d8def18810454765e731599b833bfc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date:   Wed Jun 22 17:01:12 2022 +0200

    added fixmes / todos

commit f49d0b9cb7764473ef9d127bc5d88525a4a16a23
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date:   Wed Jun 22 16:28:25 2022 +0200

    update script

... and 47 more commits
2022-06-24 12:59:26 +02:00

241 lines
6.7 KiB
Python

import json
import re
from itertools import starmap, repeat, chain
from operator import itemgetter
import pytest
from funcy import compose, first, second, pluck, lflatten, lzip
from pyinfra.config import CONFIG
from pyinfra.default_objects import get_component_factory
from pyinfra.server.packing import unpack, pack
from pyinfra.utils.encoding import compress, decompress
from test.config import CONFIG as TEST_CONFIG
@pytest.mark.parametrize("batched", [False, True])
@pytest.mark.parametrize("one_to_many", [False, True])
@pytest.mark.parametrize("analysis_task", [False, True])
@pytest.mark.parametrize("n_items", [1, 3])
@pytest.mark.parametrize("n_pages", [2])
@pytest.mark.parametrize("buffer_size", [1, 2])
@pytest.mark.parametrize("item_type", ["string", "image", "pdf"])
@pytest.mark.parametrize(
    "queue_manager_name",
    [
        # "mock",
        "pika",
    ],
    scope="session",
)
@pytest.mark.parametrize(
    "client_name",
    [
        "mock",
        # "s3",
        # "azure",
    ],
    scope="session",
)
@pytest.mark.parametrize(
    "components_type",
    [
        # "test",
        "real",
    ],
)
@pytest.mark.parametrize(
    "many_to_n",
    [
        True,
        # False,
    ],
)
def test_serving(server_process, bucket_name, components, targets, data_message_pairs, n_items, many_to_n):
    """Upload input data, publish request(s), let the consumer run, and compare its uploads to `targets`.

    With `many_to_n` set, all items are uploaded for one single request; otherwise one request is
    published per item. `server_process` is requested but never referenced in the body
    (presumably it is needed for its side effect only -- confirm against the fixture definition).
    """
    storage, queue_manager, consumer, file_descriptor_manager = components

    # Preconditions: both queues and the bucket start out empty.
    assert queue_manager.input_queue.to_list() == []
    assert queue_manager.output_queue.to_list() == []
    assert list(storage.get_all_object_names(bucket_name)) == []

    if n_items:
        assert data_message_pairs

    upload_and_publish = (
        upload_data_to_folder_in_storage_and_publish_single_request_to_queue
        if many_to_n
        else upload_data_to_storage_and_publish_requests_to_queue
    )
    upload_and_publish(storage, queue_manager, data_message_pairs, file_descriptor_manager)

    # A truthy `many_to_n` contributes int(many_to_n) messages; otherwise there is one per item.
    n_messages = int(many_to_n) or n_items
    consumer.consume_and_publish(n=n_messages)

    uploaded = get_data_uploaded_by_consumer(queue_manager, storage, bucket_name)

    # TODO: correctness of target should be validated as well, since production has become non-trivial
    assert sorted(uploaded) == sorted(targets)
@pytest.fixture
def data_metadata_packs(input_data_items, metadata):
    """Return, per (data item, metadata) pair, the packed pair serialized to JSON and encoded to bytes."""
    return [
        json.dumps(pack(data_item, item_metadata)).encode()
        for data_item, item_metadata in zip(input_data_items, metadata)
    ]
@pytest.fixture
def data_message_pairs(data_metadata_packs, queue_message_metadata):
    """Return a list pairing each packed data item with its queue message, position by position."""
    pairs = zip(data_metadata_packs, queue_message_metadata)
    return list(pairs)
# TODO: refactor; too many params
def upload_data_to_storage_and_publish_requests_to_queue(
    storage, queue_manager, data_message_pairs, file_descriptor_manager
):
    """Upload every data item and publish its accompanying message, one pair at a time."""
    for payload, request in data_message_pairs:
        upload_data_to_storage_and_publish_request_to_queue(
            storage=storage,
            queue_manager=queue_manager,
            data=payload,
            message=request,
            file_descriptor_manager=file_descriptor_manager,
        )
# TODO: refactor; too many params
def upload_data_to_storage_and_publish_request_to_queue(storage, queue_manager, data, message, file_descriptor_manager):
    """Store the compressed `data` under the input descriptor derived from `message`, then publish `message`."""
    input_descriptor = file_descriptor_manager.get_input_object_descriptor(message)
    compressed = compress(data)
    storage.put_object(**input_descriptor, data=compressed)

    queue_manager.publish_request(message)
# TODO: refactor body; too long and scripty
def upload_data_to_folder_in_storage_and_publish_single_request_to_queue(
    storage, queue_manager, data_message_pairs, file_descriptor_manager
):
    """Upload all data items for one request and publish that request once.

    Only the message of the first pair is used. Each data item is stored under the input
    descriptor of that message, with the object name rewritten for one entry of the
    message's "pages" value. Items and pages are matched positionally; surplus entries
    on either side are ignored.
    """
    assert data_message_pairs

    ref_message = second(first(data_message_pairs))
    payloads = (first(pair) for pair in data_message_pairs)

    for payload, page in zip(payloads, ref_message["pages"]):
        # A descriptor is requested per item because its object name is overwritten right below.
        descriptor = file_descriptor_manager.get_input_object_descriptor(ref_message)
        descriptor["object_name"] = build_filepath(descriptor, page)
        storage.put_object(**descriptor, data=compress(payload))

    queue_manager.publish_request(ref_message)
def build_filepath(object_descriptor, page):
    """Return the descriptor's object name with every `id:<digits>` marker rewritten to `id:<page>`.

    Args:
        object_descriptor: Mapping with an "object_name" entry; it is not modified.
        page: Value to place after "id:"; it is formatted with `str.format` semantics.

    Returns:
        The rewritten object name. A name without an `id:<digits>` marker is returned unchanged.
    """
    object_name = object_descriptor["object_name"]
    # `\d+` consumes the whole number, so multi-digit ids such as "id:10" do not leave stray digits
    # behind. The callable replacement keeps `page` from being interpreted as a regex template.
    return re.sub(r"id:\d+", lambda _match: f"id:{page}", object_name)
def get_data_uploaded_by_consumer(queue_manager, storage, bucket_name):
    """Fetch and decode every object listed under "response_files" in the output queue's messages.

    Returns:
        All decoded items of all fetched objects in one list, sorted by each item's first element.
    """
    responses = queue_manager.output_queue.to_list()
    file_names = lflatten(pluck("response_files", responses))

    # Fetch everything up front; decoding itself is lazy and happens while sorting.
    stored_objects = [storage.get_object(bucket_name, file_name) for file_name in file_names]
    decoded_items = chain.from_iterable(map(decode, stored_objects))

    return sorted(decoded_items, key=itemgetter(0))
@pytest.fixture
def components(components_type, real_components, test_components, bucket_name):
    """Yield (storage, queue_manager, consumer, file_descriptor_manager) for the selected component type.

    The queue is cleared and the bucket is created and emptied before the yield; the queue and
    the bucket are cleared again afterwards.

    Raises:
        ValueError: If `components_type` is neither "real" nor "test".
    """
    by_type = {"real": real_components, "test": test_components}
    if components_type not in by_type:
        raise ValueError(f"Unknown component type '{components_type}'.")

    storage, queue_manager, consumer, file_descriptor_manager = by_type[components_type]

    # Setup: start from an empty queue and an existing, empty bucket.
    queue_manager.clear()
    storage.make_bucket(bucket_name)
    storage.clear_bucket(bucket_name)

    yield storage, queue_manager, consumer, file_descriptor_manager

    # Teardown: leave nothing behind.
    queue_manager.clear()
    storage.clear_bucket(bucket_name)
def decode(storage_item):
    """Lazily yield the unpacked entries of a stored object.

    The object is decompressed, decoded to text and parsed as JSON. A parsed value that is not
    a list is treated as a single entry.
    """
    parsed = json.loads(decompress(storage_item).decode())
    entries = parsed if isinstance(parsed, list) else [parsed]
    for entry in entries:
        yield unpack(entry)
@pytest.fixture(params=["real", "mixed"])
def components_type(request):
    """Provide the component type, one fixture instance per entry of `params`."""
    # NOTE(review): the `components` fixture only accepts "real" and "test"; "mixed" would make it
    # raise ValueError. Confirm whether this parameter list is still intended.
    selected_type = request.param
    return selected_type
@pytest.fixture
def real_components(url):
    """Build storage, queue manager, consumer and file descriptor manager via the component factory.

    Before the factory is created, the global CONFIG's service section is overwritten in place:
    operations and response formatter come from the test config, the upload formatter is set to
    "identity". The previous values are not restored by this fixture.
    """
    CONFIG["service"]["operations"] = TEST_CONFIG.service.operations
    CONFIG["service"]["response_formatter"] = TEST_CONFIG.service.response_formatter
    CONFIG["service"]["upload_formatter"] = "identity"

    factory = get_component_factory(CONFIG)

    # Components are requested in a fixed order: callback, consumer, queue manager, storage, manager.
    consumer = factory.get_consumer(factory.get_callback(url))
    queue_manager = factory.get_queue_manager()
    storage = factory.get_storage()
    file_descriptor_manager = factory.get_file_descriptor_manager()

    return storage, queue_manager, consumer, file_descriptor_manager
@pytest.fixture
def test_components(url, queue_manager, storage):
    """Placeholder for hand-wired test components; currently provides None.

    The former wiring is kept below, disabled, for reference.
    """
    # component_factory = ComponentFactory(CONFIG)
    #
    # file_descriptor_manager = component_factory.get_file_descriptor_manager()
    #
    # visitor = QueueVisitor(
    #     storage=storage,
    #     callback=component_factory.get_callback(url),
    #     response_strategy=component_factory.get_response_strategy(storage),
    # )
    # consumer = Consumer(visitor, queue_manager)
    #
    # return storage, queue_manager, consumer, file_descriptor_manager
    return None