Merge in RR/pyinfra from image-prediction-v2-support to 2.0.0
Squashed commit of the following:
commit 37c536324e847357e86dd9b72d1e07ad792ed90f
Merge: 77d1db8 01bfb1d
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 13:53:56 2022 +0200
Merge branch '2.0.0' of ssh://git.iqser.com:2222/rr/pyinfra into image-prediction-v2-support
commit 77d1db8e8630de8822c124eb39f4cd817ed1d3e1
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 13:07:41 2022 +0200
add operation assignment via config if operation is not defined by caller
commit 36c8ca48a8c6151f713c093a23de110901ba6b02
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 10:33:34 2022 +0200
refactor nothing part 2
commit f6cd0ef986802554dd544b9b7a24073d3b3f05b5
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Mon Jul 11 10:28:49 2022 +0200
refactor nothing
commit 1e70d49531e89613c70903be49290b94ee014f65
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jul 6 17:42:12 2022 +0200
enable docker-compose fixture
commit 9fee32cecdd120cfac3e065fb8ad2b4f37b49226
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jul 6 17:40:35 2022 +0200
added 'multi' key to actual operation configurations
commit 4287f6d9878dd361489b8490eafd06f81df472ce
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jul 6 16:56:12 2022 +0200
removed debug prints
commit 23a533e8f99222c7e598fb0864f65e9aa3508a3b
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jul 6 16:31:50 2022 +0200
completed correcting / cleaning upload and download logic with regard to operations and ids. next: remove debug code
commit 33246d1ff94989d2ea70242c7ae2e58afa4d35c1
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jul 6 14:37:17 2022 +0200
corrected / cleaned upload and download logic with regard to operations and ids
commit 7f2b4e882022c6843cb2f80df202caa495c54ee9
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Tue Jul 5 18:41:07 2022 +0200
partially decomplected file descriptor manager from concrete and non-generic descriptor code
commit 40b892da17670dae3b8eba1700877c1dcf219852
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Tue Jul 5 09:53:46 2022 +0200
typo
commit ec4fa8e6f4551ff1f8d4f78c484b7a260f274898
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Tue Jul 5 09:52:41 2022 +0200
typo
commit 701b43403c328161fd96a73ce388a66035cca348
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Mon Jul 4 17:26:53 2022 +0200
made adjustments for image classification with pyinfra 2.x; added related fixmes
commit 7a794bdcc987631cdc4d89b5620359464e2e018e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Mon Jul 4 13:05:26 2022 +0200
removed obsolete imports
commit 3fc6a7ef5d0172dbce1c4292d245eced2f378b5a
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Mon Jul 4 11:47:12 2022 +0200
enable docker-compose fixture
commit 36d8d3bc851b06d94cf12a73048a00a67ef79c42
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Mon Jul 4 11:46:53 2022 +0200
renaming
commit 3bf00d11cd041dff325b66f13fcd00d3ce96b8b5
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 30 12:47:57 2022 +0200
refactoring: added cached pipeline factory
commit 90e735852af2f86e35be845fabf28494de952edb
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jun 29 13:47:08 2022 +0200
renaming
commit 93b3d4b202b41183ed8cabe193a4bfa03f520787
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jun 29 13:25:03 2022 +0200
further refactored server setup code: moving and decomplecting
commit 8b2ed83c7ade5bd811cb045d56fbfb0353fa385e
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jun 29 12:53:09 2022 +0200
refactored server setup code: factored out and decoupled operation registry and prometheus summary registry
... and 6 more commits
116 lines · 3.9 KiB · Python
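The file below is a small end-to-end test script: it publishes one analysis request per PDF stored in an S3 bucket onto the RabbitMQ input queue for the selected analysis container, then consumes and prints the responses from the output queue.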
import argparse
import json

import pika

from pyinfra.config import CONFIG, parse_disjunction_string
from pyinfra.storage.storages import get_s3_storage


def parse_args():
    # CLI: pick the analysis container to exercise and, optionally, a bucket.
    parser = argparse.ArgumentParser()
    parser.add_argument("--bucket_name", "-b")
    parser.add_argument(
        "--analysis_container",
        "-a",
        choices=["detr", "ner", "image", "conversion", "extraction", "dl_error", "table_parsing"],
        required=True,
    )
    args = parser.parse_args()
    return args


def read_connection_params():
    # Build pika connection parameters from the central pyinfra CONFIG.
    credentials = pika.PlainCredentials(CONFIG.rabbitmq.user, CONFIG.rabbitmq.password)
    parameters = pika.ConnectionParameters(
        host=CONFIG.rabbitmq.host,
        port=CONFIG.rabbitmq.port,
        heartbeat=CONFIG.rabbitmq.heartbeat,
        credentials=credentials,
    )
    return parameters


def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel:
    channel = connection.channel()
    # Deliver at most one unacknowledged message at a time.
    channel.basic_qos(prefetch_count=1)
    return channel


def declare_queue(channel, queue: str):
    # Route rejected or expired messages to the configured dead-letter queue.
    args = {"x-dead-letter-exchange": "", "x-dead-letter-routing-key": CONFIG.rabbitmq.queues.dead_letter}
    return channel.queue_declare(queue=queue, auto_delete=False, durable=True, arguments=args)


def make_connection() -> pika.BlockingConnection:
    parameters = read_connection_params()
    connection = pika.BlockingConnection(parameters)
    return connection


def build_message_bodies(analysis_container, bucket_name):
    # Yield one JSON-encoded request body per PDF object in the bucket,
    # shaped according to the analysis container under test.
    def update_message(message_dict):
        if analysis_container == "detr" or analysis_container == "image":
            message_dict.update({"pages": []})
        if analysis_container == "conversion":
            message_dict.update(
                {
                    "targetFileExtension": "ORIGIN.pdf.gz",
                    "responseFileExtension": "json.gz",
                    "operation": "conversion",
                    "pages": [1, 2, 3],
                }
            )
        if analysis_container == "table_parsing":
            message_dict.update(
                {
                    "operation": "table_parsing",
                    "pages": [1, 2, 3],
                }
            )
        if analysis_container == "extraction":
            message_dict.update(
                {"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"}
            )
        if analysis_container == "dl_error":
            message_dict.update({"targetFileExtension": "no_such_file", "responseFileExtension": "IMAGE_INFO.json.gz"})
        if analysis_container == "ner":
            message_dict.update(
                {"targetFileExtension": "TEXT.json.gz", "responseFileExtension": "NER_ENTITIES.json.gz"}
            )
        return message_dict

    storage = get_s3_storage()
    for pdf_name in storage.get_all_object_names(bucket_name):
        if "pdf" not in pdf_name:
            continue
        # Object names are expected to look like "<dossierId>/<fileId>.<ext>".
        file_id = pdf_name.split(".")[0]
        dossier_id, file_id = file_id.split("/")
        message_dict = {"dossierId": dossier_id, "fileId": file_id}
        update_message(message_dict)
        yield json.dumps(message_dict).encode()


def main(args):
    connection = make_connection()
    channel = make_channel(connection)
    declare_queue(channel, CONFIG.rabbitmq.queues.input)
    declare_queue(channel, CONFIG.rabbitmq.queues.output)

    bucket_name = args.bucket_name or parse_disjunction_string(CONFIG.storage.bucket)

    # Publish one test message per PDF to the input queue.
    for body in build_message_bodies(args.analysis_container, bucket_name):
        channel.basic_publish("", CONFIG.rabbitmq.queues.input, body)
        print(f"Put {body} on {CONFIG.rabbitmq.queues.input}")

    # Drain the output queue; consume() yields (None, None, None) once the
    # queue has been idle for a second, which ends the loop.
    for method_frame, _, body in channel.consume(queue=CONFIG.rabbitmq.queues.output, inactivity_timeout=1):
        if not body:
            break
        print(f"Received {json.loads(body)}")
        channel.basic_ack(method_frame.delivery_tag)
    channel.close()


if __name__ == "__main__":
    main(parse_args())
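A minimal sketch of how the script might be invoked, assuming it is saved as send_test_messages.py (the file name is not given in this diff) and that the RabbitMQ and S3 settings are supplied via the pyinfra CONFIG:

# Hypothetical invocation: publish NER test messages for every PDF in
# the bucket "test-bucket", then print the analysis responses.
python send_test_messages.py --analysis_container ner --bucket_name test-bucket

The --bucket_name flag is optional; when omitted, the script falls back to the bucket configured under CONFIG.storage.bucket.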