Merge in RR/pyinfra from 2.0.0-input-output-file-pattern-for-download-strategy to 2.0.0
Squashed commit of the following:
commit c7ce79ebbeace6a8cb7925ed69eda2d7cd2a4783
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Fri Jun 24 12:35:29 2022 +0200
refactor
commit 80f04e544962760adb2dc60c9dd03ccca22167d6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Fri Jun 24 11:06:10 2022 +0200
refactoring of component factory, callback and client-pipeline getter
commit 6c024e1a789e1d55f0739c6846e5c02e8b7c943d
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 20:04:10 2022 +0200
operations section in config cleaned up; added upload formatter
commit c85800aefc224967cea591c1ec4cf1aaa3ac8215
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 19:22:51 2022 +0200
refactoring; removed obsolete config entries and code
commit 4be125952d82dc868935c8c73ad87fd8f0bd1d6c
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 19:14:47 2022 +0200
removed obsolete code
commit ac69a5c8e3f1e2fd7e828a17eeab97984f4f9746
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 18:58:41 2022 +0200
refactoring: rm dl strat module
commit efd36d0fc4f8f36d267bfa9d35415811fe723ccc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 18:33:51 2022 +0200
refactoring: multi dl strat -> downloader, rm single dl strat
commit afffdeb993500a6abdb6fe85a549e3d6e97e9ee7
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 16:39:22 2022 +0200
operations section in config cleaned up
commit 671129af3e343490e0fb277a2b0329aa3027fd73
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Thu Jun 23 16:09:16 2022 +0200
rename prometheus metric name to include service name
commit 932a3e314b382315492aecab95b1f02f2916f8a6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 14:43:23 2022 +0200
cleaned up file descr mngr
commit 79350b4ce71fcd095ed6a5e1d3a598ea246fae53
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 12:26:15 2022 +0200
refactoring WIP: moving response strategy logic into storage strategy (needs to be refactored as well, later) and file descr mngr; the moved code still needs to be cleaned up.
commit 7e48c66f0c378b25a433a4034eefdc8a0957e775
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 12:00:48 2022 +0200
refactoring; removed operation / response folder from output path
commit 8e6cbdaf23c48f6eeb52512b7f382d5727e206d6
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 11:08:37 2022 +0200
refactoring; added operation -> file pattern mapping to file descr mngr (mainly for self-documentation purposes)
commit 2c80d7cec0cc171e099e5b13aadd2ae0f9bf4f02
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 10:59:57 2022 +0200
refactoring: introduced input- and output-file specific methods to file descr mngr
commit ecced37150eaac3008cc1b01b235e5f7135e504b
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 10:43:26 2022 +0200
refactoring
commit 3828341e98861ff8d63035ee983309ad5064bb30
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Thu Jun 23 10:42:46 2022 +0200
refactoring
commit 9a7c412523d467af40feb6924823ca89e28aadfe
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jun 22 17:04:54 2022 +0200
add prometheus metric name for default operation
commit d207b2e274ba53b2a21a18c367bb130fb05ee1cd
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jun 22 17:02:55 2022 +0200
Merge config
commit d3fdf36b12d8def18810454765e731599b833bfc
Author: Matthias Bisping <matthias.bisping@iqser.com>
Date: Wed Jun 22 17:01:12 2022 +0200
added fixmes / todos
commit f49d0b9cb7764473ef9d127bc5d88525a4a16a23
Author: Julius Unverfehrt <julius.unverfehrt@iqser.com>
Date: Wed Jun 22 16:28:25 2022 +0200
update script
... and 47 more commits
import argparse
import json

import pika

from pyinfra.config import CONFIG
from pyinfra.storage.storages import get_s3_storage


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--bucket_name", "-b", required=True)
    parser.add_argument(
        "--analysis_container",
        "-a",
        choices=["detr", "ner", "image", "conversion", "extraction", "dl_error", "table_parsing"],
        required=True,
    )
    args = parser.parse_args()
    return args


def read_connection_params():
    # Build pika connection parameters from the central pyinfra CONFIG.
    credentials = pika.PlainCredentials(CONFIG.rabbitmq.user, CONFIG.rabbitmq.password)
    parameters = pika.ConnectionParameters(
        host=CONFIG.rabbitmq.host,
        port=CONFIG.rabbitmq.port,
        heartbeat=CONFIG.rabbitmq.heartbeat,
        credentials=credentials,
    )
    return parameters


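# For reference, these are the CONFIG attributes this script reads. The actual
# pyinfra config schema is not shown in this file, so treat this as an inferred
# sketch rather than the authoritative layout:
#
#   rabbitmq:
#     user, password, host, port, heartbeat
#     queues:
#       input, output, dead_letter

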
def make_channel(connection) -> pika.adapters.blocking_connection.BlockingChannel:
    channel = connection.channel()
    # Deliver at most one unacknowledged message at a time to this consumer.
    channel.basic_qos(prefetch_count=1)
    return channel


def declare_queue(channel, queue: str):
    # Rejected or expired messages are re-routed via the default exchange ("")
    # to the configured dead-letter queue.
    args = {"x-dead-letter-exchange": "", "x-dead-letter-routing-key": CONFIG.rabbitmq.queues.dead_letter}
    return channel.queue_declare(queue=queue, auto_delete=False, durable=True, arguments=args)


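# declare_queue points rejected messages at CONFIG.rabbitmq.queues.dead_letter but
# never declares that queue itself. A minimal sketch of doing so (hypothetical
# helper, not part of the original script; the real pyinfra services may declare
# it with different arguments):
def declare_dead_letter_queue(channel):
    return channel.queue_declare(queue=CONFIG.rabbitmq.queues.dead_letter, auto_delete=False, durable=True)

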
def make_connection() -> pika.BlockingConnection:
    parameters = read_connection_params()
    connection = pika.BlockingConnection(parameters)
    return connection


def build_message_bodies(analysis_container, bucket_name):
    def update_message(message_dict):
        # Each analysis container expects its own target/response file extensions
        # (and, for some, extra fields such as "operation" and "pages").
        if analysis_container in ("detr", "image"):
            message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz"})
        elif analysis_container == "conversion":
            message_dict.update(
                {
                    "targetFileExtension": "ORIGIN.pdf.gz",
                    "responseFileExtension": "json.gz",
                    "operation": "conversion",
                    "pages": [1, 2, 3],
                }
            )
        elif analysis_container == "table_parsing":
            message_dict.update(
                {
                    "operation": "table_parsing",
                    "pages": [1, 2, 3],
                }
            )
        elif analysis_container == "extraction":
            message_dict.update(
                {"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"}
            )
        elif analysis_container == "dl_error":
            # Point at a file name that will not exist, so the download fails.
            message_dict.update({"targetFileExtension": "no_such_file", "responseFileExtension": "IMAGE_INFO.json.gz"})
        elif analysis_container == "ner":
            message_dict.update(
                {"targetFileExtension": "TEXT.json.gz", "responseFileExtension": "NER_ENTITIES.json.gz"}
            )
        return message_dict

    storage = get_s3_storage()
    for pdf_name in storage.get_all_object_names(bucket_name):
        if "pdf" not in pdf_name:
            continue
        # Object names look like "<dossierId>/<fileId>.<suffixes>".
        file_id = pdf_name.split(".")[0]
        dossier_id, file_id = file_id.split("/")
        message_dict = {"dossierId": dossier_id, "fileId": file_id}
        update_message(message_dict)
        yield json.dumps(message_dict).encode()


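# Example (object name invented for illustration): for an S3 object
# "abc/123.ORIGIN.pdf.gz" and --analysis_container ner, the generator yields
#
#   b'{"dossierId": "abc", "fileId": "123",
#      "targetFileExtension": "TEXT.json.gz",
#      "responseFileExtension": "NER_ENTITIES.json.gz"}'
#
# (wrapped here for readability; the actual bytes are a single line).

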
def main(args):
    connection = make_connection()
    channel = make_channel(connection)
    declare_queue(channel, CONFIG.rabbitmq.queues.input)
    declare_queue(channel, CONFIG.rabbitmq.queues.output)

    for body in build_message_bodies(args.analysis_container, args.bucket_name):
        channel.basic_publish("", CONFIG.rabbitmq.queues.input, body)
        print(f"Put {body} on {CONFIG.rabbitmq.queues.input}")

    # Drain responses; consume() yields (None, None, None) once the queue has
    # been idle for inactivity_timeout seconds, which ends the loop.
    for method_frame, _, body in channel.consume(queue=CONFIG.rabbitmq.queues.output, inactivity_timeout=1):
        if not body:
            break
        print(f"Received {json.loads(body)}")
        channel.basic_ack(method_frame.delivery_tag)
    channel.close()


if __name__ == "__main__":
    main(parse_args())
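
# Usage sketch (the script's file name does not appear above, so
# "send_test_messages.py" is a placeholder; RabbitMQ, the S3 storage, and the
# pyinfra CONFIG must all be reachable and configured):
#
#   python send_test_messages.py --bucket_name my-bucket --analysis_container ner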