Add gzip compression for storage items before upload; update the script for extraction

This commit is contained in:
Julius Unverfehrt 2022-06-02 15:49:28 +02:00
parent 90f8f9da36
commit e7ee0cda42
3 changed files with 16 additions and 5 deletions

View File

@ -121,7 +121,7 @@ class AggregationStorageStrategy(ResponseStrategy):
object_descriptor = get_response_object_descriptor(storage_upload_info)
# TODO: object_descriptor needs suffix
# Note: what did I mean with that?
self.storage.put_object(**object_descriptor, data=data)
self.storage.put_object(**object_descriptor, data=gzip.compress(data))
return {**storage_upload_info, "responseFile": object_descriptor["object_name"]}
def merge_queue_items(self):

View File

@ -10,7 +10,9 @@ from pyinfra.storage.storages import get_s3_storage
def parse_args() -> argparse.Namespace:
    """Parse the command-line arguments of the script.

    Options:
        --bucket_name / -b: required.
        --analysis_container / -a: required; must be one of "detr", "ner",
            "image", "conversion" or "extraction".

    Returns:
        The populated ``argparse.Namespace`` with the attributes
        ``bucket_name`` and ``analysis_container``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--bucket_name", "-b", required=True)
    # Register the option exactly once: adding the same option strings a
    # second time makes argparse raise a conflicting-option error.
    parser.add_argument(
        "--analysis_container",
        "-a",
        choices=["detr", "ner", "image", "conversion", "extraction"],
        required=True,
    )
    return parser.parse_args()
@ -48,9 +50,18 @@ def build_message_bodies(analyse_container_type, bucket_name):
if analyse_container_type == "detr" or analyse_container_type == "image":
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz"})
if analyse_container_type == "conversion":
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "png.gz", "operation": "conversion", "pages": [1,2,3]})
message_dict.update(
{
"targetFileExtension": "ORIGIN.pdf.gz",
"responseFileExtension": "json.gz",
"operation": "conversion",
"pages": [1, 2, 3],
}
)
if analyse_container_type == "extraction":
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"})
message_dict.update(
{"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"}
)
if analyse_container_type == "ner":
message_dict.update(
{"targetFileExtension": "TEXT.json.gz", "responseFileExtension": "NER_ENTITIES.json.gz"}

View File

@ -54,7 +54,7 @@ def components_type(request):
def decode(storage_item):
storage_item = json.loads(storage_item.decode())
storage_item = json.loads(gzip.decompress(storage_item).decode())
if not isinstance(storage_item, list):
storage_item = [storage_item]