Add compression for storage items before upload; update script for extraction
This commit is contained in:
parent
90f8f9da36
commit
e7ee0cda42
@ -121,7 +121,7 @@ class AggregationStorageStrategy(ResponseStrategy):
|
||||
object_descriptor = get_response_object_descriptor(storage_upload_info)
|
||||
# TODO: object_descriptor needs suffix
|
||||
# Note: what did I mean by that?
|
||||
self.storage.put_object(**object_descriptor, data=data)
|
||||
self.storage.put_object(**object_descriptor, data=gzip.compress(data))
|
||||
return {**storage_upload_info, "responseFile": object_descriptor["object_name"]}
|
||||
|
||||
def merge_queue_items(self):
|
||||
|
||||
@ -10,7 +10,9 @@ from pyinfra.storage.storages import get_s3_storage
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--bucket_name", "-b", required=True)
|
||||
parser.add_argument("--analysis_container", "-a", choices=["detr", "ner", "image", "conversion", "extraction"], required=True)
|
||||
parser.add_argument(
|
||||
"--analysis_container", "-a", choices=["detr", "ner", "image", "conversion", "extraction"], required=True
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
@ -48,9 +50,18 @@ def build_message_bodies(analyse_container_type, bucket_name):
|
||||
if analyse_container_type == "detr" or analyse_container_type == "image":
|
||||
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "IMAGE_INFO.json.gz"})
|
||||
if analyse_container_type == "conversion":
|
||||
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "png.gz", "operation": "conversion", "pages": [1,2,3]})
|
||||
message_dict.update(
|
||||
{
|
||||
"targetFileExtension": "ORIGIN.pdf.gz",
|
||||
"responseFileExtension": "json.gz",
|
||||
"operation": "conversion",
|
||||
"pages": [1, 2, 3],
|
||||
}
|
||||
)
|
||||
if analyse_container_type == "extraction":
|
||||
message_dict.update({"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"})
|
||||
message_dict.update(
|
||||
{"targetFileExtension": "ORIGIN.pdf.gz", "responseFileExtension": "json.gz", "operation": "extraction"}
|
||||
)
|
||||
if analyse_container_type == "ner":
|
||||
message_dict.update(
|
||||
{"targetFileExtension": "TEXT.json.gz", "responseFileExtension": "NER_ENTITIES.json.gz"}
|
||||
|
||||
@ -54,7 +54,7 @@ def components_type(request):
|
||||
|
||||
|
||||
def decode(storage_item):
|
||||
storage_item = json.loads(storage_item.decode())
|
||||
storage_item = json.loads(gzip.decompress(storage_item).decode())
|
||||
if not isinstance(storage_item, list):
|
||||
storage_item = [storage_item]
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user