"""Command-line helper to add, show and purge gzip-compressed files in the configured storage bucket."""

import argparse
import gzip
import json
import os
from pathlib import Path

from pyinfra.config import get_config
from pyinfra.storage.storage import get_s3_storage
from tqdm import tqdm

CONFIG = get_config()

# Local file suffix -> compound suffix appended to the stored object's name.
_STORAGE_SUFFIXES = {
    ".pdf": ".ORIGIN.pdf.gz",
    ".json": ".TEXT.json.gz",
}


def parse_args():
    """Parse the command line.

    Returns:
        The parsed ``argparse.Namespace``. ``command`` is ``"add"``,
        ``"purge"``, ``"show"`` or ``None`` when no sub-command was given;
        the remaining attributes depend on the chosen sub-command.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(help="sub-command help", dest="command")

    parser_add = subparsers.add_parser("add", help="Add file(s) to the MinIO store")
    parser_add.add_argument("dossier_id")
    # Exactly one of --file / --directory must be supplied.
    add_group = parser_add.add_mutually_exclusive_group(required=True)
    add_group.add_argument("--file", "-f")
    add_group.add_argument("--directory", "-d")

    subparsers.add_parser("purge", help="Delete all files and buckets in the MinIO store")

    parser_show = subparsers.add_parser("show", help="Show compressed json from storage")
    parser_show.add_argument("dossier_id")
    parser_show.add_argument("file_name")

    return parser.parse_args()


def combine_dossier_id_and_file_id_and_extension(dossier_id, file_id, extension):
    """Return the object name ``<dossier_id>/<file_id><extension>``."""
    return f"{dossier_id}/{file_id}{extension}"


def add_file_compressed(storage, bucket_name, dossier_id, path) -> None:
    """Gzip-compress a local file and store it under the given dossier.

    The object name is built from the dossier id, the file's stem and a
    suffix chosen by the file type (``.pdf`` -> ``.ORIGIN.pdf.gz``,
    ``.json`` -> ``.TEXT.json.gz``).

    Args:
        storage: Storage client; only its ``put_object`` method is used.
        bucket_name: Bucket the object is written to.
        dossier_id: Prefix (folder) of the stored object's name.
        path: Path of the local ``.pdf`` or ``.json`` file.

    Raises:
        ValueError: If the file's suffix is neither ``.pdf`` nor ``.json``.
    """
    path = Path(path)
    try:
        suffix_gz = _STORAGE_SUFFIXES[path.suffix]
    except KeyError:
        # Previously this case surfaced as an UnboundLocalError on suffix_gz.
        supported = ", ".join(sorted(_STORAGE_SUFFIXES))
        raise ValueError(
            f"Unsupported file type {path.suffix!r} for {path}; expected one of: {supported}"
        ) from None

    object_name = combine_dossier_id_and_file_id_and_extension(dossier_id, path.stem, suffix_gz)

    with open(path, "rb") as f:
        data = gzip.compress(f.read())
    storage.put_object(bucket_name, object_name, data)


def main() -> None:
    """Run the sub-command selected on the command line."""
    # Parse first so that --help or a usage error never touches the storage.
    args = parse_args()

    storage = get_s3_storage(CONFIG)
    bucket_name = CONFIG.storage_bucket
    if not storage.has_bucket(bucket_name):
        storage.make_bucket(bucket_name)

    if args.command == "add":
        if args.file:
            add_file_compressed(storage, bucket_name, args.dossier_id, args.file)
        elif args.directory:
            directory = Path(args.directory)
            candidates = (directory / fname for fname in sorted(os.listdir(directory)))
            # Sub-directories and files of other types are skipped so that a
            # stray entry does not abort the batch halfway through.
            uploadable = [
                candidate
                for candidate in candidates
                if candidate.is_file() and candidate.suffix in _STORAGE_SUFFIXES
            ]
            for path in tqdm(uploadable, desc="Adding files"):
                add_file_compressed(storage, bucket_name, args.dossier_id, path)

    elif args.command == "purge":
        storage.clear_bucket(bucket_name)

    elif args.command == "show":
        # The stored object is expected to be gzip-compressed JSON.
        compressed = storage.get_object(bucket_name, f"{args.dossier_id}/{args.file_name}")
        parsed = json.loads(gzip.decompress(compressed))
        print(json.dumps(parsed, indent=2))


if __name__ == "__main__":
    main()