amazon web services - Why is this Python boto S3 multipart upload code not working?
I am trying to upload a 10 GB file to AWS S3, and was told to use S3 multipart upload, so I stumbled upon someone's GitHub gist:
import os
import sys
import glob
import subprocess
import contextlib
import functools
import multiprocessing
from multiprocessing.pool import IMapIterator
from optparse import OptionParser
from boto.s3.connection import S3Connection
#import rfc822
import boto

aws_access_key_id = 'key id here'
aws_secret_access_key = 'access key here'


def main(transfer_file, bucket_name, s3_key_name=None, use_rr=True,
         make_public=True, cores=None):
    if s3_key_name is None:
        s3_key_name = os.path.basename(transfer_file)
    conn = S3Connection(aws_access_key_id, aws_secret_access_key)
    bucket = conn.lookup(bucket_name)
    if bucket is None:
        bucket = conn.create_bucket(bucket_name)
    mb_size = os.path.getsize(transfer_file) / 1e6
    if mb_size < 10:
        _standard_transfer(bucket, s3_key_name, transfer_file, use_rr)
    else:
        _multipart_upload(bucket, s3_key_name, transfer_file, mb_size, use_rr,
                          cores)
    s3_key = bucket.get_key(s3_key_name)
    if make_public:
        s3_key.set_acl("public-read")


def upload_cb(complete, total):
    sys.stdout.write(".")
    sys.stdout.flush()


def _standard_transfer(bucket, s3_key_name, transfer_file, use_rr):
    print(" Upload with standard transfer, not multipart", end=' ')
    new_s3_item = bucket.new_key(s3_key_name)
    new_s3_item.set_contents_from_filename(transfer_file, reduced_redundancy=use_rr,
                                           cb=upload_cb, num_cb=10)
    print()


def map_wrap(f):
    @functools.wraps(f)
    def wrapper(*args, **kwargs):
        return f(*args, **kwargs)
    return wrapper


def mp_from_ids(mp_id, mp_keyname, mp_bucketname):
    """Get the multipart upload from the bucket and multipart IDs.

    This allows us to reconstitute a connection to the upload
    from within multiprocessing functions.
    """
    conn = S3Connection(aws_access_key_id, aws_secret_access_key)
    bucket = conn.lookup(mp_bucketname)
    mp = boto.s3.multipart.MultiPartUpload(bucket)
    mp.key_name = mp_keyname
    mp.id = mp_id
    return mp


@map_wrap
def transfer_part(mp_id, mp_keyname, mp_bucketname, i, part):
    """Transfer a part of a multipart upload. Designed to be run in parallel.
    """
    mp = mp_from_ids(mp_id, mp_keyname, mp_bucketname)
    print(" Transferring", i, part)
    with open(part) as t_handle:
        mp.upload_part_from_file(t_handle, i + 1)
    os.remove(part)


def _multipart_upload(bucket, s3_key_name, tarball, mb_size, use_rr=True,
                      cores=None):
    """Upload large files using Amazon's multipart upload functionality.
    """
    def split_file(in_file, mb_size, split_num=5):
        prefix = os.path.join(os.path.dirname(in_file),
                              "%sS3PART" % (os.path.basename(s3_key_name)))
        # Require a split size between 5Mb (AWS minimum) and 250Mb
        split_size = int(max(min(mb_size / (split_num * 2.0), 250), 5))
        if not os.path.exists("%saa" % prefix):
            cl = ["split", "-b%sm" % split_size, in_file, prefix]
            subprocess.check_call(cl)
        return sorted(glob.glob("%s*" % prefix))

    mp = bucket.initiate_multipart_upload(s3_key_name, reduced_redundancy=use_rr)
    print(mp.id)
    print(mp.key_name)
    with multimap(cores) as pmap:
        for _ in pmap(transfer_part, ((mp.id, mp.key_name, mp.bucket_name, i, part)
                                      for (i, part) in
                                      enumerate(split_file(tarball, mb_size, cores)))):
            pass
    mp.complete_upload()


@contextlib.contextmanager
def multimap(cores=None):
    """Provide a multiprocessing imap like function.

    The context manager handles setting up the pool, worked around interrupt
    issues and terminating the pool on completion.
    """
    if cores is None:
        cores = max(multiprocessing.cpu_count() - 1, 1)

    def wrapper(func):
        def wrap(self, timeout=None):
            return func(self, timeout=timeout if timeout is not None else 1e100)
        return wrap
    IMapIterator.next = wrapper(IMapIterator.next)
    pool = multiprocessing.Pool(cores)
    yield pool.imap
    pool.terminate()


if __name__ == "__main__":
    parser = OptionParser()
    parser.add_option("-r", "--norr", dest="use_rr", action="store_false",
                      default=True)
    parser.add_option("-p", "--public", dest="make_public", action="store_true",
                      default=False)
    parser.add_option("-c", "--cores", dest="cores",
                      default=multiprocessing.cpu_count())
    (options, args) = parser.parse_args()
    if len(args) < 2:
        print("No args")
        sys.exit()
    kwargs = dict(use_rr=options.use_rr, make_public=options.make_public,
                  cores=int(options.cores))
    main(*args, **kwargs)
But it is not working, and I am not sure how to fix this error: "TypeError: transfer_part() missing 4 required positional arguments: 'mp_keyname', 'mp_bucketname', 'i', and 'part'"
EDIT:
Full error trace, as requested:
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "test.py", line 53, in wrapper
    return f(*args, **kwargs)
TypeError: transfer_part() missing 4 required positional arguments: 'mp_keyname', 'mp_bucketname', 'i', and 'part'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "test.py", line 132, in <module>
    main(*args, **kwargs)
  File "test.py", line 34, in main
    cores)
  File "test.py", line 96, in _multipart_upload
    for _ in pmap(transfer_part, ((mp.id, mp.key_name, mp.bucket_name, i, part) for (i, part) in enumerate(split_file(tarball, mb_size, cores)))):
  File "/Library/Frameworks/Python.framework/Versions/3.4/lib/python3.4/multiprocessing/pool.py", line 689, in next
    raise value
TypeError: transfer_part() missing 4 required positional arguments: 'mp_keyname', 'mp_bucketname', 'i', and 'part'
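As background on the shape of this error (independent of boto): pool.imap passes each item of the iterable to the target callable as a single positional argument, so a five-parameter worker only ever receives one tuple unless something unpacks it first. A minimal, self-contained sketch with hypothetical stand-in names showing how the unpacking has to happen:

import multiprocessing

def transfer_part(mp_id, mp_keyname, mp_bucketname, i, part):
    # Stand-in for the real upload worker: it just echoes its arguments.
    return (mp_id, mp_keyname, mp_bucketname, i, part)

def call_unpacked(args):
    # pool.imap delivers each tuple as ONE positional argument, so the tuple
    # must be unpacked before calling the five-parameter worker. Calling
    # transfer_part(args) here instead would raise:
    # TypeError: transfer_part() missing 4 required positional arguments
    return transfer_part(*args)

if __name__ == "__main__":
    items = [("mp-id", "key", "bucket", 0, "part-aa"),
             ("mp-id", "key", "bucket", 1, "part-ab")]
    with multiprocessing.Pool(2) as pool:
        for result in pool.imap(call_unpacked, items):
            print(result)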
If it fits your use case, you may want to use the AWS Command-Line Interface (CLI), which can automatically use multipart upload for you.
aws s3 cp file.txt s3://bucket/file.txt
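If you want to stay in Python instead of shelling out, boto3 (the successor to boto) also splits large uploads into parts automatically via upload_file. A minimal sketch, assuming boto3 is installed, credentials are already configured, and the file/bucket/key names are placeholders:

import boto3
from boto3.s3.transfer import TransferConfig

# Thresholds here are illustrative; boto3 uses sensible defaults if omitted.
config = TransferConfig(multipart_threshold=8 * 1024 * 1024,   # use multipart above 8 MB
                        multipart_chunksize=64 * 1024 * 1024,  # 64 MB parts
                        max_concurrency=4)                     # parallel part uploads

s3 = boto3.client("s3")
s3.upload_file("file.txt", "bucket", "file.txt", Config=config)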