diff env/bin/s3put @ 0:26e78fe6e8c4 draft
"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author | shellac
---|---
date | Sat, 02 May 2020 07:14:21 -0400
parents |
children |
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/bin/s3put	Sat May 02 07:14:21 2020 -0400
@@ -0,0 +1,438 @@
+#!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3
+# Copyright (c) 2006,2007,2008 Mitch Garnaat http://garnaat.org/
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the
+# "Software"), to deal in the Software without restriction, including
+# without limitation the rights to use, copy, modify, merge, publish, dis-
+# tribute, sublicense, and/or sell copies of the Software, and to permit
+# persons to whom the Software is furnished to do so, subject to the fol-
+# lowing conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
+# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+#
+import getopt
+import sys
+import os
+import boto
+
+from boto.compat import six
+
+try:
+    # multipart portions copyright Fabian Topfstedt
+    # https://gist.github.com/924094
+
+    import math
+    import mimetypes
+    from multiprocessing import Pool
+    from boto.s3.connection import S3Connection
+    from filechunkio import FileChunkIO
+    multipart_capable = True
+    usage_flag_multipart_capable = """ [--multipart]"""
+    usage_string_multipart_capable = """
+    multipart - Upload files as multiple parts. This needs filechunkio.
+                Requires ListBucket, ListMultipartUploadParts,
+                ListBucketMultipartUploads and PutObject permissions."""
+except ImportError as err:
+    multipart_capable = False
+    usage_flag_multipart_capable = ""
+    if six.PY2:
+        attribute = 'message'
+    else:
+        attribute = 'msg'
+    usage_string_multipart_capable = '\n\n     "' + \
+        getattr(err, attribute)[len('No module named '):] + \
+        '" is missing for multipart support '
+
+
+DEFAULT_REGION = 'us-east-1'
+
+usage_string = """
+SYNOPSIS
+    s3put [-a/--access_key <access_key>] [-s/--secret_key <secret_key>]
+          -b/--bucket <bucket_name> [-c/--callback <num_cb>]
+          [-d/--debug <debug_level>] [-i/--ignore <ignore_dirs>]
+          [-n/--no_op] [-p/--prefix <prefix>] [-k/--key_prefix <key_prefix>]
+          [-q/--quiet] [-g/--grant grant] [-w/--no_overwrite] [-r/--reduced]
+          [--header] [--region <name>] [--host <s3_host>]""" + \
+    usage_flag_multipart_capable + """ path [path...]
+
+    Where
+        access_key - Your AWS Access Key ID.  If not supplied, boto will
+                     use the value of the environment variable
+                     AWS_ACCESS_KEY_ID
+        secret_key - Your AWS Secret Access Key.  If not supplied, boto
+                     will use the value of the environment variable
+                     AWS_SECRET_ACCESS_KEY
+        bucket_name - The name of the S3 bucket the file(s) should be
+                      copied to.
+        path - A path to a directory or file that represents the items
+               to be uploaded.  If the path points to an individual file,
+               that file will be uploaded to the specified bucket.  If the
+               path points to a directory, it will recursively traverse
+               the directory and upload all files to the specified bucket.
+        debug_level - 0 means no debug output (default), 1 means normal
+                      debug output from boto, and 2 means boto debug output
+                      plus request/response output from httplib
+        ignore_dirs - a comma-separated list of directory names that will
+                      be ignored and not uploaded to S3.
+        num_cb - The number of progress callbacks to display.  The default
+                 is zero which means no callbacks.  If you supplied a value
+                 of "-c 10" for example, the progress callback would be
+                 called 10 times for each file transferred.
+        prefix - A file path prefix that will be stripped from the full
+                 path of the file when determining the key name in S3.
+                 For example, if the full path of a file is:
+                     /home/foo/bar/fie.baz
+                 and the prefix is specified as "-p /home/foo/" the
+                 resulting key name in S3 will be:
+                     /bar/fie.baz
+                 The prefix must end in a trailing separator and if it
+                 does not then one will be added.
+        key_prefix - A prefix to be added to the S3 key name, after any
+                     stripping of the file path is done based on the
+                     "-p/--prefix" option.
+        reduced - Use Reduced Redundancy storage
+        grant - A canned ACL policy that will be granted on each file
+                transferred to S3.  The value of provided must be one
+                of the "canned" ACL policies supported by S3:
+                private|public-read|public-read-write|authenticated-read
+        no_overwrite - No files will be overwritten on S3, if the file/key
+                       exists on s3 it will be kept. This is useful for
+                       resuming interrupted transfers. Note this is not a
+                       sync, even if the file has been updated locally if
+                       the key exists on s3 the file on s3 will not be
+                       updated.
+        header - key=value pairs of extra header(s) to pass along in the
+                 request
+        region - Manually set a region for buckets that are not in the US
+                 classic region. Normally the region is autodetected, but
+                 setting this yourself is more efficient.
+        host - Hostname override, for using an endpoint other then AWS S3
+""" + usage_string_multipart_capable + """
+
+
+     If the -n option is provided, no files will be transferred to S3 but
+     informational messages will be printed about what would happen.
+"""
+
+
+def usage(status=1):
+    print(usage_string)
+    sys.exit(status)
+
+
+def submit_cb(bytes_so_far, total_bytes):
+    print('%d bytes transferred / %d bytes total' % (bytes_so_far, total_bytes))
+
+
+def get_key_name(fullpath, prefix, key_prefix):
+    if fullpath.startswith(prefix):
+        key_name = fullpath[len(prefix):]
+    else:
+        key_name = fullpath
+    l = key_name.split(os.sep)
+    return key_prefix + '/'.join(l)
+
+
+def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
+                 source_path, offset, bytes, debug, cb, num_cb,
+                 amount_of_retries=10):
+    """
+    Uploads a part with retries.
+    """
+    if debug == 1:
+        print("_upload_part(%s, %s, %s)" % (source_path, offset, bytes))
+
+    def _upload(retries_left=amount_of_retries):
+        try:
+            if debug == 1:
+                print('Start uploading part #%d ...' % part_num)
+            conn = S3Connection(aws_key, aws_secret)
+            conn.debug = debug
+            bucket = conn.get_bucket(bucketname)
+            for mp in bucket.get_all_multipart_uploads():
+                if mp.id == multipart_id:
+                    with FileChunkIO(source_path, 'r', offset=offset,
+                                     bytes=bytes) as fp:
+                        mp.upload_part_from_file(fp=fp, part_num=part_num,
+                                                 cb=cb, num_cb=num_cb)
+                    break
+        except Exception as exc:
+            if retries_left:
+                _upload(retries_left=retries_left - 1)
+            else:
+                print('Failed uploading part #%d' % part_num)
+                raise exc
+        else:
+            if debug == 1:
+                print('... Uploaded part #%d' % part_num)
+
+    _upload()
+
+def check_valid_region(conn, region):
+    if conn is None:
+        print('Invalid region (%s)' % region)
+        sys.exit(1)
+
+def multipart_upload(bucketname, aws_key, aws_secret, source_path, keyname,
+                     reduced, debug, cb, num_cb, acl='private', headers={},
+                     guess_mimetype=True, parallel_processes=4,
+                     region=DEFAULT_REGION):
+    """
+    Parallel multipart upload.
+    """
+    conn = boto.s3.connect_to_region(region, aws_access_key_id=aws_key,
+                                     aws_secret_access_key=aws_secret)
+    check_valid_region(conn, region)
+    conn.debug = debug
+    bucket = conn.get_bucket(bucketname)
+
+    if guess_mimetype:
+        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
+        headers.update({'Content-Type': mtype})
+
+    mp = bucket.initiate_multipart_upload(keyname, headers=headers,
+                                          reduced_redundancy=reduced)
+
+    source_size = os.stat(source_path).st_size
+    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)),
+                          5242880)
+    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))
+
+    pool = Pool(processes=parallel_processes)
+    for i in range(chunk_amount):
+        offset = i * bytes_per_chunk
+        remaining_bytes = source_size - offset
+        bytes = min([bytes_per_chunk, remaining_bytes])
+        part_num = i + 1
+        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret, mp.id,
+                                        part_num, source_path, offset, bytes,
+                                        debug, cb, num_cb])
+    pool.close()
+    pool.join()
+
+    if len(mp.get_all_parts()) == chunk_amount:
+        mp.complete_upload()
+        key = bucket.get_key(keyname)
+        key.set_acl(acl)
+    else:
+        mp.cancel_upload()
+
+
+def singlepart_upload(bucket, key_name, fullpath, *kargs, **kwargs):
+    """
+    Single upload.
+    """
+    k = bucket.new_key(key_name)
+    k.set_contents_from_filename(fullpath, *kargs, **kwargs)
+
+
+def expand_path(path):
+    path = os.path.expanduser(path)
+    path = os.path.expandvars(path)
+    return os.path.abspath(path)
+
+
+def main():
+
+    # default values
+    aws_access_key_id = None
+    aws_secret_access_key = None
+    bucket_name = ''
+    ignore_dirs = []
+    debug = 0
+    cb = None
+    num_cb = 0
+    quiet = False
+    no_op = False
+    prefix = '/'
+    key_prefix = ''
+    grant = None
+    no_overwrite = False
+    reduced = False
+    headers = {}
+    host = None
+    multipart_requested = False
+    region = None
+
+    try:
+        opts, args = getopt.getopt(
+            sys.argv[1:], 'a:b:c::d:g:hi:k:np:qs:wr',
+            ['access_key=', 'bucket=', 'callback=', 'debug=', 'help', 'grant=',
+             'ignore=', 'key_prefix=', 'no_op', 'prefix=', 'quiet',
+             'secret_key=', 'no_overwrite', 'reduced', 'header=', 'multipart',
+             'host=', 'region='])
+    except:
+        usage(1)
+
+    # parse opts
+    for o, a in opts:
+        if o in ('-h', '--help'):
+            usage(0)
+        if o in ('-a', '--access_key'):
+            aws_access_key_id = a
+        if o in ('-b', '--bucket'):
+            bucket_name = a
+        if o in ('-c', '--callback'):
+            num_cb = int(a)
+            cb = submit_cb
+        if o in ('-d', '--debug'):
+            debug = int(a)
+        if o in ('-g', '--grant'):
+            grant = a
+        if o in ('-i', '--ignore'):
+            ignore_dirs = a.split(',')
+        if o in ('-n', '--no_op'):
+            no_op = True
+        if o in ('-w', '--no_overwrite'):
+            no_overwrite = True
+        if o in ('-p', '--prefix'):
+            prefix = a
+            if prefix[-1] != os.sep:
+                prefix = prefix + os.sep
+            prefix = expand_path(prefix)
+        if o in ('-k', '--key_prefix'):
+            key_prefix = a
+        if o in ('-q', '--quiet'):
+            quiet = True
+        if o in ('-s', '--secret_key'):
+            aws_secret_access_key = a
+        if o in ('-r', '--reduced'):
+            reduced = True
+        if o == '--header':
+            (k, v) = a.split("=", 1)
+            headers[k] = v
+        if o == '--host':
+            host = a
+        if o == '--multipart':
+            if multipart_capable:
+                multipart_requested = True
+            else:
+                print("multipart upload requested but not capable")
+                sys.exit(4)
+        if o == '--region':
+            regions = boto.s3.regions()
+            for region_info in regions:
+                if region_info.name == a:
+                    region = a
+                    break
+            else:
+                raise ValueError('Invalid region %s specified' % a)
+
+    if len(args) < 1:
+        usage(2)
+
+    if not bucket_name:
+        print("bucket name is required!")
+        usage(3)
+
+    connect_args = {
+        'aws_access_key_id': aws_access_key_id,
+        'aws_secret_access_key': aws_secret_access_key
+    }
+
+    if host:
+        connect_args['host'] = host
+
+    c = boto.s3.connect_to_region(region or DEFAULT_REGION, **connect_args)
+    check_valid_region(c, region or DEFAULT_REGION)
+    c.debug = debug
+    b = c.get_bucket(bucket_name, validate=False)
+
+    # Attempt to determine location and warn if no --host or --region
+    # arguments were passed. Then try to automagically figure out
+    # what should have been passed and fix it.
+    if host is None and region is None:
+        try:
+            location = b.get_location()
+
+            # Classic region will be '', any other will have a name
+            if location:
+                print('Bucket exists in %s but no host or region given!' % location)
+
+                # Override for EU, which is really Ireland according to the docs
+                if location == 'EU':
+                    location = 'eu-west-1'
+
+                print('Automatically setting region to %s' % location)
+
+                # Here we create a new connection, and then take the existing
+                # bucket and set it to use the new connection
+                c = boto.s3.connect_to_region(location, **connect_args)
+                c.debug = debug
+                b.connection = c
+        except Exception as e:
+            if debug > 0:
+                print(e)
+            print('Could not get bucket region info, skipping...')
+
+    existing_keys_to_check_against = []
+    files_to_check_for_upload = []
+
+    for path in args:
+        path = expand_path(path)
+        # upload a directory of files recursively
+        if os.path.isdir(path):
+            if no_overwrite:
+                if not quiet:
+                    print('Getting list of existing keys to check against')
+                for key in b.list(get_key_name(path, prefix, key_prefix)):
+                    existing_keys_to_check_against.append(key.name)
+            for root, dirs, files in os.walk(path):
+                for ignore in ignore_dirs:
+                    if ignore in dirs:
+                        dirs.remove(ignore)
+                for path in files:
+                    if path.startswith("."):
+                        continue
+                    files_to_check_for_upload.append(os.path.join(root, path))
+
+        # upload a single file
+        elif os.path.isfile(path):
+            fullpath = os.path.abspath(path)
+            key_name = get_key_name(fullpath, prefix, key_prefix)
+            files_to_check_for_upload.append(fullpath)
+            existing_keys_to_check_against.append(key_name)
+
+        # we are trying to upload something unknown
+        else:
+            print("I don't know what %s is, so i can't upload it" % path)
+
+    for fullpath in files_to_check_for_upload:
+        key_name = get_key_name(fullpath, prefix, key_prefix)
+
+        if no_overwrite and key_name in existing_keys_to_check_against:
+            if b.get_key(key_name):
+                if not quiet:
+                    print('Skipping %s as it exists in s3' % fullpath)
+                continue
+
+        if not quiet:
+            print('Copying %s to %s/%s' % (fullpath, bucket_name, key_name))
+
+        if not no_op:
+            # 0-byte files don't work and also don't need multipart upload
+            if os.stat(fullpath).st_size != 0 and multipart_capable and \
+                    multipart_requested:
+                multipart_upload(bucket_name, aws_access_key_id,
+                                 aws_secret_access_key, fullpath, key_name,
+                                 reduced, debug, cb, num_cb,
+                                 grant or 'private', headers,
+                                 region=region or DEFAULT_REGION)
+            else:
+                singlepart_upload(b, key_name, fullpath, cb=cb, num_cb=num_cb,
+                                  policy=grant, reduced_redundancy=reduced,
+                                  headers=headers)
+
+if __name__ == "__main__":
+    main()
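
The multipart path above sizes its parts as bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)), 5242880), i.e. the part size grows with the square root of the file size but never drops below the 5 MiB minimum S3 accepts for a part. A minimal standalone sketch of that arithmetic follows; it is not part of the diff, and the helper name plan_chunks is made up for illustration.

# Sketch of the chunking arithmetic used by multipart_upload() above.
# The helper name plan_chunks is hypothetical, not part of s3put.
import math

MIN_CHUNK = 5242880  # 5 MiB floor used by s3put


def plan_chunks(source_size):
    """Return (bytes_per_chunk, chunk_amount) as multipart_upload() computes them."""
    bytes_per_chunk = max(int(math.sqrt(MIN_CHUNK) * math.sqrt(source_size)),
                          MIN_CHUNK)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))
    return bytes_per_chunk, chunk_amount


if __name__ == "__main__":
    # For a 1 GiB file this gives parts of roughly 75 MB, 15 parts in total.
    print(plan_chunks(1024 ** 3))

Because the part size scales with the square root of the file size, both the part size and the part count grow sub-linearly, so very large files do not produce an excessive number of parts.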
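For key naming, get_key_name() strips the -p/--prefix value from the local path, joins the remaining components with '/', and prepends the -k/--key_prefix value. Note that main() first passes the prefix through expand_path(), whose os.path.abspath() drops the trailing separator, which is why the usage text's example result keeps a leading '/'. A small illustration follows (the example paths and the 'backups' key prefix are invented; outputs assume a POSIX os.sep):

# Mirrors the get_key_name() logic from the diff above; example values are made up.
import os


def get_key_name(fullpath, prefix, key_prefix):
    # Strip the local prefix, normalise separators to '/', prepend the key prefix.
    if fullpath.startswith(prefix):
        key_name = fullpath[len(prefix):]
    else:
        key_name = fullpath
    parts = key_name.split(os.sep)
    return key_prefix + '/'.join(parts)


print(get_key_name('/home/foo/bar/fie.baz', '/home/foo', ''))         # /bar/fie.baz
print(get_key_name('/home/foo/bar/fie.baz', '/home/foo', 'backups'))  # backups/bar/fie.baz

So an invocation along the lines of s3put -b <bucket> -p /home/foo/ -k backups /home/foo/bar (bucket name and paths hypothetical) would recursively upload the directory with every key carrying the backups prefix.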