comparison env/bin/s3put @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
#!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3
# Copyright (c) 2006,2007,2008 Mitch Garnaat http://garnaat.org/
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import getopt
import sys
import os
import boto

from boto.compat import six

try:
    # multipart portions copyright Fabian Topfstedt
    # https://gist.github.com/924094

    import math
    import mimetypes
    from multiprocessing import Pool
    from boto.s3.connection import S3Connection
    from filechunkio import FileChunkIO
    multipart_capable = True
    usage_flag_multipart_capable = """ [--multipart]"""
    usage_string_multipart_capable = """
        multipart - Upload files as multiple parts. This needs filechunkio.
                    Requires ListBucket, ListMultipartUploadParts,
                    ListBucketMultipartUploads and PutObject permissions."""
except ImportError as err:
    multipart_capable = False
    usage_flag_multipart_capable = ""
    if six.PY2:
        attribute = 'message'
    else:
        attribute = 'msg'
    usage_string_multipart_capable = '\n\n "' + \
        getattr(err, attribute)[len('No module named '):] + \
        '" is missing for multipart support '


DEFAULT_REGION = 'us-east-1'

usage_string = """
SYNOPSIS
    s3put [-a/--access_key <access_key>] [-s/--secret_key <secret_key>]
          -b/--bucket <bucket_name> [-c/--callback <num_cb>]
          [-d/--debug <debug_level>] [-i/--ignore <ignore_dirs>]
          [-n/--no_op] [-p/--prefix <prefix>] [-k/--key_prefix <key_prefix>]
          [-q/--quiet] [-g/--grant grant] [-w/--no_overwrite] [-r/--reduced]
          [--header] [--region <name>] [--host <s3_host>]""" + \
    usage_flag_multipart_capable + """ path [path...]

    Where
        access_key - Your AWS Access Key ID. If not supplied, boto will
                     use the value of the environment variable
                     AWS_ACCESS_KEY_ID
        secret_key - Your AWS Secret Access Key. If not supplied, boto
                     will use the value of the environment variable
                     AWS_SECRET_ACCESS_KEY
        bucket_name - The name of the S3 bucket the file(s) should be
                      copied to.
        path - A path to a directory or file that represents the items
               to be uploaded. If the path points to an individual file,
               that file will be uploaded to the specified bucket. If the
               path points to a directory, it will recursively traverse
               the directory and upload all files to the specified bucket.
        debug_level - 0 means no debug output (default), 1 means normal
                      debug output from boto, and 2 means boto debug output
                      plus request/response output from httplib
        ignore_dirs - a comma-separated list of directory names that will
                      be ignored and not uploaded to S3.
        num_cb - The number of progress callbacks to display. The default
                 is zero which means no callbacks. If you supplied a value
                 of "-c 10" for example, the progress callback would be
                 called 10 times for each file transferred.
        prefix - A file path prefix that will be stripped from the full
                 path of the file when determining the key name in S3.
                 For example, if the full path of a file is:
                     /home/foo/bar/fie.baz
                 and the prefix is specified as "-p /home/foo/" the
                 resulting key name in S3 will be:
                     /bar/fie.baz
                 The prefix must end in a trailing separator and if it
                 does not then one will be added.
        key_prefix - A prefix to be added to the S3 key name, after any
                     stripping of the file path is done based on the
                     "-p/--prefix" option.
        reduced - Use Reduced Redundancy storage
        grant - A canned ACL policy that will be granted on each file
                transferred to S3. The value provided must be one
                of the "canned" ACL policies supported by S3:
                private|public-read|public-read-write|authenticated-read
        no_overwrite - No files will be overwritten on S3; if the file/key
                       exists on S3 it will be kept. This is useful for
                       resuming interrupted transfers. Note this is not a
                       sync: even if the file has been updated locally, if
                       the key exists on S3 the file on S3 will not be
                       updated.
        header - key=value pairs of extra header(s) to pass along in the
                 request
        region - Manually set a region for buckets that are not in the US
                 classic region. Normally the region is autodetected, but
                 setting this yourself is more efficient.
        host - Hostname override, for using an endpoint other than AWS S3
""" + usage_string_multipart_capable + """


    If the -n option is provided, no files will be transferred to S3 but
    informational messages will be printed about what would happen.
"""


def usage(status=1):
    print(usage_string)
    sys.exit(status)


def submit_cb(bytes_so_far, total_bytes):
    print('%d bytes transferred / %d bytes total' % (bytes_so_far, total_bytes))

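# For illustration, matching the usage text above: once expand_path() has
# reduced the prefix to '/home/foo', get_key_name('/home/foo/bar/fie.baz',
# '/home/foo', '') returns '/bar/fie.baz'.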
def get_key_name(fullpath, prefix, key_prefix):
    if fullpath.startswith(prefix):
        key_name = fullpath[len(prefix):]
    else:
        key_name = fullpath
    # Convert OS-specific path separators to '/' for the S3 key name.
    parts = key_name.split(os.sep)
    return key_prefix + '/'.join(parts)

def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
                 source_path, offset, bytes, debug, cb, num_cb,
                 amount_of_retries=10):
    """
    Uploads a part with retries.
    """
    if debug == 1:
        print("_upload_part(%s, %s, %s)" % (source_path, offset, bytes))

    def _upload(retries_left=amount_of_retries):
        try:
            if debug == 1:
                print('Start uploading part #%d ...' % part_num)
            conn = S3Connection(aws_key, aws_secret)
            conn.debug = debug
            bucket = conn.get_bucket(bucketname)
            for mp in bucket.get_all_multipart_uploads():
                if mp.id == multipart_id:
                    with FileChunkIO(source_path, 'r', offset=offset,
                                     bytes=bytes) as fp:
                        mp.upload_part_from_file(fp=fp, part_num=part_num,
                                                 cb=cb, num_cb=num_cb)
                    break
        except Exception as exc:
            if retries_left:
                _upload(retries_left=retries_left - 1)
            else:
                print('Failed uploading part #%d' % part_num)
                raise exc
        else:
            if debug == 1:
                print('... Uploaded part #%d' % part_num)

    _upload()


def check_valid_region(conn, region):
    if conn is None:
        print('Invalid region (%s)' % region)
        sys.exit(1)

def multipart_upload(bucketname, aws_key, aws_secret, source_path, keyname,
                     reduced, debug, cb, num_cb, acl='private', headers=None,
                     guess_mimetype=True, parallel_processes=4,
                     region=DEFAULT_REGION):
    """
    Parallel multipart upload.
    """
    # Avoid a shared mutable default argument for the headers dict.
    if headers is None:
        headers = {}
    conn = boto.s3.connect_to_region(region, aws_access_key_id=aws_key,
                                     aws_secret_access_key=aws_secret)
    check_valid_region(conn, region)
    conn.debug = debug
    bucket = conn.get_bucket(bucketname)

    if guess_mimetype:
        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
        headers.update({'Content-Type': mtype})

    mp = bucket.initiate_multipart_upload(keyname, headers=headers,
                                          reduced_redundancy=reduced)

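    # Part-size heuristic: grow with the square root of the file size so
    # large files use fewer, larger parts, but never drop below 5242880
    # bytes (5 MB), the smallest part size S3 accepts for any part except
    # the last.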
    source_size = os.stat(source_path).st_size
    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)),
                          5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))

    pool = Pool(processes=parallel_processes)
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret,
                                        mp.id, part_num, source_path, offset,
                                        bytes, debug, cb, num_cb])
    pool.close()
    pool.join()

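    # Only complete the upload if every part made it; otherwise cancel it,
    # since abandoned multipart uploads continue to accrue storage charges
    # until they are aborted.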
    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
        key = bucket.get_key(keyname)
        key.set_acl(acl)
    else:
        mp.cancel_upload()


def singlepart_upload(bucket, key_name, fullpath, *args, **kwargs):
    """
    Single upload.
    """
    k = bucket.new_key(key_name)
    k.set_contents_from_filename(fullpath, *args, **kwargs)


def expand_path(path):
    path = os.path.expanduser(path)
    path = os.path.expandvars(path)
    return os.path.abspath(path)

def main():

    # default values
    aws_access_key_id = None
    aws_secret_access_key = None
    bucket_name = ''
    ignore_dirs = []
    debug = 0
    cb = None
    num_cb = 0
    quiet = False
    no_op = False
    prefix = '/'
    key_prefix = ''
    grant = None
    no_overwrite = False
    reduced = False
    headers = {}
    host = None
    multipart_requested = False
    region = None

    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'a:b:c:d:g:hi:k:np:qs:wr',
            ['access_key=', 'bucket=', 'callback=', 'debug=', 'help', 'grant=',
             'ignore=', 'key_prefix=', 'no_op', 'prefix=', 'quiet',
             'secret_key=', 'no_overwrite', 'reduced', 'header=', 'multipart',
             'host=', 'region='])
    except getopt.GetoptError:
        usage(1)

    # parse opts
    for o, a in opts:
        if o in ('-h', '--help'):
            usage(0)
        if o in ('-a', '--access_key'):
            aws_access_key_id = a
        if o in ('-b', '--bucket'):
            bucket_name = a
        if o in ('-c', '--callback'):
            num_cb = int(a)
            cb = submit_cb
        if o in ('-d', '--debug'):
            debug = int(a)
        if o in ('-g', '--grant'):
            grant = a
        if o in ('-i', '--ignore'):
            ignore_dirs = a.split(',')
        if o in ('-n', '--no_op'):
            no_op = True
        if o in ('-w', '--no_overwrite'):
            no_overwrite = True
        if o in ('-p', '--prefix'):
            prefix = a
            if prefix[-1] != os.sep:
                prefix = prefix + os.sep
            prefix = expand_path(prefix)
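            # Note: expand_path() ends in os.path.abspath(), which strips
            # the trailing separator again, so keys derived below keep a
            # leading '/' (as in the usage example above).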
        if o in ('-k', '--key_prefix'):
            key_prefix = a
        if o in ('-q', '--quiet'):
            quiet = True
        if o in ('-s', '--secret_key'):
            aws_secret_access_key = a
        if o in ('-r', '--reduced'):
            reduced = True
        if o == '--header':
            (k, v) = a.split("=", 1)
            headers[k] = v
        if o == '--host':
            host = a
        if o == '--multipart':
            if multipart_capable:
                multipart_requested = True
            else:
                print("multipart upload requested but not capable")
                sys.exit(4)
        if o == '--region':
            regions = boto.s3.regions()
            for region_info in regions:
                if region_info.name == a:
                    region = a
                    break
            else:
                raise ValueError('Invalid region %s specified' % a)

    if len(args) < 1:
        usage(2)

    if not bucket_name:
        print("bucket name is required!")
        usage(3)

    connect_args = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key
    }

    if host:
        connect_args['host'] = host

    c = boto.s3.connect_to_region(region or DEFAULT_REGION, **connect_args)
    check_valid_region(c, region or DEFAULT_REGION)
    c.debug = debug
    b = c.get_bucket(bucket_name, validate=False)

    # Attempt to determine location and warn if no --host or --region
    # arguments were passed. Then try to automagically figure out
    # what should have been passed and fix it.
    if host is None and region is None:
        try:
            location = b.get_location()

            # Classic region will be '', any other will have a name
            if location:
                print('Bucket exists in %s but no host or region given!' % location)

                # Override for EU, which is really Ireland according to the docs
                if location == 'EU':
                    location = 'eu-west-1'

                print('Automatically setting region to %s' % location)

                # Here we create a new connection, and then take the existing
                # bucket and set it to use the new connection
                c = boto.s3.connect_to_region(location, **connect_args)
                c.debug = debug
                b.connection = c
        except Exception as e:
            if debug > 0:
                print(e)
            print('Could not get bucket region info, skipping...')

    existing_keys_to_check_against = []
    files_to_check_for_upload = []

    for path in args:
        path = expand_path(path)
        # upload a directory of files recursively
        if os.path.isdir(path):
            if no_overwrite:
                if not quiet:
                    print('Getting list of existing keys to check against')
                for key in b.list(get_key_name(path, prefix, key_prefix)):
                    existing_keys_to_check_against.append(key.name)
            for root, dirs, files in os.walk(path):
                for ignore in ignore_dirs:
                    if ignore in dirs:
                        dirs.remove(ignore)
                for fname in files:
                    # skip hidden files
                    if fname.startswith("."):
                        continue
                    files_to_check_for_upload.append(os.path.join(root, fname))

        # upload a single file
        elif os.path.isfile(path):
            fullpath = os.path.abspath(path)
            key_name = get_key_name(fullpath, prefix, key_prefix)
            files_to_check_for_upload.append(fullpath)
            existing_keys_to_check_against.append(key_name)

        # we are trying to upload something unknown
        else:
            print("I don't know what %s is, so I can't upload it" % path)

    for fullpath in files_to_check_for_upload:
        key_name = get_key_name(fullpath, prefix, key_prefix)

        if no_overwrite and key_name in existing_keys_to_check_against:
            if b.get_key(key_name):
                if not quiet:
                    print('Skipping %s as it exists in S3' % fullpath)
                continue

        if not quiet:
            print('Copying %s to %s/%s' % (fullpath, bucket_name, key_name))

        if not no_op:
            # 0-byte files don't work and also don't need multipart upload
            if os.stat(fullpath).st_size != 0 and multipart_capable and \
                    multipart_requested:
                multipart_upload(bucket_name, aws_access_key_id,
                                 aws_secret_access_key, fullpath, key_name,
                                 reduced, debug, cb, num_cb,
                                 grant or 'private', headers,
                                 region=region or DEFAULT_REGION)
            else:
                singlepart_upload(b, key_name, fullpath, cb=cb, num_cb=num_cb,
                                  policy=grant, reduced_redundancy=reduced,
                                  headers=headers)


if __name__ == "__main__":
    main()