env/bin/s3put @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"

author: shellac
date:   Sat, 02 May 2020 07:14:21 -0400
comparison: -1:000000000000 → 0:26e78fe6e8c4 (file added)
#!/Users/pldms/Development/Projects/2020/david-matthews-galaxy/guppy_basecaller/env/bin/python3
# Copyright (c) 2006,2007,2008 Mitch Garnaat http://garnaat.org/
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish, dis-
# tribute, sublicense, and/or sell copies of the Software, and to permit
# persons to whom the Software is furnished to do so, subject to the fol-
# lowing conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
# IN THE SOFTWARE.
#
import getopt
import sys
import os
import boto

from boto.compat import six

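# Multipart support is probed at import time: if filechunkio (or any other
# import in the try block) is missing, the script falls back to single-part
# uploads and works the missing module's name into the usage text.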
try:
    # multipart portions copyright Fabian Topfstedt
    # https://gist.github.com/924094

    import math
    import mimetypes
    from multiprocessing import Pool
    from boto.s3.connection import S3Connection
    from filechunkio import FileChunkIO
    multipart_capable = True
    usage_flag_multipart_capable = """ [--multipart]"""
    usage_string_multipart_capable = """
    multipart - Upload files as multiple parts. This needs filechunkio.
                Requires ListBucket, ListMultipartUploadParts,
                ListBucketMultipartUploads and PutObject permissions."""
except ImportError as err:
    multipart_capable = False
    usage_flag_multipart_capable = ""
    if six.PY2:
        attribute = 'message'
    else:
        attribute = 'msg'
    usage_string_multipart_capable = '\n\n "' + \
        getattr(err, attribute)[len('No module named '):] + \
        '" is missing for multipart support '


DEFAULT_REGION = 'us-east-1'

usage_string = """
SYNOPSIS
    s3put [-a/--access_key <access_key>] [-s/--secret_key <secret_key>]
          -b/--bucket <bucket_name> [-c/--callback <num_cb>]
          [-d/--debug <debug_level>] [-i/--ignore <ignore_dirs>]
          [-n/--no_op] [-p/--prefix <prefix>] [-k/--key_prefix <key_prefix>]
          [-q/--quiet] [-g/--grant grant] [-w/--no_overwrite] [-r/--reduced]
          [--header] [--region <name>] [--host <s3_host>]""" + \
    usage_flag_multipart_capable + """ path [path...]

Where
    access_key - Your AWS Access Key ID. If not supplied, boto will
                 use the value of the environment variable
                 AWS_ACCESS_KEY_ID
    secret_key - Your AWS Secret Access Key. If not supplied, boto
                 will use the value of the environment variable
                 AWS_SECRET_ACCESS_KEY
    bucket_name - The name of the S3 bucket the file(s) should be
                  copied to.
    path - A path to a directory or file that represents the items
           to be uploaded. If the path points to an individual file,
           that file will be uploaded to the specified bucket. If the
           path points to a directory, it will recursively traverse
           the directory and upload all files to the specified bucket.
    debug_level - 0 means no debug output (default), 1 means normal
                  debug output from boto, and 2 means boto debug output
                  plus request/response output from httplib
    ignore_dirs - a comma-separated list of directory names that will
                  be ignored and not uploaded to S3.
    num_cb - The number of progress callbacks to display. The default
             is zero which means no callbacks. If you supplied a value
             of "-c 10" for example, the progress callback would be
             called 10 times for each file transferred.
    prefix - A file path prefix that will be stripped from the full
             path of the file when determining the key name in S3.
             For example, if the full path of a file is:
                 /home/foo/bar/fie.baz
             and the prefix is specified as "-p /home/foo/" the
             resulting key name in S3 will be:
                 /bar/fie.baz
             The prefix must end in a trailing separator; if it
             does not, one will be added.
    key_prefix - A prefix to be added to the S3 key name, after any
                 stripping of the file path is done based on the
                 "-p/--prefix" option.
    reduced - Use Reduced Redundancy storage
    grant - A canned ACL policy that will be granted on each file
            transferred to S3. The value provided must be one
            of the "canned" ACL policies supported by S3:
            private|public-read|public-read-write|authenticated-read
    no_overwrite - No files will be overwritten on S3; if the file/key
                   already exists on S3, it will be kept. This is useful
                   for resuming interrupted transfers. Note that this is
                   not a sync: even if the file has been updated locally,
                   the file on S3 will not be updated while the key
                   exists.
    header - key=value pairs of extra header(s) to pass along in the
             request
    region - Manually set a region for buckets that are not in the US
             classic region. Normally the region is autodetected, but
             setting this yourself is more efficient.
    host - Hostname override, for using an endpoint other than AWS S3
121 """ + usage_string_multipart_capable + """ | |
122 | |
123 | |
124 If the -n option is provided, no files will be transferred to S3 but | |
125 informational messages will be printed about what would happen. | |
126 """ | |


def usage(status=1):
    print(usage_string)
    sys.exit(status)


def submit_cb(bytes_so_far, total_bytes):
    print('%d bytes transferred / %d bytes total' %
          (bytes_so_far, total_bytes))


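# Map a local file path to an S3 key: strip the --prefix, convert the
# remaining components from os.sep to '/', and prepend any --key_prefix.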
def get_key_name(fullpath, prefix, key_prefix):
    if fullpath.startswith(prefix):
        key_name = fullpath[len(prefix):]
    else:
        key_name = fullpath
    parts = key_name.split(os.sep)
    return key_prefix + '/'.join(parts)


def _upload_part(bucketname, aws_key, aws_secret, multipart_id, part_num,
                 source_path, offset, bytes, debug, cb, num_cb,
                 amount_of_retries=10):
    """
    Uploads a part with retries.
    """
    if debug == 1:
        print("_upload_part(%s, %s, %s)" % (source_path, offset, bytes))

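    # Retry by recursion: on any exception, re-enter _upload with one
    # fewer retry left, and re-raise once the retry budget is exhausted.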
    def _upload(retries_left=amount_of_retries):
        try:
            if debug == 1:
                print('Start uploading part #%d ...' % part_num)
            conn = S3Connection(aws_key, aws_secret)
            conn.debug = debug
            bucket = conn.get_bucket(bucketname)
            for mp in bucket.get_all_multipart_uploads():
                if mp.id == multipart_id:
                    with FileChunkIO(source_path, 'r', offset=offset,
                                     bytes=bytes) as fp:
                        mp.upload_part_from_file(fp=fp, part_num=part_num,
                                                 cb=cb, num_cb=num_cb)
                    break
        except Exception as exc:
            if retries_left:
                _upload(retries_left=retries_left - 1)
            else:
                print('Failed uploading part #%d' % part_num)
                raise exc
        else:
            if debug == 1:
                print('... Uploaded part #%d' % part_num)

    _upload()


def check_valid_region(conn, region):
    if conn is None:
        print('Invalid region (%s)' % region)
        sys.exit(1)


def multipart_upload(bucketname, aws_key, aws_secret, source_path, keyname,
                     reduced, debug, cb, num_cb, acl='private', headers=None,
                     guess_mimetype=True, parallel_processes=4,
                     region=DEFAULT_REGION):
    """
    Parallel multipart upload.
    """
    # Copy the caller's headers rather than using a mutable {} default:
    # this avoids the shared-default pitfall and keeps the per-file
    # Content-Type set below from leaking back into the caller's dict.
    headers = dict(headers) if headers else {}
    conn = boto.s3.connect_to_region(region, aws_access_key_id=aws_key,
                                     aws_secret_access_key=aws_secret)
    check_valid_region(conn, region)
    conn.debug = debug
    bucket = conn.get_bucket(bucketname)

    if guess_mimetype:
        mtype = mimetypes.guess_type(keyname)[0] or 'application/octet-stream'
        headers.update({'Content-Type': mtype})

    mp = bucket.initiate_multipart_upload(keyname, headers=headers,
                                          reduced_redundancy=reduced)

    source_size = os.stat(source_path).st_size
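    # Part size is the geometric mean of 5 MiB (5242880 bytes, S3's minimum
    # part size) and the file size, floored at 5 MiB, so the part count
    # grows as roughly sqrt(source_size / 5 MiB).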
    bytes_per_chunk = max(int(math.sqrt(5242880) * math.sqrt(source_size)),
                          5242880)
    chunk_amount = int(math.ceil(source_size / float(bytes_per_chunk)))

    pool = Pool(processes=parallel_processes)
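    # Fan the chunks out to the worker pool: offsets partition the file
    # without overlap, and the final chunk may be shorter than the rest.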
    for i in range(chunk_amount):
        offset = i * bytes_per_chunk
        remaining_bytes = source_size - offset
        bytes = min([bytes_per_chunk, remaining_bytes])
        part_num = i + 1
        pool.apply_async(_upload_part, [bucketname, aws_key, aws_secret,
                                        mp.id, part_num, source_path, offset,
                                        bytes, debug, cb, num_cb])
    pool.close()
    pool.join()

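    # Complete only if every part arrived; otherwise cancel the upload so
    # S3 does not keep (and bill for) the orphaned parts.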
    if len(mp.get_all_parts()) == chunk_amount:
        mp.complete_upload()
        key = bucket.get_key(keyname)
        key.set_acl(acl)
    else:
        mp.cancel_upload()


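# Single-request upload: new_key only creates a local Key object; nothing
# is sent until set_contents_from_filename reads and uploads the file.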
def singlepart_upload(bucket, key_name, fullpath, *kargs, **kwargs):
    """
    Single upload.
    """
    k = bucket.new_key(key_name)
    k.set_contents_from_filename(fullpath, *kargs, **kwargs)


def expand_path(path):
    path = os.path.expanduser(path)
    path = os.path.expandvars(path)
    return os.path.abspath(path)


def main():

    # default values
    aws_access_key_id = None
    aws_secret_access_key = None
    bucket_name = ''
    ignore_dirs = []
    debug = 0
    cb = None
    num_cb = 0
    quiet = False
    no_op = False
    prefix = '/'
    key_prefix = ''
    grant = None
    no_overwrite = False
    reduced = False
    headers = {}
    host = None
    multipart_requested = False
    region = None

    try:
        opts, args = getopt.getopt(
            sys.argv[1:], 'a:b:c:d:g:hi:k:np:qs:wr',
            ['access_key=', 'bucket=', 'callback=', 'debug=', 'help',
             'grant=', 'ignore=', 'key_prefix=', 'no_op', 'prefix=', 'quiet',
             'secret_key=', 'no_overwrite', 'reduced', 'header=', 'multipart',
             'host=', 'region='])
    except getopt.GetoptError:
        usage(1)

    # parse opts
    for o, a in opts:
        if o in ('-h', '--help'):
            usage(0)
        if o in ('-a', '--access_key'):
            aws_access_key_id = a
        if o in ('-b', '--bucket'):
            bucket_name = a
        if o in ('-c', '--callback'):
            num_cb = int(a)
            cb = submit_cb
        if o in ('-d', '--debug'):
            debug = int(a)
        if o in ('-g', '--grant'):
            grant = a
        if o in ('-i', '--ignore'):
            ignore_dirs = a.split(',')
        if o in ('-n', '--no_op'):
            no_op = True
        if o in ('-w', '--no_overwrite'):
            no_overwrite = True
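        # Note: expand_path calls os.path.abspath, which strips the trailing
        # separator just appended, so keys derived from the prefix keep a
        # leading '/' (matching the /bar/fie.baz example in the usage text).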
        if o in ('-p', '--prefix'):
            prefix = a
            if prefix[-1] != os.sep:
                prefix = prefix + os.sep
            prefix = expand_path(prefix)
        if o in ('-k', '--key_prefix'):
            key_prefix = a
        if o in ('-q', '--quiet'):
            quiet = True
        if o in ('-s', '--secret_key'):
            aws_secret_access_key = a
        if o in ('-r', '--reduced'):
            reduced = True
        if o == '--header':
            (k, v) = a.split("=", 1)
            headers[k] = v
        if o == '--host':
            host = a
        if o == '--multipart':
            if multipart_capable:
                multipart_requested = True
            else:
                print("multipart upload requested but not capable")
                sys.exit(4)
        if o == '--region':
            regions = boto.s3.regions()
            for region_info in regions:
                if region_info.name == a:
                    region = a
                    break
            else:
                raise ValueError('Invalid region %s specified' % a)

    if len(args) < 1:
        usage(2)

    if not bucket_name:
        print("bucket name is required!")
        usage(3)

    connect_args = {
        'aws_access_key_id': aws_access_key_id,
        'aws_secret_access_key': aws_secret_access_key
    }

    if host:
        connect_args['host'] = host

    c = boto.s3.connect_to_region(region or DEFAULT_REGION, **connect_args)
    check_valid_region(c, region or DEFAULT_REGION)
    c.debug = debug
    b = c.get_bucket(bucket_name, validate=False)

    # Attempt to determine location and warn if no --host or --region
    # arguments were passed. Then try to automagically figure out
    # what should have been passed and fix it.
    if host is None and region is None:
        try:
            location = b.get_location()

            # Classic region will be '', any other will have a name
            if location:
                print('Bucket exists in %s but no host or region given!'
                      % location)

                # Override for EU, which is really Ireland according to
                # the docs
                if location == 'EU':
                    location = 'eu-west-1'

                print('Automatically setting region to %s' % location)

                # Here we create a new connection, and then take the
                # existing bucket and set it to use the new connection
                c = boto.s3.connect_to_region(location, **connect_args)
                c.debug = debug
                b.connection = c
        except Exception as e:
            if debug > 0:
                print(e)
            print('Could not get bucket region info, skipping...')

    existing_keys_to_check_against = []
    files_to_check_for_upload = []

    for path in args:
        path = expand_path(path)
        # upload a directory of files recursively
        if os.path.isdir(path):
            if no_overwrite:
                if not quiet:
                    print('Getting list of existing keys to check against')
                for key in b.list(get_key_name(path, prefix, key_prefix)):
                    existing_keys_to_check_against.append(key.name)
            for root, dirs, files in os.walk(path):
                for ignore in ignore_dirs:
                    if ignore in dirs:
                        dirs.remove(ignore)
                for fname in files:
                    if fname.startswith("."):
                        continue
                    files_to_check_for_upload.append(
                        os.path.join(root, fname))

        # upload a single file
        elif os.path.isfile(path):
            fullpath = os.path.abspath(path)
            key_name = get_key_name(fullpath, prefix, key_prefix)
            files_to_check_for_upload.append(fullpath)
            existing_keys_to_check_against.append(key_name)

        # we are trying to upload something unknown
        else:
            print("I don't know what %s is, so I can't upload it" % path)

    for fullpath in files_to_check_for_upload:
        key_name = get_key_name(fullpath, prefix, key_prefix)

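        # --no_overwrite: skip the upload when the key showed up in the
        # earlier listing and still exists (confirmed via get_key).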
        if no_overwrite and key_name in existing_keys_to_check_against:
            if b.get_key(key_name):
                if not quiet:
                    print('Skipping %s as it exists in s3' % fullpath)
                continue

        if not quiet:
            print('Copying %s to %s/%s' % (fullpath, bucket_name, key_name))

        if not no_op:
            # 0-byte files don't work and also don't need multipart upload
            if os.stat(fullpath).st_size != 0 and multipart_capable and \
                    multipart_requested:
                multipart_upload(bucket_name, aws_access_key_id,
                                 aws_secret_access_key, fullpath, key_name,
                                 reduced, debug, cb, num_cb,
                                 grant or 'private', headers,
                                 region=region or DEFAULT_REGION)
            else:
                singlepart_upload(b, key_name, fullpath, cb=cb, num_cb=num_cb,
                                  policy=grant, reduced_redundancy=reduced,
                                  headers=headers)


if __name__ == "__main__":
    main()
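
# Example invocation (hypothetical bucket and paths; credentials may also
# come from AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY):
#
#   s3put -a <access_key> -s <secret_key> -b my-bucket \
#       -p /home/foo/ --multipart /home/foo/bar
#
# This would recursively upload /home/foo/bar, storing each file under a
# key like /bar/<subpath> in my-bucket, using parallel multipart uploads
# for non-empty files when filechunkio is installed.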