annotate data_manager/data_manager_plant_tribes_scaffolds_download.py @ 0:4c96b684f0fd draft

Uploaded
author iuc
date Fri, 13 Jan 2017 10:34:15 -0500
parents
children 80b0bd65cbfb
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
1 #!/usr/bin/env python
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
2 #
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
3 # Data manager for downloading Plant Tribes scaffolds data.
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
4 import argparse
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
5 import json
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
6 import os
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
7 import shutil
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
8 import sys
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
9 import tarfile
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
10 import urllib2
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
11 import zipfile
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
12
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
13
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
# Data tables that receive an entry when url_download() is called without an
# explicit data_table_names argument.
DEFAULT_DATA_TABLE_NAMES = ["plant_tribes_scaffolds"]
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
15
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
16
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
def add_data_table_entry(data_manager_dict, data_table_name, data_table_entry):
    """Append ``data_table_entry`` to the named table inside ``data_manager_dict``.

    The ``'data_tables'`` mapping and the per-table list are created on demand,
    so the function can be called with an empty dictionary.

    :param data_manager_dict: dictionary to update; it is modified in place.
    :param data_table_name: key of the table under ``'data_tables'``.
    :param data_table_entry: entry appended to that table's list.
    :returns: the same ``data_manager_dict`` object that was passed in.
    """
    data_tables = data_manager_dict.setdefault('data_tables', {})
    data_tables.setdefault(data_table_name, []).append(data_table_entry)
    return data_manager_dict
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
22
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
23
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
def make_directory(dir):
    """Create ``dir`` (including missing parents) unless the path already exists.

    Creation is attempted directly instead of testing for existence first, so
    a concurrent process creating the same path cannot make this call fail.

    :param dir: path of the directory to create.
    :raises OSError: if creation fails and the path still does not exist.
    """
    try:
        os.makedirs(dir)
    except OSError:
        # An already existing path is the expected, harmless case; anything
        # else (permissions, invalid path, ...) is a real failure.
        if not os.path.exists(dir):
            raise
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
27
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
28
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
def remove_directory(dir):
    """Recursively delete ``dir``; a path that does not exist is ignored.

    Removal is attempted directly instead of testing for existence first, so
    the directory disappearing concurrently does not make this call fail.

    :param dir: path of the directory tree to remove.
    :raises OSError: if removal fails and the path still exists.
    """
    try:
        shutil.rmtree(dir)
    except OSError:
        # A missing path means there is nothing left to remove; any other
        # failure leaves the path in place and must be reported.
        if os.path.exists(dir):
            raise
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
32
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
33
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
def _download_to_file(url, file_path, chunk_size=2**10):
    """Stream the content at ``url`` into ``file_path``; report and re-raise failures."""
    src = None
    dst = None
    try:
        src = urllib2.urlopen(urllib2.Request(url))
        dst = open(file_path, 'wb')
        while True:
            chunk = src.read(chunk_size)
            if not chunk:
                break
            dst.write(chunk)
    except Exception as e:
        # Report the failure, then propagate it: continuing with a missing or
        # truncated download only produces a misleading error further on.
        sys.stderr.write("%s\n" % str(e))
        raise
    finally:
        if src:
            src.close()
        if dst:
            dst.close()


def _open_archive(file_path):
    """Return ``file_path`` opened as a tar or zip archive, or None if it is neither."""
    if tarfile.is_tarfile(file_path):
        return tarfile.open(file_path, 'r:*')
    if zipfile.is_zipfile(file_path):
        return zipfile.ZipFile(file_path, 'r')
    return None


def url_download(target_directory, url, description, data_table_names=DEFAULT_DATA_TABLE_NAMES):
    """Download an archive, unpack it into ``target_directory`` and describe the result.

    The archive is downloaded into a ``scaffolds`` directory under the current
    working directory, extracted there, and the extracted items are then moved
    into ``target_directory``.  One data table entry is produced for every item
    found in ``target_directory`` afterwards.

    :param target_directory: existing directory receiving the extracted items.
    :param url: location of the tar or zip archive to download.
    :param description: text stored in the ``description`` field of each entry.
    :param data_table_names: names of the data tables that receive each entry.
    :returns: a dictionary of the form ``{'data_tables': {name: [entry, ...]}}``
        (empty if ``target_directory`` ends up empty), or ``None`` when the
        downloaded file is neither a tar nor a zip archive.  In the latter
        case the downloaded file and the work directory are left in place.
    :raises Exception: any error raised while downloading is written to
        standard error and re-raised.
    """
    work_directory = os.path.abspath(os.path.join(os.getcwd(), 'scaffolds'))
    make_directory(work_directory)
    file_path = os.path.join(work_directory, os.path.basename(url))
    _download_to_file(url, file_path)
    archive = _open_archive(file_path)
    if archive is None:
        return
    try:
        # NOTE(review): member paths are not validated before extraction, so an
        # archive containing absolute or '..' paths can write outside
        # work_directory.  Only use this with trusted URLs.
        archive.extractall(work_directory)
    finally:
        archive.close()
    os.remove(file_path)
    # Move the scaffolds data files into defined output directory.
    for filename in os.listdir(work_directory):
        shutil.move(os.path.join(work_directory, filename), target_directory)
    remove_directory(work_directory)
    data_manager_dict = {}
    # Populate the data table, there should be a single entry in target_directory.
    for filename in os.listdir(target_directory):
        full_path = os.path.abspath(os.path.join(target_directory, filename))
        entry_name = "%s" % os.path.basename(filename)
        data_table_entry = dict(value=entry_name, name=entry_name, path=full_path, description=description)
        for data_table_name in data_table_names:
            data_manager_dict = add_data_table_entry(data_manager_dict, data_table_name, data_table_entry)
    return data_manager_dict
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
78
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
79
4c96b684f0fd Uploaded
iuc
parents:
diff changeset
# Script body: parse the command line, download the scaffolds archive into the
# directory named by the incoming JSON parameters, then overwrite the same
# file with the resulting data table entries.
parser = argparse.ArgumentParser()
parser.add_argument('--description', dest='description', default=None, help='Description')
parser.add_argument('--name', dest='name', help='Data table entry unique ID')
parser.add_argument('--out_file', dest='out_file', help='JSON output file')
parser.add_argument('--web_url', dest='web_url', help='Web URL')

args = parser.parse_args()

# Some magic happens with tools of type "manage_data" in that the output
# file contains some JSON data that allows us to define the target directory.
# The file is read and closed here because it is reopened for writing below.
with open(args.out_file) as fh:
    params = json.loads(fh.read())
target_directory = params['output_data'][0]['extra_files_path']
make_directory(target_directory)

if args.description is None:
    description = ''
else:
    description = args.description.strip()

# Get the scaffolds data.
data_manager_dict = url_download(target_directory, args.web_url, description)
# Write the JSON output dataset.
with open(args.out_file, 'wb') as fh:
    fh.write(json.dumps(data_manager_dict))