Mercurial > repos > rnateam > data_manager_sortmerna_database_downloader
comparison data_manager/data_manager_sortmerna_download.py @ 0:765f4ec851f0 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_sortmerna_database_downloader commit 65d322f9ab2f24d65b307f3553589149a1d678d5
author | rnateam |
---|---|
date | Wed, 31 May 2017 14:53:00 -0400 |
parents | |
children | 30bb49887172 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:765f4ec851f0 |
---|---|
1 #!/usr/bin/env python | |
2 # Data manager for reference data for the SortMeRNA Galaxy tools | |
3 | |
4 import argparse | |
5 import json | |
6 import os | |
7 import tarfile | |
8 import requests | |
9 import subprocess | |
10 | |
11 | |
12 # Utility functions for interacting with Galaxy JSON | |
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool.

    Arguments:
        jsonfile: path to the JSON file written by Galaxy for this job

    Returns a tuple (param_dict, extra_files_path).

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.
    """
    # Use a context manager so the file handle is closed even if
    # the JSON is malformed (the original leaked the handle).
    with open(jsonfile) as fh:
        params = json.load(fh)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
31 | |
32 | |
33 # Utility functions for creating data table dictionaries | |
34 # | |
35 # Example usage: | |
36 # >>> d = create_data_tables_dict() | |
37 # >>> add_data_table(d,'my_data') | |
38 # >>> add_data_table_entry(dict(dbkey='hg19',value='human')) | |
39 # >>> add_data_table_entry(dict(dbkey='mm9',value='mouse')) | |
40 # >>> print str(json.dumps(d)) | |
def create_data_tables_dict():
    """Return a fresh dictionary for storing data table information.

    The returned dictionary works with 'add_data_table' and
    'add_data_table_entry' to accumulate data table records, and can
    be serialized to JSON to be sent back to the data manager.
    """
    return {'data_tables': {}}
53 | |
54 | |
def add_data_table(d, table):
    """Register an empty data table named 'table' in 'd'.

    Creates (or resets) the placeholder list that
    'add_data_table_entry' appends to.
    """
    tables = d['data_tables']
    tables[table] = []
62 | |
63 | |
def add_data_table_entry(d, table, entry):
    """Append 'entry' to the data table 'table' in 'd'.

    'entry' should be a dictionary whose keys are the names of
    columns in the data table.

    Raises an Exception if the named data table doesn't exist.
    """
    try:
        entries = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    entries.append(entry)
79 | |
80 | |
def download_archive(version):
    """Download the sortmerna source tarball for 'version' from GitHub.

    Streams the archive to '<version>.tar.gz' in the working
    directory and returns that local file path. Raises
    requests.HTTPError on a bad HTTP status.
    """
    tarball = "%s.tar.gz" % (version)
    url = "https://github.com/biocore/sortmerna/archive/%s.tar.gz" % (version)
    response = requests.get(url, stream=True)
    response.raise_for_status()
    with open(tarball, "wb") as out:
        for piece in response.iter_content(chunk_size=128):
            out.write(piece)
    return tarball
93 | |
94 | |
def find_archive_content_path(archive_content_path):
    """Return the path to the actual content of an unpacked archive.

    GitHub tarballs unpack into a single top-level directory; if
    'archive_content_path' contains exactly one visible entry (ignoring
    names starting with '.' or '_', e.g. macOS '__MACOSX' folders),
    descend into it. Otherwise return 'archive_content_path' unchanged.

    Arguments:
        archive_content_path: directory the archive was extracted into
    """
    # Comprehension + startswith-tuple replaces the manual append loop.
    visible = [entry for entry in os.listdir(archive_content_path)
               if not entry.startswith((".", "_"))]
    if len(visible) == 1:
        archive_content_path = os.path.join(archive_content_path, visible[0])
    return archive_content_path
108 | |
109 | |
def extract_archive(filepath):
    """Unpack the tar archive at 'filepath' into a local 'tmp' directory.

    Returns the path to the unpacked content, descending into the
    single top-level directory that GitHub tarballs create (via
    'find_archive_content_path').

    Arguments:
        filepath: path to a tar archive (any compression tarfile handles)
    """
    archive_content_path = "tmp"
    # Context manager guarantees the tar file is closed even if
    # extraction raises (the original leaked it on error).
    with tarfile.open(filepath) as tar:
        # NOTE(review): extractall on an untrusted archive can write
        # outside the target dir on older Pythons; source is a pinned
        # GitHub release here.
        tar.extractall(path=archive_content_path)
    return find_archive_content_path(archive_content_path)
119 | |
120 | |
def move_index_files(archive_content_path, target_dir, data_tables, version):
    """Move the rRNA FASTA files into 'target_dir', index each one with
    indexdb_rna and register it in the 'rRNA_databases' data table.

    Arguments:
        archive_content_path: unpacked sortmerna source tree (contains
            an 'rRNA_databases' subdirectory of *.fasta files)
        target_dir: directory the FASTA files are moved into
        data_tables: dictionary created with 'create_data_tables_dict'
        version: database version recorded as the table entry's value
    """
    file_dir = os.path.join(archive_content_path, "rRNA_databases")
    for filename in os.listdir(file_dir):
        if not filename.endswith("fasta"):
            continue
        input_filepath = os.path.join(file_dir, filename)
        output_filepath = os.path.join(target_dir, filename)
        # Move file
        os.rename(input_filepath, output_filepath)
        # Index the file with indexdb_rna. An argument list with
        # shell=False avoids shell quoting problems if the Galaxy
        # paths contain spaces or metacharacters.
        subprocess.call([
            "indexdb_rna",
            "--ref",
            "%s,%s" % (output_filepath,
                       os.path.splitext(output_filepath)[0])])
        # Add entry in the data table
        db_name = os.path.splitext(filename)[0]
        add_data_table_entry(
            data_tables,
            "rRNA_databases",
            dict(
                dbkey=db_name,
                value=version,
                name=db_name,
                path=output_filepath))
147 | |
148 | |
def download_db(data_tables, version, target_dir):
    """Download SortMeRNA database

    Downloads the sortmerna source archive for 'version', extracts
    it, moves the bundled rRNA FASTA files into 'target_dir', indexes
    them and records them in the 'rRNA_databases' data table.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
        data_tables: a dictionary containing the data table info
        version: version of the database (GitHub tag of biocore/sortmerna)
        target_dir: directory to put copy or link to the data file
    """
    print("Download archive")
    filepath = download_archive(version)

    print("Extract archive %s" % filepath)
    archive_content_path = extract_archive(filepath)

    print("Moving fasta file from %s and index them" % archive_content_path)
    move_index_files(
        archive_content_path,
        target_dir,
        data_tables,
        version)
177 | |
178 | |
if __name__ == "__main__":
    print("Starting...")

    # Read command line
    parser = argparse.ArgumentParser(
        # Fixed copy-paste from the QIIME data manager: this tool
        # downloads the SortMeRNA reference database.
        description='Download SortMeRNA reference database')
    parser.add_argument('--version', help="Database version")
    parser.add_argument('--jsonfile', help="Output JSON file")
    args = parser.parse_args()

    jsonfile = args.jsonfile

    # Read the input JSON written by Galaxy; target_dir is the job's
    # extra_files_path, which does not exist yet.
    params, target_dir = read_input_json(jsonfile)

    # Make the target directory (makedirs creates both the
    # extra_files_path and its rRNA_databases subdirectory in one call)
    target_dir = os.path.join(target_dir, "rRNA_databases")
    print("Making %s" % target_dir)
    os.makedirs(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "rRNA_databases")

    # Fetch data from specified data sources
    download_db(
        data_tables,
        args.version,
        target_dir)

    # Write output JSON back over the input file for Galaxy to consume
    print("Outputting JSON")
    print(str(json.dumps(data_tables)))
    with open(jsonfile, 'w') as out:
        json.dump(data_tables, out)
    print("Done.")