comparison data_manager/data_manager_sortmerna_download.py @ 0:765f4ec851f0 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_sortmerna_database_downloader commit 65d322f9ab2f24d65b307f3553589149a1d678d5
author rnateam
date Wed, 31 May 2017 14:53:00 -0400
parents
children 30bb49887172
comparison
equal deleted inserted replaced
-1:000000000000 0:765f4ec851f0
1 #!/usr/bin/env python
2 # Data manager for reference data for the SortMeRNA Galaxy tools
3
import argparse
import json
import os
import shutil
import subprocess
import tarfile

import requests
10
11
12 # Utility functions for interacting with Galaxy JSON
def read_input_json(jsonfile):
    """Read the JSON supplied from the data manager tool

    Returns a tuple (param_dict,extra_files_path)

    'param_dict' is an arbitrary dictionary of parameters
    input into the tool; 'extra_files_path' is the path
    to a directory where output files must be put for the
    receiving data manager to pick them up.

    NB the directory pointed to by 'extra_files_path'
    doesn't exist initially, it is the job of the script
    to create it if necessary.

    Arguments:
      jsonfile: path to the JSON file to read

    Raises KeyError/IndexError if the expected 'param_dict' or
    'output_data' entries are missing from the JSON.

    """
    # Use a context manager so the file handle is closed even if
    # parsing fails
    with open(jsonfile) as handle:
        params = json.load(handle)
    return (params['param_dict'],
            params['output_data'][0]['extra_files_path'])
31
32
# Utility functions for creating data table dictionaries
#
# Example usage:
# >>> d = create_data_tables_dict()
# >>> add_data_table(d,'my_data')
# >>> add_data_table_entry(d,'my_data',dict(dbkey='hg19',value='human'))
# >>> add_data_table_entry(d,'my_data',dict(dbkey='mm9',value='mouse'))
# >>> print(json.dumps(d))
def create_data_tables_dict():
    """Build the empty container used to collect data table entries

    The returned dictionary has a single 'data_tables' key mapping
    to an empty dictionary. It is meant to be filled in with
    'add_data_table' and 'add_data_table_entry' and then dumped
    to JSON for the data manager.

    """
    return {'data_tables': {}}
53
54
def add_data_table(d, table):
    """Register an empty data table in the data tables dictionary

    Stores a new empty list under d['data_tables'][table],
    replacing whatever was previously stored under that name.

    """
    tables = d['data_tables']
    tables[table] = []
62
63
def add_data_table_entry(d, table, entry):
    """Append one row to an existing data table

    'entry' is added to the end of the list stored under
    d['data_tables'][table]; it should be a dictionary whose
    keys are the column names of the data table.

    Raises an Exception if the lookup of the named table fails
    with a KeyError (i.e. the table was never added).

    """
    try:
        rows = d['data_tables'][table]
    except KeyError:
        raise Exception("add_data_table_entry: no table '%s'" % table)
    rows.append(entry)
79
80
def download_archive(version):
    """Download the SortMeRNA archive for the given version

    Fetches
    'https://github.com/biocore/sortmerna/archive/<version>.tar.gz'
    and streams it to '<version>.tar.gz' in the current working
    directory.

    Arguments:
      version: value substituted into the archive URL and into
        the name of the local file

    Returns the path of the downloaded file.

    Raises a 'requests' exception if the server answers with an
    error status or does not respond within the timeout.

    """
    filepath = "%s.tar.gz" % (version)
    url = "https://github.com/biocore/sortmerna/archive/%s.tar.gz" % (version)
    # Without a timeout a stalled connection would block the job forever
    r = requests.get(url, stream=True, timeout=60)
    try:
        r.raise_for_status()
        with open(filepath, "wb") as fd:
            # 64 KiB chunks: large enough to avoid thousands of tiny
            # writes, small enough to keep memory use negligible
            for chunk in r.iter_content(chunk_size=64 * 1024):
                fd.write(chunk)
    finally:
        # A streamed response keeps its connection open until closed
        r.close()
    return filepath
93
94
def find_archive_content_path(archive_content_path):
    """Locate the directory that holds the extracted archive content

    Lists 'archive_content_path', ignoring entries whose name
    starts with '.' or '_'. If exactly one entry remains, the
    path to that entry is returned; otherwise
    'archive_content_path' is returned unchanged.

    """
    visible = [
        name for name in os.listdir(archive_content_path)
        if not name.startswith((".", "_"))]
    if len(visible) != 1:
        return archive_content_path
    return os.path.join(archive_content_path, visible[0])
108
109
def extract_archive(filepath):
    """Extract a tar archive into the local 'tmp' directory

    Arguments:
      filepath: path to the tar archive to extract

    Returns the path to the extracted content, as determined by
    'find_archive_content_path' (the single top-level entry of
    the archive if there is exactly one, 'tmp' otherwise).

    Raises an Exception if a member of the archive would be
    written outside of the extraction directory.

    """
    archive_content_path = "tmp"
    extraction_root = os.path.realpath(archive_content_path)
    # The context manager closes the archive even if extraction fails
    with tarfile.open(filepath) as tar:
        # The archive comes from the network: refuse member names
        # (absolute paths, '..' components) that escape the target
        for member in tar.getmembers():
            member_path = os.path.realpath(
                os.path.join(extraction_root, member.name))
            inside = (
                member_path == extraction_root or
                member_path.startswith(extraction_root + os.sep))
            if not inside:
                raise Exception(
                    "extract_archive: unsafe path '%s' in archive"
                    % member.name)
        tar.extractall(path=archive_content_path)
    return find_archive_content_path(archive_content_path)
119
120
def move_index_files(archive_content_path, target_dir, data_tables, version):
    """Move the rRNA fasta files to the target directory and index them

    Every file of '<archive_content_path>/rRNA_databases' whose
    name ends with 'fasta' is moved into 'target_dir', indexed
    with 'indexdb_rna' and registered in the 'rRNA_databases'
    data table.

    Arguments:
      archive_content_path: directory holding the extracted archive
      target_dir: directory receiving the fasta files and indexes
      data_tables: data tables dictionary in which the
        'rRNA_databases' table has already been added
      version: stored as the 'value' column of each entry

    Raises subprocess.CalledProcessError if 'indexdb_rna' exits
    with a non-zero status, so that a database which could not
    be indexed is never registered.

    """
    file_dir = os.path.join(archive_content_path, "rRNA_databases")
    for filename in os.listdir(file_dir):
        if not filename.endswith("fasta"):
            continue
        input_filepath = os.path.join(file_dir, filename)
        output_filepath = os.path.join(target_dir, filename)
        # shutil.move falls back to copy+delete when source and
        # target are on different filesystems (os.rename would fail)
        shutil.move(input_filepath, output_filepath)
        # Index the file with indexdb_rna; the argument list is
        # passed without a shell so paths need no quoting
        index_prefix = os.path.splitext(output_filepath)[0]
        subprocess.check_call([
            "indexdb_rna",
            "--ref",
            "%s,%s" % (output_filepath, index_prefix)])
        # Add entry in the data table
        db_name = os.path.splitext(filename)[0]
        add_data_table_entry(
            data_tables,
            "rRNA_databases",
            dict(
                dbkey=db_name,
                value=version,
                name=db_name,
                path=output_filepath))
147
148
def download_db(data_tables, version, target_dir):
    """Download, extract and index the SortMeRNA databases

    Runs the three steps in order: 'download_archive',
    'extract_archive' and 'move_index_files'. The last step adds
    the entries to the 'rRNA_databases' table of 'data_tables'.

    The 'data_tables' dictionary should have been created using
    the 'create_data_tables_dict' and 'add_data_table' functions.

    Arguments:
      data_tables: a dictionary containing the data table info
      version: version of the database
      target_dir: directory receiving the fasta files and indexes
    """
    print("Download archive")
    archive_file = download_archive(version)

    print("Extract archive %s" % archive_file)
    extracted_root = extract_archive(archive_file)

    print("Moving fasta file from %s and index them" % extracted_root)
    move_index_files(extracted_root, target_dir, data_tables, version)
177
178
# Script entry point: read the Galaxy JSON, download and index the
# databases, then overwrite the JSON file with the data table entries
if __name__ == "__main__":
    print("Starting...")

    # Read command line; both options are required because the
    # script cannot do anything useful without them
    parser = argparse.ArgumentParser(
        description='Download SortMeRNA reference databases')
    parser.add_argument(
        '--version', required=True, help="Database version")
    parser.add_argument(
        '--jsonfile', required=True, help="Output JSON file")
    args = parser.parse_args()

    jsonfile = args.jsonfile

    # Read the input JSON; only the output directory is used here
    _, target_dir = read_input_json(jsonfile)

    # Make the target directory (makedirs also creates the parent
    # 'extra_files_path' directory, which doesn't exist initially)
    print("Making %s" % target_dir)
    target_dir = os.path.join(target_dir, "rRNA_databases")
    os.makedirs(target_dir)

    # Set up data tables dictionary
    data_tables = create_data_tables_dict()
    add_data_table(data_tables, "rRNA_databases")

    # Fetch data from specified data sources
    download_db(
        data_tables,
        args.version,
        target_dir)

    # Write output JSON (the input file is overwritten in place)
    print("Outputting JSON")
    print(str(json.dumps(data_tables)))
    with open(jsonfile, 'w') as out:
        json.dump(data_tables, out)
    print("Done.")