6
|
1 #!/usr/bin/env python
|
|
2 # encoding: utf-8
|
|
3 '''
|
|
4 Module to query a set of identifications against the METabolomics EXPlorer database.
|
|
5
|
|
6 It will take the input file and for each record it will query the
|
|
7 molecular mass in the selected MetExp DB. If one or more compounds are found in the
|
|
8 MetExp DB then extra information regarding these compounds is added to the output file.
|
|
9
|
|
10 The output file is thus the input file enriched with information about
|
|
11 related items found in the selected MetExp DB.
|
|
12 '''
|
|
import csv
import fileinput
import sys
import time
import urllib
import urllib2

from collections import OrderedDict
|
|
19
|
|
20 __author__ = "Pieter Lukasse"
|
|
21 __contact__ = "pieter.lukasse@wur.nl"
|
|
22 __copyright__ = "Copyright, 2014, Plant Research International, WUR"
|
|
23 __license__ = "Apache v2"
|
|
24
|
|
def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a delimited (default: tab-separated) file,
    returning a dictionary with named columns (see _process_data).

    @param in_xsv: input filename to be parsed
    @param delim: column delimiter used in the file
    '''
    # 'rU' = universal newline mode (Python 2 idiom for text files).
    input_file = open(in_xsv, 'rU')
    try:
        data = list(csv.reader(input_file, delimiter=delim))
    finally:
        # Close the handle deterministically; the original left it to the
        # garbage collector.
        input_file.close()
    return _process_data(data)
|
|
32
|
|
33 def _process_data(data):
|
|
34
|
|
35 header = data.pop(0)
|
|
36 # Create dictionary with column name as key
|
|
37 output = OrderedDict()
|
|
38 for index in xrange(len(header)):
|
|
39 output[header[index]] = [row[index] for row in data]
|
|
40 return output
|
|
41
|
|
42
|
|
def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
    '''
    Iterate over the records in input_data and enrich each one with the
    related information (if any) found in the MetExp Database.

    The lookup is tried in order of decreasing specificity: CAS number,
    then molecular formula, then molecular mass.

    @param input_data: OrderedDict of column name -> list of values
    @param casid_col: name of the column holding the CAS id ("undef" = none)
    @param formula_col: name of the column holding the molecular formula
    @param molecular_mass_col: name of the column holding the molecular mass
    @param metexp_dblink: base URL of the MetExp service
    @param separation_method: separation method to restrict the query to
    @return: list of enriched records, each a flat list of values

    # TODO : could optimize this with multi-threading, see also nice example
    # at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    column_names = input_data.keys()
    # URL-encode once; un-encoded values would corrupt the query URL if they
    # ever contain characters such as '&', '+' or spaces:
    method_suffix = "&method=" + urllib.quote(separation_method)

    for i in xrange(len(input_data[column_names[0]])):
        # Get the record in same dictionary format as input_data, but with a
        # single value per column instead of the list of all records' values:
        input_data_record = OrderedDict(
            zip(column_names, [input_data[key][i] for key in column_names]))

        # read the CAS id, molecular mass and formula:
        cas_id = input_data_record[casid_col]
        formula = input_data_record[formula_col]
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in MetExp:
        data_found = None
        data_type_found = ""
        if cas_id != "undef":
            # 1- search for other experiments where this CAS id has been found:
            query_link = metexp_dblink + "/find_entries/query?cas_nr=" + urllib.quote(cas_id) + method_suffix
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "CAS"
        if data_found is None:
            # 2- search for other experiments where this FORMULA has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_formula=" + urllib.quote(formula) + method_suffix
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "FORMULA"
        if data_found is None:
            # 3- search for other experiments where this MM has been found:
            query_link = metexp_dblink + "/find_entries/query?molecule_mass=" + urllib.quote(molecular_mass) + method_suffix
            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If still nothing found, just add empty columns:
            extra_cols = ['', '', '', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        merged.append(_merge_data(input_data_record, extra_cols))

    # return the merged/enriched records:
    return merged
|
|
95
|
|
96
|
|
def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    Summarize the MetExp hits into a list of result columns:
    - the type of match that produced the hits (e.g. CAS / FORMULA / MM)
    - for each experiment detail ('organism', 'tissue', 'experiment_name',
      'user_name', 'column_type', 'CAS'): its unique values, pipe-separated
    - an Excel hyperlink that executes the same query again
    '''
    result = [data_type_found]

    # For each detail column present in the hits, collapse the values to the
    # unique ones via set(); columns absent from the result yield an empty cell.
    for column in ('organism', 'tissue', 'experiment_name',
                   'user_name', 'column_type', 'CAS'):
        if column in data_found:
            unique_values = set(data_found[column])
        else:
            unique_values = []
        result.append(_to_xsv(unique_values))

    # To let Excel interpret as link, use e.g.
    # =HYPERLINK("http://stackoverflow.com", "friendly name"):
    result.append("=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")")
    return result
|
|
138
|
|
139
|
|
140 def _to_xsv(data_set):
|
|
141 result = ""
|
|
142 for item in data_set:
|
|
143 result = result + str(item) + "|"
|
|
144 return result
|
|
145
|
|
146
|
|
def _fire_query_and_return_dict(url):
    '''
    Fire the query as a web-service call and return the results as a
    dictionary of column name -> list of values (see _process_data),
    or None when the service reports no hits.

    @param url: full query URL; the response is expected to be TSV text
    @raise Exception: on HTTP or network errors, with a descriptive message
    '''
    try:
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            # close the connection deterministically instead of relying on GC:
            response.close()

        data_rows = data.split("\n")

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row...so no hits:
            return None

        # transform to a list of rows, skipping blank lines:
        result = []
        for data_row in data_rows:
            if data_row.strip() != '':
                result.append(_str_to_list(data_row, delimiter='\t'))

        # return result processed into a dict:
        return _process_data(result)

    # NOTE: HTTPError must be caught before URLError (it is a subclass);
    # 'as' syntax replaces the legacy 'except X, e' form (valid from 2.6).
    except urllib2.HTTPError as e:
        raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError as e:
        # NOTE(review): e.reason.args[1] assumes a socket.error-style reason
        # tuple — may IndexError for other URLError causes; confirm.
        raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
|
|
177
|
|
178 def _str_to_list(data_row, delimiter='\t'):
|
|
179 result = []
|
|
180 for column in data_row.split(delimiter):
|
|
181 result.append(column)
|
|
182 return result
|
|
183
|
|
184
|
|
185 # alternative: ?
|
|
186 # s = requests.Session()
|
|
187 # s.verify = False
|
|
188 # #s.auth = (token01, token02)
|
|
189 # resp = s.get(url, params={'name': 'anonymous'}, stream=True)
|
|
190 # content = resp.content
|
|
191 # # transform to dictionary:
|
|
192
|
|
193
|
|
194
|
|
195
|
|
196 def _merge_data(input_data_record, extra_cols):
|
|
197 '''
|
|
198 Adds the extra information to the existing data record and returns
|
|
199 the combined new record.
|
|
200 '''
|
|
201 record = []
|
|
202 for column in input_data_record:
|
|
203 record.append(input_data_record[column])
|
|
204
|
|
205
|
|
206 # add extra columns
|
|
207 for column in extra_cols:
|
|
208 record.append(column)
|
|
209
|
|
210 return record
|
|
211
|
|
212
|
|
def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file.

    @param data_rows: list of enriched records (each a flat list of values)
    @param headers: list of column names, written as the first row
    @param out_csv: output csv file name
    '''
    # Open output file for writing. 'wb' is the Python 2 csv convention
    # (avoids extra blank lines on Windows).
    outfile_single_handle = open(out_csv, 'wb')
    try:
        output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

        # Write headers:
        output_single_handle.writerow(headers)

        # Write one line for each row:
        for data_row in data_rows:
            output_single_handle.writerow(data_row)
    finally:
        # Close (and thereby flush) the file deterministically; the original
        # relied on interpreter shutdown to do this.
        outfile_single_handle.close()
|
|
230
|
|
231 def _get_metexp_URL(metexp_dblink_file):
|
|
232 '''
|
|
233 Read out and return the URL stored in the given file.
|
|
234 '''
|
|
235 file_input = fileinput.input(metexp_dblink_file)
|
|
236 try:
|
|
237 for line in file_input:
|
|
238 if line[0] != '#':
|
|
239 # just return the first line that is not a comment line:
|
|
240 return line
|
|
241 finally:
|
|
242 file_input.close()
|
|
243
|
|
244
|
|
245 def main():
|
|
246 '''
|
|
247 MetExp Query main function
|
|
248
|
|
249 The input file can be any tabular file, as long as it contains a column for the molecular mass
|
|
250 and one for the formula of the respective identification. These two columns are then
|
|
251 used to query against MetExp Database.
|
|
252 '''
|
|
253 seconds_start = int(round(time.time()))
|
|
254
|
|
255 input_file = sys.argv[1]
|
|
256 casid_col = sys.argv[2]
|
|
257 formula_col = sys.argv[3]
|
|
258 molecular_mass_col = sys.argv[4]
|
|
259 metexp_dblink_file = sys.argv[5]
|
|
260 separation_method = sys.argv[6]
|
|
261 output_result = sys.argv[7]
|
|
262
|
|
263 # Parse metexp_dblink_file to find the URL to the MetExp service:
|
|
264 metexp_dblink = _get_metexp_URL(metexp_dblink_file)
|
|
265
|
|
266 # Parse tabular input file into dictionary/array:
|
|
267 input_data = _process_file(input_file)
|
|
268
|
|
269 # Query data against MetExp DB :
|
|
270 enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
|
|
271 headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
|
|
272 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
|
|
273
|
|
274 _save_data(enriched_data, headers, output_result)
|
|
275
|
|
276 seconds_end = int(round(time.time()))
|
|
277 print "Took " + str(seconds_end - seconds_start) + " seconds"
|
|
278
|
|
279
|
|
280
|
|
if __name__ == '__main__':
    # Script entry point when invoked from the command line.
    main()
|