#!/usr/bin/env python
# encoding: utf-8
'''
Module to query a set of accurate mass values detected by high-resolution mass spectrometers
against various repositories/services such as the METabolomics EXPlorer database or the
MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).

It will take the input file and, for each record, query the
molecular mass in the selected repository/service. If one or more compounds are found,
extra information regarding these compounds is added to the output file.

The output file is thus the input file enriched with information about
related items found in the selected repository/service.

The service should implement the following interface:

http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)

The output should be tab-separated and should contain the following columns (in this order):

db-name molecular-formula dbe formula-weight id description
'''
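
# For illustration, a service response to the /mass query above could look like
# the following (tab-separated; the values are made-up placeholders, only the
# column layout follows the interface described in the docstring):
#
#   # db-name    molecular-formula    dbe    formula-weight    id          description
#   ExampleDB    C24H36O5             7.0    404.539           ID000001    example compound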

import csv
import sys
import fileinput
import urllib2
import time
from collections import OrderedDict

__author__ = "Pieter Lukasse"
__contact__ = "pieter.lukasse@wur.nl"
__copyright__ = "Copyright, 2014, Plant Research International, WUR"
__license__ = "Apache v2"

def _process_file(in_xsv, delim='\t'):
    '''
    Generic method to parse a tab-separated file, returning a dictionary with named columns.
    @param in_xsv: input filename to be parsed
    '''
    data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
    return _process_data(data)

def _process_data(data):
    '''
    Turns a list of rows (first row = header) into an OrderedDict that maps
    each column name to the list of values found in that column.
    '''
    header = data.pop(0)
    # Create dictionary with column name as key:
    output = OrderedDict()
    for index in xrange(len(header)):
        output[header[index]] = [row[index] for row in data]
    return output
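
# For illustration, _process_data turns a parsed table into a column-oriented
# dictionary (made-up values):
#
#   _process_data([['mass', 'name'], ['500.1', 'a'], ['600.2', 'b']])
#   ==> OrderedDict([('mass', ['500.1', '600.2']), ('name', ['a', 'b'])])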

def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
    '''
    This method will iterate over the records in the input_data and
    will enrich them with the related information found (if any) in the
    chosen repository/service.

    # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
    '''
    merged = []

    for i in xrange(len(input_data[input_data.keys()[0]])):
        # Get the record in the same dictionary format as input_data, but containing
        # a value at each column instead of a list of all values of all records:
        input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))

        # read the molecular mass:
        molecular_mass = input_data_record[molecular_mass_col]

        # search for related records in repository/service:
        data_found = None
        if molecular_mass != "":
            molecular_mass = float(molecular_mass)

            # 1- search for data around this MM:
            query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"

            data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
            data_type_found = "MM"

        if data_found is None:
            # If nothing was found, just add empty columns:
            extra_cols = ['', '', '', '', '', '']
        else:
            # Add info found:
            extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)

        # Take all data and merge it into a "flat"/simple array of values:
        field_values_list = _merge_data(input_data_record, extra_cols)

        merged.append(field_values_list)

    # return the merged/enriched records:
    return merged
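
# A rough sketch of the multi-threading mentioned in the TODO above, firing the
# per-record queries in parallel with a thread pool (multiprocessing.dummy is
# part of the standard library; _query_one_mass is a hypothetical helper that
# would wrap the query_link construction and _fire_query_and_return_dict call
# from the loop above):
#
#   from multiprocessing.dummy import Pool as ThreadPool
#
#   def _query_one_mass(molecular_mass):
#       # hypothetical helper: build query_link for this mass and fire the query
#       ...
#
#   pool = ThreadPool(4)
#   results = pool.map(_query_one_mass, input_data[molecular_mass_col])  # map() preserves record order
#   pool.close()
#   pool.join()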

def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
    '''
    This method will go over the data found and will return a
    list with the following items:
    - details of hits found:
      db-name molecular-formula dbe formula-weight id description
    - link that executes the same query
    '''
    # defaults (empty); the set() calls below make the value lists unique:
    db_name_set = []
    molecular_formula_set = []
    id_set = []
    description_set = []

    if 'db-name' in data_found:
        db_name_set = set(data_found['db-name'])
    elif '# db-name' in data_found:
        db_name_set = set(data_found['# db-name'])
    if 'molecular-formula' in data_found:
        molecular_formula_set = set(data_found['molecular-formula'])
    if 'id' in data_found:
        id_set = set(data_found['id'])
    if 'description' in data_found:
        description_set = set(data_found['description'])

    result = [data_type_found,
              _to_xsv(db_name_set),
              _to_xsv(molecular_formula_set),
              _to_xsv(id_set),
              _to_xsv(description_set),
              # To let Excel interpret this as a link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
              "=HYPERLINK(\"" + query_link + "\", \"Link to entries found in DB \")"]
    return result

def _to_xsv(data_set):
    '''
    Joins the items of data_set into a single pipe-terminated string,
    e.g. ['a', 'b'] becomes "a|b|".
    '''
    result = ""
    for item in data_set:
        result = result + str(item) + "|"
    return result

def _fire_query_and_return_dict(url):
    '''
    This method will fire the query as a web-service call and
    return the results as a dictionary mapping each column name to its
    list of values (see _process_data), or None if there were no hits.
    '''
    try:
        data = urllib2.urlopen(url).read()

        # transform to dictionary:
        result = []
        data_rows = data.split("\n")

        # remove blank lines and comment lines if any (only leave the comment line that has the
        # "molecular-formula" word in it ... compatible with the Kazusa service):
        data_rows_to_remove = []
        for data_row in data_rows:
            if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
                data_rows_to_remove.append(data_row)

        for data_row in data_rows_to_remove:
            data_rows.remove(data_row)

        # check if there is any data in the response:
        if len(data_rows) <= 1 or data_rows[1].strip() == '':
            # means there is only the header row ... so no hits:
            return None

        for data_row in data_rows:
            if not data_row.strip() == '':
                row_as_list = _str_to_list(data_row, delimiter='\t')
                result.append(row_as_list)

        # return result processed into a dict:
        return _process_data(result)

    except urllib2.HTTPError, e:
        raise Exception("HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
    except urllib2.URLError, e:
        raise Exception("Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")

def _str_to_list(data_row, delimiter='\t'):
    # str.split already returns the list of columns:
    return data_row.split(delimiter)

# Alternative for the urllib2 call in _fire_query_and_return_dict, using the
# third-party 'requests' library (a sketch, assuming 'requests' is installed;
# the auth tokens are placeholders):
#
#   import requests
#
#   s = requests.Session()
#   s.verify = False
#   # s.auth = (token01, token02)
#   resp = s.get(url, params={'name': 'anonymous'}, stream=True)
#   content = resp.content
#   # transform to dictionary in the same way as in _fire_query_and_return_dict.

def _merge_data(input_data_record, extra_cols):
    '''
    Adds the extra information to the existing data record and returns
    the combined new record.
    '''
    record = []
    for column in input_data_record:
        record.append(input_data_record[column])

    # add the extra columns:
    for column in extra_cols:
        record.append(column)

    return record

def _save_data(data_rows, headers, out_csv):
    '''
    Writes tab-separated data to file.
    @param data_rows: list of merged/enriched records
    @param headers: list of column names, written as the first line
    @param out_csv: output csv file
    '''
    # Open output file for writing:
    outfile_single_handle = open(out_csv, 'wb')
    output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")

    # Write headers:
    output_single_handle.writerow(headers)

    # Write one line for each row:
    for data_row in data_rows:
        output_single_handle.writerow(data_row)

    outfile_single_handle.close()

def _get_repository_URL(repository_file):
    '''
    Read out and return the URL stored in the given file.
    '''
    file_input = fileinput.input(repository_file)
    try:
        for line in file_input:
            if line[0] != '#':
                # just return the first line that is not a comment line,
                # stripped of its trailing newline so it is usable in a URL:
                return line.strip()
    finally:
        file_input.close()
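
# For illustration, a repository file could look like this (the URL is the
# service base mentioned in the module docstring; '#' lines are skipped):
#
#   # URL of the repository/service to query:
#   http://webs2.kazusa.or.jp/mfsearcher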

def main():
    '''
    Query main function

    The input file can be any tabular file, as long as it contains a column for the molecular mass.
    This column is then used to query against the chosen repository/service database.
    '''
    seconds_start = int(round(time.time()))

    input_file = sys.argv[1]
    molecular_mass_col = sys.argv[2]
    repository_file = sys.argv[3]
    error_margin = float(sys.argv[4])
    margin_unit = sys.argv[5]
    output_result = sys.argv[6]

    # Parse repository_file to find the URL of the service:
    repository_dblink = _get_repository_URL(repository_file)

    # Parse tabular input file into dictionary/array:
    input_data = _process_file(input_file)

    # Query data against repository:
    enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
    headers = input_data.keys() + ['SEARCH hits for ', 'SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
                                   'SEARCH hits: ids', 'SEARCH hits: descriptions', 'Link to SEARCH hits']  # TODO - add min and max formula weight columns

    _save_data(enriched_data, headers, output_result)

    seconds_end = int(round(time.time()))
    print "Took " + str(seconds_end - seconds_start) + " seconds"
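
# Example invocation (the file names are illustrative; in practice the
# arguments are supplied by the Galaxy tool wrapper):
#
#   python this_module.py input.tsv molecular_mass repository_url.txt 1.0 ppm output.tsv
#
# where 'molecular_mass' is the name of the molecular-mass column in input.tsv
# and repository_url.txt contains the base URL of the service to query.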

if __name__ == '__main__':
    main()