comparison query_mass_repos.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dffc38727496
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 Module to query a set of accurate mass values detected by high-resolution mass spectrometers
5 against various repositories/services such as METabolomics EXPlorer database or the
6 MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/).
7
8 It will take the input file and for each record it will query the
9 molecular mass in the selected repository/service. If one or more compounds are found
10 then extra information regarding these compounds is added to the output file.
11
12 The output file is thus the input file enriched with information about
13 related items found in the selected repository/service.
14
15 The service should implement the following interface:
16
17 http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data)
18
19 The output should be tab separated and should contain the following columns (in this order)
20 db-name molecular-formula dbe formula-weight id description
21
22
23 '''
24 import csv
25 import sys
26 import fileinput
27 import urllib2
28 import time
29 from collections import OrderedDict
30
31 __author__ = "Pieter Lukasse"
32 __contact__ = "pieter.lukasse@wur.nl"
33 __copyright__ = "Copyright, 2014, Plant Research International, WUR"
34 __license__ = "Apache v2"
35
36 def _process_file(in_xsv, delim='\t'):
37 '''
38 Generic method to parse a tab-separated file returning a dictionary with named columns
39 @param in_csv: input filename to be parsed
40 '''
41 data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
42 return _process_data(data)
43
44 def _process_data(data):
45
46 header = data.pop(0)
47 # Create dictionary with column name as key
48 output = OrderedDict()
49 for index in xrange(len(header)):
50 output[header[index]] = [row[index] for row in data]
51 return output
52
53
54 def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit):
55
56 '''
57 This method will iterate over the record in the input_data and
58 will enrich them with the related information found (if any) in the
59 chosen repository/service
60
61 # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
62 '''
63 merged = []
64
65 for i in xrange(len(input_data[input_data.keys()[0]])):
66 # Get the record in same dictionary format as input_data, but containing
67 # a value at each column instead of a list of all values of all records:
68 input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
69
70 # read the molecular mass :
71 molecular_mass = input_data_record[molecular_mass_col]
72
73
74 # search for related records in repository/service:
75 data_found = None
76 if molecular_mass != "":
77 molecular_mass = float(molecular_mass)
78
79 # 1- search for data around this MM:
80 query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth"
81
82 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
83 data_type_found = "MM"
84
85
86 if data_found == None:
87 # If still nothing found, just add empty columns
88 extra_cols = ['', '','','','','']
89 else:
90 # Add info found:
91 extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
92
93 # Take all data and merge it into a "flat"/simple array of values:
94 field_values_list = _merge_data(input_data_record, extra_cols)
95
96 merged.append(field_values_list)
97
98 # return the merged/enriched records:
99 return merged
100
101
102 def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
103 '''
104 This method will go over the data found and will return a
105 list with the following items:
106 - details of hits found :
107 db-name molecular-formula dbe formula-weight id description
108 - Link that executes same query
109
110 '''
111
112 # set() makes a unique list:
113 db_name_set = []
114 molecular_formula_set = []
115 id_set = []
116 description_set = []
117
118
119 if 'db-name' in data_found:
120 db_name_set = set(data_found['db-name'])
121 elif '# db-name' in data_found:
122 db_name_set = set(data_found['# db-name'])
123 if 'molecular-formula' in data_found:
124 molecular_formula_set = set(data_found['molecular-formula'])
125 if 'id' in data_found:
126 id_set = set(data_found['id'])
127 if 'description' in data_found:
128 description_set = set(data_found['description'])
129
130 result = [data_type_found,
131 _to_xsv(db_name_set),
132 _to_xsv(molecular_formula_set),
133 _to_xsv(id_set),
134 _to_xsv(description_set),
135 #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
136 "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
137 return result
138
139
140 def _to_xsv(data_set):
141 result = ""
142 for item in data_set:
143 result = result + str(item) + "|"
144 return result
145
146
147 def _fire_query_and_return_dict(url):
148 '''
149 This method will fire the query as a web-service call and
150 return the results as a list of dictionary objects
151 '''
152
153 try:
154 data = urllib2.urlopen(url).read()
155
156 # transform to dictionary:
157 result = []
158 data_rows = data.split("\n")
159
160 # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service):
161 data_rows_to_remove = []
162 for data_row in data_rows:
163 if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row):
164 data_rows_to_remove.append(data_row)
165
166 for data_row in data_rows_to_remove:
167 data_rows.remove(data_row)
168
169 # check if there is any data in the response:
170 if len(data_rows) <= 1 or data_rows[1].strip() == '':
171 # means there is only the header row...so no hits:
172 return None
173
174 for data_row in data_rows:
175 if not data_row.strip() == '':
176 row_as_list = _str_to_list(data_row, delimiter='\t')
177 result.append(row_as_list)
178
179 # return result processed into a dict:
180 return _process_data(result)
181
182 except urllib2.HTTPError, e:
183 raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
184 except urllib2.URLError, e:
185 raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ")
186
187 def _str_to_list(data_row, delimiter='\t'):
188 result = []
189 for column in data_row.split(delimiter):
190 result.append(column)
191 return result
192
193
194 # alternative: ?
195 # s = requests.Session()
196 # s.verify = False
197 # #s.auth = (token01, token02)
198 # resp = s.get(url, params={'name': 'anonymous'}, stream=True)
199 # content = resp.content
200 # # transform to dictionary:
201
202
203
204
205 def _merge_data(input_data_record, extra_cols):
206 '''
207 Adds the extra information to the existing data record and returns
208 the combined new record.
209 '''
210 record = []
211 for column in input_data_record:
212 record.append(input_data_record[column])
213
214
215 # add extra columns
216 for column in extra_cols:
217 record.append(column)
218
219 return record
220
221
222 def _save_data(data_rows, headers, out_csv):
223 '''
224 Writes tab-separated data to file
225 @param data_rows: dictionary containing merged/enriched dataset
226 @param out_csv: output csv file
227 '''
228
229 # Open output file for writing
230 outfile_single_handle = open(out_csv, 'wb')
231 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
232
233 # Write headers
234 output_single_handle.writerow(headers)
235
236 # Write one line for each row
237 for data_row in data_rows:
238 output_single_handle.writerow(data_row)
239
240 def _get_repository_URL(repository_file):
241 '''
242 Read out and return the URL stored in the given file.
243 '''
244 file_input = fileinput.input(repository_file)
245 try:
246 for line in file_input:
247 if line[0] != '#':
248 # just return the first line that is not a comment line:
249 return line
250 finally:
251 file_input.close()
252
253
254 def main():
255 '''
256 Query main function
257
258 The input file can be any tabular file, as long as it contains a column for the molecular mass.
259 This column is then used to query against the chosen repository/service Database.
260 '''
261 seconds_start = int(round(time.time()))
262
263 input_file = sys.argv[1]
264 molecular_mass_col = sys.argv[2]
265 repository_file = sys.argv[3]
266 error_margin = float(sys.argv[4])
267 margin_unit = sys.argv[5]
268 output_result = sys.argv[6]
269
270 # Parse repository_file to find the URL to the service:
271 repository_dblink = _get_repository_URL(repository_file)
272
273 # Parse tabular input file into dictionary/array:
274 input_data = _process_file(input_file)
275
276 # Query data against repository :
277 enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit)
278 headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ',
279 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns
280
281 _save_data(enriched_data, headers, output_result)
282
283 seconds_end = int(round(time.time()))
284 print "Took " + str(seconds_end - seconds_start) + " seconds"
285
286
287
288 if __name__ == '__main__':
289 main()