comparison query_metexp.py @ 0:dffc38727496

initial commit
author pieter.lukasse@wur.nl
date Sat, 07 Feb 2015 22:02:00 +0100
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:dffc38727496
1 #!/usr/bin/env python
2 # encoding: utf-8
3 '''
4 Module to query a set of identifications against the METabolomics EXPlorer database.
5
6 It will take the input file and for each record it will query the
7 molecular mass in the selected MetExp DB. If one or more compounds are found in the
8 MetExp DB then extra information regarding these compounds is added to the output file.
9
10 The output file is thus the input file enriched with information about
11 related items found in the selected MetExp DB.
12 '''
13 import csv
14 import sys
15 import fileinput
16 import urllib2
17 import time
18 from collections import OrderedDict
19
20 __author__ = "Pieter Lukasse"
21 __contact__ = "pieter.lukasse@wur.nl"
22 __copyright__ = "Copyright, 2014, Plant Research International, WUR"
23 __license__ = "Apache v2"
24
25 def _process_file(in_xsv, delim='\t'):
26 '''
27 Generic method to parse a tab-separated file returning a dictionary with named columns
28 @param in_csv: input filename to be parsed
29 '''
30 data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim))
31 return _process_data(data)
32
33 def _process_data(data):
34
35 header = data.pop(0)
36 # Create dictionary with column name as key
37 output = OrderedDict()
38 for index in xrange(len(header)):
39 output[header[index]] = [row[index] for row in data]
40 return output
41
42
43 def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method):
44 '''
45 This method will iterate over the record in the input_data and
46 will enrich them with the related information found (if any) in the
47 MetExp Database.
48
49 # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies
50 '''
51 merged = []
52
53 for i in xrange(len(input_data[input_data.keys()[0]])):
54 # Get the record in same dictionary format as input_data, but containing
55 # a value at each column instead of a list of all values of all records:
56 input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()]))
57
58 # read the molecular mass and formula:
59 cas_id = input_data_record[casid_col]
60 formula = input_data_record[formula_col]
61 molecular_mass = input_data_record[molecular_mass_col]
62
63 # search for related records in MetExp:
64 data_found = None
65 if cas_id != "undef":
66 # 1- search for other experiments where this CAS id has been found:
67 query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method
68 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
69 data_type_found = "CAS"
70 if data_found == None:
71 # 2- search for other experiments where this FORMULA has been found:
72 query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method
73 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
74 data_type_found = "FORMULA"
75 if data_found == None:
76 # 3- search for other experiments where this MM has been found:
77 query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method
78 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv")
79 data_type_found = "MM"
80
81 if data_found == None:
82 # If still nothing found, just add empty columns
83 extra_cols = ['', '','','','','','','']
84 else:
85 # Add info found:
86 extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link)
87
88 # Take all data and merge it into a "flat"/simple array of values:
89 field_values_list = _merge_data(input_data_record, extra_cols)
90
91 merged.append(field_values_list)
92
93 # return the merged/enriched records:
94 return merged
95
96
97 def _get_extra_info_and_link_cols(data_found, data_type_found, query_link):
98 '''
99 This method will go over the data found and will return a
100 list with the following items:
101 - Experiment details where hits have been found :
102 'organism', 'tissue','experiment_name','user_name','column_type'
103 - Link that executes same query
104
105 '''
106 # set() makes a unique list:
107 organism_set = []
108 tissue_set = []
109 experiment_name_set = []
110 user_name_set = []
111 column_type_set = []
112 cas_nr_set = []
113
114 if 'organism' in data_found:
115 organism_set = set(data_found['organism'])
116 if 'tissue' in data_found:
117 tissue_set = set(data_found['tissue'])
118 if 'experiment_name' in data_found:
119 experiment_name_set = set(data_found['experiment_name'])
120 if 'user_name' in data_found:
121 user_name_set = set(data_found['user_name'])
122 if 'column_type' in data_found:
123 column_type_set = set(data_found['column_type'])
124 if 'CAS' in data_found:
125 cas_nr_set = set(data_found['CAS'])
126
127
128 result = [data_type_found,
129 _to_xsv(organism_set),
130 _to_xsv(tissue_set),
131 _to_xsv(experiment_name_set),
132 _to_xsv(user_name_set),
133 _to_xsv(column_type_set),
134 _to_xsv(cas_nr_set),
135 #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"):
136 "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"]
137 return result
138
139
140 def _to_xsv(data_set):
141 result = ""
142 for item in data_set:
143 result = result + str(item) + "|"
144 return result
145
146
147 def _fire_query_and_return_dict(url):
148 '''
149 This method will fire the query as a web-service call and
150 return the results as a list of dictionary objects
151 '''
152
153 try:
154 data = urllib2.urlopen(url).read()
155
156 # transform to dictionary:
157 result = []
158 data_rows = data.split("\n")
159
160 # check if there is any data in the response:
161 if len(data_rows) <= 1 or data_rows[1].strip() == '':
162 # means there is only the header row...so no hits:
163 return None
164
165 for data_row in data_rows:
166 if not data_row.strip() == '':
167 row_as_list = _str_to_list(data_row, delimiter='\t')
168 result.append(row_as_list)
169
170 # return result processed into a dict:
171 return _process_data(result)
172
173 except urllib2.HTTPError, e:
174 raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason)
175 except urllib2.URLError, e:
176 raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ")
177
178 def _str_to_list(data_row, delimiter='\t'):
179 result = []
180 for column in data_row.split(delimiter):
181 result.append(column)
182 return result
183
184
185 # alternative: ?
186 # s = requests.Session()
187 # s.verify = False
188 # #s.auth = (token01, token02)
189 # resp = s.get(url, params={'name': 'anonymous'}, stream=True)
190 # content = resp.content
191 # # transform to dictionary:
192
193
194
195
196 def _merge_data(input_data_record, extra_cols):
197 '''
198 Adds the extra information to the existing data record and returns
199 the combined new record.
200 '''
201 record = []
202 for column in input_data_record:
203 record.append(input_data_record[column])
204
205
206 # add extra columns
207 for column in extra_cols:
208 record.append(column)
209
210 return record
211
212
213 def _save_data(data_rows, headers, out_csv):
214 '''
215 Writes tab-separated data to file
216 @param data_rows: dictionary containing merged/enriched dataset
217 @param out_csv: output csv file
218 '''
219
220 # Open output file for writing
221 outfile_single_handle = open(out_csv, 'wb')
222 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t")
223
224 # Write headers
225 output_single_handle.writerow(headers)
226
227 # Write one line for each row
228 for data_row in data_rows:
229 output_single_handle.writerow(data_row)
230
231 def _get_metexp_URL(metexp_dblink_file):
232 '''
233 Read out and return the URL stored in the given file.
234 '''
235 file_input = fileinput.input(metexp_dblink_file)
236 try:
237 for line in file_input:
238 if line[0] != '#':
239 # just return the first line that is not a comment line:
240 return line
241 finally:
242 file_input.close()
243
244
245 def main():
246 '''
247 MetExp Query main function
248
249 The input file can be any tabular file, as long as it contains a column for the molecular mass
250 and one for the formula of the respective identification. These two columns are then
251 used to query against MetExp Database.
252 '''
253 seconds_start = int(round(time.time()))
254
255 input_file = sys.argv[1]
256 casid_col = sys.argv[2]
257 formula_col = sys.argv[3]
258 molecular_mass_col = sys.argv[4]
259 metexp_dblink_file = sys.argv[5]
260 separation_method = sys.argv[6]
261 output_result = sys.argv[7]
262
263 # Parse metexp_dblink_file to find the URL to the MetExp service:
264 metexp_dblink = _get_metexp_URL(metexp_dblink_file)
265
266 # Parse tabular input file into dictionary/array:
267 input_data = _process_file(input_file)
268
269 # Query data against MetExp DB :
270 enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method)
271 headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues',
272 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits']
273
274 _save_data(enriched_data, headers, output_result)
275
276 seconds_end = int(round(time.time()))
277 print "Took " + str(seconds_end - seconds_start) + " seconds"
278
279
280
281 if __name__ == '__main__':
282 main()