Mercurial > repos > pieterlukasse > prims_metabolomics2
comparison query_metexp.py @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dffc38727496 |
---|---|
1 #!/usr/bin/env python | |
2 # encoding: utf-8 | |
3 ''' | |
4 Module to query a set of identifications against the METabolomics EXPlorer database. | |
5 | |
6 It will take the input file and for each record it will query the | |
7 molecular mass in the selected MetExp DB. If one or more compounds are found in the | |
8 MetExp DB then extra information regarding these compounds is added to the output file. | |
9 | |
10 The output file is thus the input file enriched with information about | |
11 related items found in the selected MetExp DB. | |
12 ''' | |
13 import csv | |
14 import sys | |
15 import fileinput | |
16 import urllib2 | |
17 import time | |
18 from collections import OrderedDict | |
19 | |
20 __author__ = "Pieter Lukasse" | |
21 __contact__ = "pieter.lukasse@wur.nl" | |
22 __copyright__ = "Copyright, 2014, Plant Research International, WUR" | |
23 __license__ = "Apache v2" | |
24 | |
25 def _process_file(in_xsv, delim='\t'): | |
26 ''' | |
27 Generic method to parse a tab-separated file returning a dictionary with named columns | |
28 @param in_csv: input filename to be parsed | |
29 ''' | |
30 data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) | |
31 return _process_data(data) | |
32 | |
33 def _process_data(data): | |
34 | |
35 header = data.pop(0) | |
36 # Create dictionary with column name as key | |
37 output = OrderedDict() | |
38 for index in xrange(len(header)): | |
39 output[header[index]] = [row[index] for row in data] | |
40 return output | |
41 | |
42 | |
43 def _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method): | |
44 ''' | |
45 This method will iterate over the record in the input_data and | |
46 will enrich them with the related information found (if any) in the | |
47 MetExp Database. | |
48 | |
49 # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies | |
50 ''' | |
51 merged = [] | |
52 | |
53 for i in xrange(len(input_data[input_data.keys()[0]])): | |
54 # Get the record in same dictionary format as input_data, but containing | |
55 # a value at each column instead of a list of all values of all records: | |
56 input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) | |
57 | |
58 # read the molecular mass and formula: | |
59 cas_id = input_data_record[casid_col] | |
60 formula = input_data_record[formula_col] | |
61 molecular_mass = input_data_record[molecular_mass_col] | |
62 | |
63 # search for related records in MetExp: | |
64 data_found = None | |
65 if cas_id != "undef": | |
66 # 1- search for other experiments where this CAS id has been found: | |
67 query_link = metexp_dblink + "/find_entries/query?cas_nr="+ cas_id + "&method=" + separation_method | |
68 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") | |
69 data_type_found = "CAS" | |
70 if data_found == None: | |
71 # 2- search for other experiments where this FORMULA has been found: | |
72 query_link = metexp_dblink + "/find_entries/query?molecule_formula="+ formula + "&method=" + separation_method | |
73 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") | |
74 data_type_found = "FORMULA" | |
75 if data_found == None: | |
76 # 3- search for other experiments where this MM has been found: | |
77 query_link = metexp_dblink + "/find_entries/query?molecule_mass="+ molecular_mass + "&method=" + separation_method | |
78 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") | |
79 data_type_found = "MM" | |
80 | |
81 if data_found == None: | |
82 # If still nothing found, just add empty columns | |
83 extra_cols = ['', '','','','','','',''] | |
84 else: | |
85 # Add info found: | |
86 extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) | |
87 | |
88 # Take all data and merge it into a "flat"/simple array of values: | |
89 field_values_list = _merge_data(input_data_record, extra_cols) | |
90 | |
91 merged.append(field_values_list) | |
92 | |
93 # return the merged/enriched records: | |
94 return merged | |
95 | |
96 | |
97 def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): | |
98 ''' | |
99 This method will go over the data found and will return a | |
100 list with the following items: | |
101 - Experiment details where hits have been found : | |
102 'organism', 'tissue','experiment_name','user_name','column_type' | |
103 - Link that executes same query | |
104 | |
105 ''' | |
106 # set() makes a unique list: | |
107 organism_set = [] | |
108 tissue_set = [] | |
109 experiment_name_set = [] | |
110 user_name_set = [] | |
111 column_type_set = [] | |
112 cas_nr_set = [] | |
113 | |
114 if 'organism' in data_found: | |
115 organism_set = set(data_found['organism']) | |
116 if 'tissue' in data_found: | |
117 tissue_set = set(data_found['tissue']) | |
118 if 'experiment_name' in data_found: | |
119 experiment_name_set = set(data_found['experiment_name']) | |
120 if 'user_name' in data_found: | |
121 user_name_set = set(data_found['user_name']) | |
122 if 'column_type' in data_found: | |
123 column_type_set = set(data_found['column_type']) | |
124 if 'CAS' in data_found: | |
125 cas_nr_set = set(data_found['CAS']) | |
126 | |
127 | |
128 result = [data_type_found, | |
129 _to_xsv(organism_set), | |
130 _to_xsv(tissue_set), | |
131 _to_xsv(experiment_name_set), | |
132 _to_xsv(user_name_set), | |
133 _to_xsv(column_type_set), | |
134 _to_xsv(cas_nr_set), | |
135 #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): | |
136 "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] | |
137 return result | |
138 | |
139 | |
140 def _to_xsv(data_set): | |
141 result = "" | |
142 for item in data_set: | |
143 result = result + str(item) + "|" | |
144 return result | |
145 | |
146 | |
147 def _fire_query_and_return_dict(url): | |
148 ''' | |
149 This method will fire the query as a web-service call and | |
150 return the results as a list of dictionary objects | |
151 ''' | |
152 | |
153 try: | |
154 data = urllib2.urlopen(url).read() | |
155 | |
156 # transform to dictionary: | |
157 result = [] | |
158 data_rows = data.split("\n") | |
159 | |
160 # check if there is any data in the response: | |
161 if len(data_rows) <= 1 or data_rows[1].strip() == '': | |
162 # means there is only the header row...so no hits: | |
163 return None | |
164 | |
165 for data_row in data_rows: | |
166 if not data_row.strip() == '': | |
167 row_as_list = _str_to_list(data_row, delimiter='\t') | |
168 result.append(row_as_list) | |
169 | |
170 # return result processed into a dict: | |
171 return _process_data(result) | |
172 | |
173 except urllib2.HTTPError, e: | |
174 raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) | |
175 except urllib2.URLError, e: | |
176 raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if MetExp service [" + url + "] is accessible from your Galaxy server. ") | |
177 | |
178 def _str_to_list(data_row, delimiter='\t'): | |
179 result = [] | |
180 for column in data_row.split(delimiter): | |
181 result.append(column) | |
182 return result | |
183 | |
184 | |
185 # alternative: ? | |
186 # s = requests.Session() | |
187 # s.verify = False | |
188 # #s.auth = (token01, token02) | |
189 # resp = s.get(url, params={'name': 'anonymous'}, stream=True) | |
190 # content = resp.content | |
191 # # transform to dictionary: | |
192 | |
193 | |
194 | |
195 | |
196 def _merge_data(input_data_record, extra_cols): | |
197 ''' | |
198 Adds the extra information to the existing data record and returns | |
199 the combined new record. | |
200 ''' | |
201 record = [] | |
202 for column in input_data_record: | |
203 record.append(input_data_record[column]) | |
204 | |
205 | |
206 # add extra columns | |
207 for column in extra_cols: | |
208 record.append(column) | |
209 | |
210 return record | |
211 | |
212 | |
213 def _save_data(data_rows, headers, out_csv): | |
214 ''' | |
215 Writes tab-separated data to file | |
216 @param data_rows: dictionary containing merged/enriched dataset | |
217 @param out_csv: output csv file | |
218 ''' | |
219 | |
220 # Open output file for writing | |
221 outfile_single_handle = open(out_csv, 'wb') | |
222 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | |
223 | |
224 # Write headers | |
225 output_single_handle.writerow(headers) | |
226 | |
227 # Write one line for each row | |
228 for data_row in data_rows: | |
229 output_single_handle.writerow(data_row) | |
230 | |
231 def _get_metexp_URL(metexp_dblink_file): | |
232 ''' | |
233 Read out and return the URL stored in the given file. | |
234 ''' | |
235 file_input = fileinput.input(metexp_dblink_file) | |
236 try: | |
237 for line in file_input: | |
238 if line[0] != '#': | |
239 # just return the first line that is not a comment line: | |
240 return line | |
241 finally: | |
242 file_input.close() | |
243 | |
244 | |
245 def main(): | |
246 ''' | |
247 MetExp Query main function | |
248 | |
249 The input file can be any tabular file, as long as it contains a column for the molecular mass | |
250 and one for the formula of the respective identification. These two columns are then | |
251 used to query against MetExp Database. | |
252 ''' | |
253 seconds_start = int(round(time.time())) | |
254 | |
255 input_file = sys.argv[1] | |
256 casid_col = sys.argv[2] | |
257 formula_col = sys.argv[3] | |
258 molecular_mass_col = sys.argv[4] | |
259 metexp_dblink_file = sys.argv[5] | |
260 separation_method = sys.argv[6] | |
261 output_result = sys.argv[7] | |
262 | |
263 # Parse metexp_dblink_file to find the URL to the MetExp service: | |
264 metexp_dblink = _get_metexp_URL(metexp_dblink_file) | |
265 | |
266 # Parse tabular input file into dictionary/array: | |
267 input_data = _process_file(input_file) | |
268 | |
269 # Query data against MetExp DB : | |
270 enriched_data = _query_and_add_data(input_data, casid_col, formula_col, molecular_mass_col, metexp_dblink, separation_method) | |
271 headers = input_data.keys() + ['METEXP hits for ','METEXP hits: organisms', 'METEXP hits: tissues', | |
272 'METEXP hits: experiments','METEXP hits: user names','METEXP hits: column types', 'METEXP hits: CAS nrs', 'Link to METEXP hits'] | |
273 | |
274 _save_data(enriched_data, headers, output_result) | |
275 | |
276 seconds_end = int(round(time.time())) | |
277 print "Took " + str(seconds_end - seconds_start) + " seconds" | |
278 | |
279 | |
280 | |
281 if __name__ == '__main__': | |
282 main() |