Mercurial > repos > pieterlukasse > prims_metabolomics2
comparison query_mass_repos.py @ 0:dffc38727496
initial commit
author | pieter.lukasse@wur.nl |
---|---|
date | Sat, 07 Feb 2015 22:02:00 +0100 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:dffc38727496 |
---|---|
1 #!/usr/bin/env python | |
2 # encoding: utf-8 | |
3 ''' | |
4 Module to query a set of accurate mass values detected by high-resolution mass spectrometers | |
5 against various repositories/services such as METabolomics EXPlorer database or the | |
6 MFSearcher service (http://webs2.kazusa.or.jp/mfsearcher/). | |
7 | |
8 It will take the input file and for each record it will query the | |
9 molecular mass in the selected repository/service. If one or more compounds are found | |
10 then extra information regarding these compounds is added to the output file. | |
11 | |
12 The output file is thus the input file enriched with information about | |
13 related items found in the selected repository/service. | |
14 | |
15 The service should implement the following interface: | |
16 | |
17 http://service_url/mass?targetMs=500&margin=1&marginUnit=ppm&output=txth (txth means there is guaranteed to be a header line before the data) | |
18 | |
19 The output should be tab separated and should contain the following columns (in this order) | |
20 db-name molecular-formula dbe formula-weight id description | |
21 | |
22 | |
23 ''' | |
24 import csv | |
25 import sys | |
26 import fileinput | |
27 import urllib2 | |
28 import time | |
29 from collections import OrderedDict | |
30 | |
31 __author__ = "Pieter Lukasse" | |
32 __contact__ = "pieter.lukasse@wur.nl" | |
33 __copyright__ = "Copyright, 2014, Plant Research International, WUR" | |
34 __license__ = "Apache v2" | |
35 | |
36 def _process_file(in_xsv, delim='\t'): | |
37 ''' | |
38 Generic method to parse a tab-separated file returning a dictionary with named columns | |
39 @param in_csv: input filename to be parsed | |
40 ''' | |
41 data = list(csv.reader(open(in_xsv, 'rU'), delimiter=delim)) | |
42 return _process_data(data) | |
43 | |
44 def _process_data(data): | |
45 | |
46 header = data.pop(0) | |
47 # Create dictionary with column name as key | |
48 output = OrderedDict() | |
49 for index in xrange(len(header)): | |
50 output[header[index]] = [row[index] for row in data] | |
51 return output | |
52 | |
53 | |
54 def _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit): | |
55 | |
56 ''' | |
57 This method will iterate over the record in the input_data and | |
58 will enrich them with the related information found (if any) in the | |
59 chosen repository/service | |
60 | |
61 # TODO : could optimize this with multi-threading, see also nice example at http://stackoverflow.com/questions/2846653/python-multithreading-for-dummies | |
62 ''' | |
63 merged = [] | |
64 | |
65 for i in xrange(len(input_data[input_data.keys()[0]])): | |
66 # Get the record in same dictionary format as input_data, but containing | |
67 # a value at each column instead of a list of all values of all records: | |
68 input_data_record = OrderedDict(zip(input_data.keys(), [input_data[key][i] for key in input_data.keys()])) | |
69 | |
70 # read the molecular mass : | |
71 molecular_mass = input_data_record[molecular_mass_col] | |
72 | |
73 | |
74 # search for related records in repository/service: | |
75 data_found = None | |
76 if molecular_mass != "": | |
77 molecular_mass = float(molecular_mass) | |
78 | |
79 # 1- search for data around this MM: | |
80 query_link = repository_dblink + "/mass?targetMs=" + str(molecular_mass) + "&margin=" + str(error_margin) + "&marginUnit=" + margin_unit + "&output=txth" | |
81 | |
82 data_found = _fire_query_and_return_dict(query_link + "&_format_result=tsv") | |
83 data_type_found = "MM" | |
84 | |
85 | |
86 if data_found == None: | |
87 # If still nothing found, just add empty columns | |
88 extra_cols = ['', '','','','',''] | |
89 else: | |
90 # Add info found: | |
91 extra_cols = _get_extra_info_and_link_cols(data_found, data_type_found, query_link) | |
92 | |
93 # Take all data and merge it into a "flat"/simple array of values: | |
94 field_values_list = _merge_data(input_data_record, extra_cols) | |
95 | |
96 merged.append(field_values_list) | |
97 | |
98 # return the merged/enriched records: | |
99 return merged | |
100 | |
101 | |
102 def _get_extra_info_and_link_cols(data_found, data_type_found, query_link): | |
103 ''' | |
104 This method will go over the data found and will return a | |
105 list with the following items: | |
106 - details of hits found : | |
107 db-name molecular-formula dbe formula-weight id description | |
108 - Link that executes same query | |
109 | |
110 ''' | |
111 | |
112 # set() makes a unique list: | |
113 db_name_set = [] | |
114 molecular_formula_set = [] | |
115 id_set = [] | |
116 description_set = [] | |
117 | |
118 | |
119 if 'db-name' in data_found: | |
120 db_name_set = set(data_found['db-name']) | |
121 elif '# db-name' in data_found: | |
122 db_name_set = set(data_found['# db-name']) | |
123 if 'molecular-formula' in data_found: | |
124 molecular_formula_set = set(data_found['molecular-formula']) | |
125 if 'id' in data_found: | |
126 id_set = set(data_found['id']) | |
127 if 'description' in data_found: | |
128 description_set = set(data_found['description']) | |
129 | |
130 result = [data_type_found, | |
131 _to_xsv(db_name_set), | |
132 _to_xsv(molecular_formula_set), | |
133 _to_xsv(id_set), | |
134 _to_xsv(description_set), | |
135 #To let Excel interpret as link, use e.g. =HYPERLINK("http://stackoverflow.com", "friendly name"): | |
136 "=HYPERLINK(\""+ query_link + "\", \"Link to entries found in DB \")"] | |
137 return result | |
138 | |
139 | |
140 def _to_xsv(data_set): | |
141 result = "" | |
142 for item in data_set: | |
143 result = result + str(item) + "|" | |
144 return result | |
145 | |
146 | |
147 def _fire_query_and_return_dict(url): | |
148 ''' | |
149 This method will fire the query as a web-service call and | |
150 return the results as a list of dictionary objects | |
151 ''' | |
152 | |
153 try: | |
154 data = urllib2.urlopen(url).read() | |
155 | |
156 # transform to dictionary: | |
157 result = [] | |
158 data_rows = data.split("\n") | |
159 | |
160 # remove comment lines if any (only leave the one that has "molecular-formula" word in it...compatible with kazusa service): | |
161 data_rows_to_remove = [] | |
162 for data_row in data_rows: | |
163 if data_row == "" or (data_row[0] == '#' and "molecular-formula" not in data_row): | |
164 data_rows_to_remove.append(data_row) | |
165 | |
166 for data_row in data_rows_to_remove: | |
167 data_rows.remove(data_row) | |
168 | |
169 # check if there is any data in the response: | |
170 if len(data_rows) <= 1 or data_rows[1].strip() == '': | |
171 # means there is only the header row...so no hits: | |
172 return None | |
173 | |
174 for data_row in data_rows: | |
175 if not data_row.strip() == '': | |
176 row_as_list = _str_to_list(data_row, delimiter='\t') | |
177 result.append(row_as_list) | |
178 | |
179 # return result processed into a dict: | |
180 return _process_data(result) | |
181 | |
182 except urllib2.HTTPError, e: | |
183 raise Exception( "HTTP error for URL: " + url + " : %s - " % e.code + e.reason) | |
184 except urllib2.URLError, e: | |
185 raise Exception( "Network error: %s" % e.reason.args[1] + ". Administrator: please check if service [" + url + "] is accessible from your Galaxy server. ") | |
186 | |
187 def _str_to_list(data_row, delimiter='\t'): | |
188 result = [] | |
189 for column in data_row.split(delimiter): | |
190 result.append(column) | |
191 return result | |
192 | |
193 | |
194 # alternative: ? | |
195 # s = requests.Session() | |
196 # s.verify = False | |
197 # #s.auth = (token01, token02) | |
198 # resp = s.get(url, params={'name': 'anonymous'}, stream=True) | |
199 # content = resp.content | |
200 # # transform to dictionary: | |
201 | |
202 | |
203 | |
204 | |
205 def _merge_data(input_data_record, extra_cols): | |
206 ''' | |
207 Adds the extra information to the existing data record and returns | |
208 the combined new record. | |
209 ''' | |
210 record = [] | |
211 for column in input_data_record: | |
212 record.append(input_data_record[column]) | |
213 | |
214 | |
215 # add extra columns | |
216 for column in extra_cols: | |
217 record.append(column) | |
218 | |
219 return record | |
220 | |
221 | |
222 def _save_data(data_rows, headers, out_csv): | |
223 ''' | |
224 Writes tab-separated data to file | |
225 @param data_rows: dictionary containing merged/enriched dataset | |
226 @param out_csv: output csv file | |
227 ''' | |
228 | |
229 # Open output file for writing | |
230 outfile_single_handle = open(out_csv, 'wb') | |
231 output_single_handle = csv.writer(outfile_single_handle, delimiter="\t") | |
232 | |
233 # Write headers | |
234 output_single_handle.writerow(headers) | |
235 | |
236 # Write one line for each row | |
237 for data_row in data_rows: | |
238 output_single_handle.writerow(data_row) | |
239 | |
240 def _get_repository_URL(repository_file): | |
241 ''' | |
242 Read out and return the URL stored in the given file. | |
243 ''' | |
244 file_input = fileinput.input(repository_file) | |
245 try: | |
246 for line in file_input: | |
247 if line[0] != '#': | |
248 # just return the first line that is not a comment line: | |
249 return line | |
250 finally: | |
251 file_input.close() | |
252 | |
253 | |
254 def main(): | |
255 ''' | |
256 Query main function | |
257 | |
258 The input file can be any tabular file, as long as it contains a column for the molecular mass. | |
259 This column is then used to query against the chosen repository/service Database. | |
260 ''' | |
261 seconds_start = int(round(time.time())) | |
262 | |
263 input_file = sys.argv[1] | |
264 molecular_mass_col = sys.argv[2] | |
265 repository_file = sys.argv[3] | |
266 error_margin = float(sys.argv[4]) | |
267 margin_unit = sys.argv[5] | |
268 output_result = sys.argv[6] | |
269 | |
270 # Parse repository_file to find the URL to the service: | |
271 repository_dblink = _get_repository_URL(repository_file) | |
272 | |
273 # Parse tabular input file into dictionary/array: | |
274 input_data = _process_file(input_file) | |
275 | |
276 # Query data against repository : | |
277 enriched_data = _query_and_add_data(input_data, molecular_mass_col, repository_dblink, error_margin, margin_unit) | |
278 headers = input_data.keys() + ['SEARCH hits for ','SEARCH hits: db-names', 'SEARCH hits: molecular-formulas ', | |
279 'SEARCH hits: ids','SEARCH hits: descriptions', 'Link to SEARCH hits'] #TODO - add min and max formula weigth columns | |
280 | |
281 _save_data(enriched_data, headers, output_result) | |
282 | |
283 seconds_end = int(round(time.time())) | |
284 print "Took " + str(seconds_end - seconds_start) + " seconds" | |
285 | |
286 | |
287 | |
288 if __name__ == '__main__': | |
289 main() |