Mercurial > repos > pieterlukasse > prims_metabolomics2
annotate GCMS/library_lookup.py @ 25:9f03c8587d6b draft default tip
MetExp msclust upload format changed from tabular to csv
author | linda-bakker |
---|---|
date | Fri, 24 Aug 2018 09:56:05 -0400 |
parents | f0c6feab06e7 |
children |
rev | line source |
---|---|
6 | 1 ''' |
2 Logic for searching a Retention Index database file given output from NIST | |
3 ''' | |
22
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
4 import os |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
5 import sys |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
6 import inspect |
6 | 7 import re |
8 import csv | |
9 | |
22
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
10 |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
11 #a not so perfect solution to import parent dir file match_library |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
12 currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
13 parentdir = os.path.dirname(currentdir) |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
14 sys.path.append(parentdir) |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
15 import match_library |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
16 |
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
17 |
6 | 18 __author__ = "Marcel Kempenaar" |
19 __contact__ = "brs@nbic.nl" | |
20 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre" | |
21 __license__ = "MIT" | |
22 | |
23 def create_lookup_table(library_file, column_type_name, statphase): | |
24 ''' | |
25 Creates a dictionary holding the contents of the library to be searched | |
26 @param library_file: library to read | |
27 @param column_type_name: the columns type name | |
28 @param statphase: the columns stationary phase | |
29 ''' | |
30 (data, header) = match_library.read_library(library_file) | |
31 # Test for presence of required columns | |
32 if ('columntype' not in header or | |
33 'columnphasetype' not in header or | |
34 'cas' not in header): | |
20
f70b2c169e3a
fixed select column option
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
6
diff
changeset
|
35 raise IOError('Missing columns (create_lookup_table) in ', library_file) |
6 | 36 |
37 column_type_column = header.index("columntype") | |
38 statphase_column = header.index("columnphasetype") | |
39 cas_column = header.index("cas") | |
40 | |
41 filtered_library = [line for line in data if line[column_type_column] == column_type_name | |
42 and line[statphase_column] == statphase] | |
43 lookup_dict = {} | |
44 for element in filtered_library: | |
45 # Here the cas_number is set to the numeric part of the cas_column value, so if the | |
46 # cas_column value is 'C1433' then cas_number will be '1433' | |
47 cas_number = str(re.findall(r'\d+', (element[cas_column]).strip())[0]) | |
48 try: | |
49 lookup_dict[cas_number].append(element) | |
50 except KeyError: | |
51 lookup_dict[cas_number] = [element] | |
52 return lookup_dict | |
53 | |
54 | |
55 def _preferred(hits, pref, ctype, polar, model, method): | |
56 ''' | |
57 Returns all entries in the lookup_dict that have the same column name, type and polarity | |
58 as given by the user, uses regression if selected given the model and method to use. The | |
59 regression is applied on the column with the best R-squared value in the model | |
60 @param hits: all entries in the lookup_dict for the given CAS number | |
61 @param pref: preferred GC-column, can be one or more names | |
62 @param ctype: column type (capillary etc.) | |
63 @param polar: polarity (polar / non-polar etc.) | |
64 @param model: data loaded from file containing regression models | |
65 @param method: supported regression method (i.e. poly(nomial) or linear) | |
66 ''' | |
67 match = [] | |
68 for column in pref: | |
69 for hit in hits: | |
70 if hit[4] == ctype and hit[5] == polar and hit[6] == column: | |
71 # Create copy of found hit since it will be altered downstream | |
72 match.extend(hit) | |
73 return match, False | |
74 | |
75 # No hit found for current CAS number, return if not performing regression | |
76 if not model: | |
77 return False, False | |
78 | |
79 # Perform regression | |
80 for column in pref: | |
81 if column not in model: | |
82 break | |
83 # Order regression candidates by R-squared value (last element) | |
84 order = sorted(model[column].items(), key=lambda col: col[1][-1]) | |
85 # Create list of regression candidate column names | |
86 regress_columns = list(reversed([column for (column, _) in order])) | |
87 # Names of available columns | |
88 available = [hit[6] for hit in hits] | |
89 | |
90 # TODO: combine Rsquared and number of datapoints to get the best regression match | |
91 ''' | |
92 # Iterate regression columns (in order) and retrieve their models | |
93 models = {} | |
94 for col in regress_columns: | |
95 if col in available: | |
96 hit = list(hits[available.index(col)]) | |
97 if hit[4] == ctype: | |
98 # models contains all model data including residuals [-2] and rsquared [-1] | |
99 models[pref[0]] = model[pref[0]][hit[6]] | |
100 # Get the combined maximum for residuals and rsquared | |
101 best_match = models[] | |
102 # Apply regression | |
103 if method == 'poly': | |
104 regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model) | |
105 if regressed: | |
106 hit[3] = regressed | |
107 else: | |
108 return False, False | |
109 else: | |
110 hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model) | |
111 match.extend(hit) | |
112 return match, hit[6] | |
113 ''' | |
114 | |
115 for col in regress_columns: | |
116 if col in available: | |
117 hit = list(hits[available.index(col)]) | |
118 if hit[4] == ctype: | |
119 # Perform regression using a column for which regression is possible | |
120 if method == 'poly': | |
121 # Polynomial is only possible within a set border, if the RI falls outside | |
122 # of this border, skip this lookup | |
123 regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model) | |
124 if regressed: | |
125 hit[3] = regressed | |
126 else: | |
127 return False, False | |
128 else: | |
129 hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model) | |
130 match.extend(hit) | |
131 return match, hit[6] | |
132 | |
133 return False, False | |
134 | |
135 | |
136 | |
137 def default_hit(row, cas_nr, compound_id): | |
138 ''' | |
139 This method will return a "default"/empty hit for cases where the | |
140 method _preferred() returns False (i.e. a RI could not be found | |
141 for the given cas nr, also not via regression. | |
142 ''' | |
143 return [ | |
144 #'CAS', | |
145 'C' + cas_nr, | |
146 #'NAME', | |
147 '', | |
148 #'FORMULA', | |
149 '', | |
150 #'RI', | |
151 '0.0', | |
152 #'Column.type', | |
153 '', | |
154 #'Column.phase.type', | |
155 '', | |
156 #'Column.name', | |
157 '', | |
158 #'phase.coding', | |
159 ' ', | |
160 #'CAS_column.Name', | |
161 '', | |
162 #'Centrotype', -> NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm used...for now only one MsClust algorithm is used so it is not an issue, but this should be updated/corrected once that changes | |
163 compound_id, | |
164 #'Regression.Column.Name', | |
165 '', | |
166 #'min', | |
167 '', | |
168 #'max', | |
169 '', | |
170 #'nr.duplicates', | |
171 ''] | |
172 | |
173 | |
174 def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method): | |
175 ''' | |
176 Looks up the compounds in the library lookup table and formats the results | |
177 @param lookup_dict: dictionary containing the library to be searched | |
178 @param nist_tabular_filename: NIST output file to be matched | |
179 @param pref: (list of) column-name(s) to look for | |
180 @param ctype: column type of interest | |
181 @param polar: polarity of the used column | |
182 @param model: data loaded from file containing regression models | |
183 @param method: supported regression method (i.e. poly(nomial) or linear) | |
184 ''' | |
185 (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename) | |
186 # Retrieve indices of the CAS and compound_id columns (exit if not present) | |
187 try: | |
188 casi = header_clean.index("cas") | |
189 idi = header_clean.index("id") | |
190 except: | |
191 raise IOError("'CAS' or 'compound_id' not found in header of library file") | |
192 | |
193 data = [] | |
194 for row in nist_tabular_list: | |
195 casf = str(row[casi].replace('-', '').strip()) | |
196 compound_id = str(row[idi].split('-')[0]) | |
197 if casf in lookup_dict: | |
198 found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method) | |
199 if found_hit: | |
200 # Keep cas nr as 'C'+ numeric part: | |
201 found_hit[0] = 'C' + casf | |
202 # Add compound id | |
203 found_hit.insert(9, compound_id) | |
204 # Add information on regression process | |
205 found_hit.insert(10, regress if regress else 'None') | |
206 # Replace column index references with actual number of duplicates | |
207 dups = len(found_hit[-1].split(',')) | |
208 if dups > 1: | |
209 found_hit[-1] = str(dups + 1) | |
210 else: | |
211 found_hit[-1] = '0' | |
212 data.append(found_hit) | |
213 found_hit = '' | |
214 else: | |
215 data.append(default_hit(row, casf, compound_id)) | |
216 else: | |
217 data.append(default_hit(row, casf, compound_id)) | |
218 | |
219 casf = '' | |
220 compound_id = '' | |
221 found_hit = [] | |
222 dups = [] | |
223 return data | |
224 | |
225 | |
226 def _save_data(content, outfile): | |
227 ''' | |
228 Write to output file | |
229 @param content: content to write | |
230 @param outfile: file to write to | |
231 ''' | |
232 # header | |
233 header = ['CAS', | |
234 'NAME', | |
235 'FORMULA', | |
236 'RI', | |
237 'Column.type', | |
238 'Column.phase.type', | |
239 'Column.name', | |
240 'phase.coding', | |
241 'CAS_column.Name', | |
242 'Centrotype', | |
243 'Regression.Column.Name', | |
244 'min', | |
245 'max', | |
246 'nr.duplicates'] | |
247 output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t") | |
248 output_handle.writerow(header) | |
249 for entry in content: | |
250 output_handle.writerow(entry) | |
251 | |
252 | |
253 def _read_model(model_file): | |
254 ''' | |
255 Creates an easy to search dictionary for getting the regression parameters | |
256 for each valid combination of GC-columns | |
257 @param model_file: filename containing the regression models | |
258 ''' | |
259 regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t')) | |
260 if len(regress.pop(0)) > 9: | |
261 method = 'poly' | |
262 else: | |
263 method = 'linear' | |
264 | |
265 model = {} | |
266 # Create new dictionary for each GC-column | |
267 for line in regress: | |
268 model[line[0]] = {} | |
269 | |
270 # Add data | |
271 for line in regress: | |
272 if method == 'poly': | |
273 model[line[0]][line[1]] = [float(col) for col in line[2:11]] | |
274 else: # linear | |
275 model[line[0]][line[1]] = [float(col) for col in line[2:9]] | |
276 | |
277 return model, method | |
278 | |
279 | |
280 def _apply_poly_regression(column1, column2, retention_index, model): | |
281 ''' | |
282 Calculates a new retention index (RI) value using a given 3rd-degree polynomial | |
283 model based on data from GC columns 1 and 2 | |
284 @param column1: name of the selected GC-column | |
285 @param column2: name of the GC-column to use for regression | |
286 @param retention_index: RI to convert | |
287 @param model: dictionary containing model information for all GC-columns | |
288 ''' | |
289 coeff = model[column1][column2] | |
290 # If the retention index to convert is within range of the data the model is based on, perform regression | |
291 if coeff[4] < retention_index < coeff[5]: | |
292 return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) + | |
293 (retention_index * coeff[1]) + coeff[0]) | |
294 else: | |
295 return False | |
296 | |
297 | |
298 def _apply_linear_regression(column1, column2, retention_index, model): | |
299 ''' | |
300 Calculates a new retention index (RI) value using a given linear model based on data | |
301 from GC columns 1 and 2 | |
302 @param column1: name of the selected GC-column | |
303 @param column2: name of the GC-column to use for regression | |
304 @param retention_index: RI to convert | |
305 @param model: dictionary containing model information for all GC-columns | |
306 ''' | |
307 # TODO: No use of limits | |
308 coeff = model[column1][column2] | |
309 return coeff[1] * retention_index + coeff[0] | |
310 | |
311 | |
312 def main(): | |
313 ''' | |
314 Library Lookup main function | |
22
f0c6feab06e7
fixed import match_library
linda.bakker@wur.nl <linda.bakker@wur.nl>
parents:
20
diff
changeset
|
315 ''' |
6 | 316 library_file = sys.argv[1] |
317 nist_tabular_filename = sys.argv[2] | |
318 ctype = sys.argv[3] | |
319 polar = sys.argv[4] | |
320 outfile = sys.argv[5] | |
321 pref = sys.argv[6:-1] | |
322 regress = sys.argv[-1] | |
323 | |
324 if regress != 'False': | |
325 model, method = _read_model(regress) | |
326 else: | |
327 model, method = False, None | |
328 | |
329 lookup_dict = create_lookup_table(library_file, ctype, polar) | |
330 data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method) | |
331 | |
332 _save_data(data, outfile) | |
333 | |
334 | |
335 if __name__ == "__main__": | |
336 main() |