annotate library_lookup.py @ 3:86d41d9c4a91

removed msclust images
author pieter.lukasse@wur.nl
date Thu, 19 Mar 2015 12:09:38 +0100
parents dffc38727496
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
1 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
2 Logic for searching a Retention Index database file given output from NIST
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
3 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
4 import match_library
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
5 import re
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
6 import sys
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
7 import csv
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
8
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
9 __author__ = "Marcel Kempenaar"
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
10 __contact__ = "brs@nbic.nl"
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
11 __copyright__ = "Copyright, 2012, Netherlands Bioinformatics Centre"
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
12 __license__ = "MIT"
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
13
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
def create_lookup_table(library_file, column_type_name, statphase):
    '''
    Creates a dictionary holding the contents of the library to be searched.

    Only library rows whose 'columntype' value equals column_type_name and
    whose 'columnphasetype' value equals statphase are kept. The kept rows are
    grouped by the first run of digits found in their 'cas' value, so a 'cas'
    value of 'C1433' is stored under the key '1433'.
    @param library_file: library to read
    @param column_type_name: the columns type name
    @param statphase: the columns stationary phase
    @return: dictionary mapping the numeric CAS part (string) to the list of
             library rows found for it
    @raise IOError: if a required column is missing from the library header
    '''
    (data, header) = match_library.read_library(library_file)

    # Test for presence of required columns and report which ones are missing
    required = ('columntype', 'columnphasetype', 'cas')
    missing = [name for name in required if name not in header]
    if missing:
        raise IOError('Missing columns in %s: %s' % (library_file, ', '.join(missing)))

    column_type_column = header.index('columntype')
    statphase_column = header.index('columnphasetype')
    cas_column = header.index('cas')

    lookup_dict = {}
    for element in data:
        if (element[column_type_column] != column_type_name or
                element[statphase_column] != statphase):
            continue
        # The key is the numeric part of the cas value ('C1433' -> '1433').
        digits = re.search(r'\d+', element[cas_column])
        if digits is None:
            # A row without any digit in its cas value cannot be keyed, skip it
            # instead of aborting the whole lookup with an IndexError.
            continue
        lookup_dict.setdefault(str(digits.group(0)), []).append(element)
    return lookup_dict
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
44
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
45
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
46 def _preferred(hits, pref, ctype, polar, model, method):
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
47 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
48 Returns all entries in the lookup_dict that have the same column name, type and polarity
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
49 as given by the user, uses regression if selected given the model and method to use. The
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
50 regression is applied on the column with the best R-squared value in the model
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
51 @param hits: all entries in the lookup_dict for the given CAS number
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
52 @param pref: preferred GC-column, can be one or more names
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
53 @param ctype: column type (capillary etc.)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
54 @param polar: polarity (polar / non-polar etc.)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
55 @param model: data loaded from file containing regression models
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
56 @param method: supported regression method (i.e. poly(nomial) or linear)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
57 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
58 match = []
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
59 for column in pref:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
60 for hit in hits:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
61 if hit[4] == ctype and hit[5] == polar and hit[6] == column:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
62 # Create copy of found hit since it will be altered downstream
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
63 match.extend(hit)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
64 return match, False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
65
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
66 # No hit found for current CAS number, return if not performing regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
67 if not model:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
68 return False, False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
69
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
70 # Perform regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
71 for column in pref:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
72 if column not in model:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
73 break
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
74 # Order regression candidates by R-squared value (last element)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
75 order = sorted(model[column].items(), key=lambda col: col[1][-1])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
76 # Create list of regression candidate column names
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
77 regress_columns = list(reversed([column for (column, _) in order]))
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
78 # Names of available columns
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
79 available = [hit[6] for hit in hits]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
80
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
81 # TODO: combine Rsquared and number of datapoints to get the best regression match
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
82 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
83 # Iterate regression columns (in order) and retrieve their models
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
84 models = {}
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
85 for col in regress_columns:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
86 if col in available:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
87 hit = list(hits[available.index(col)])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
88 if hit[4] == ctype:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
89 # models contains all model data including residuals [-2] and rsquared [-1]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
90 models[pref[0]] = model[pref[0]][hit[6]]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
91 # Get the combined maximum for residuals and rsquared
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
92 best_match = models[]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
93 # Apply regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
94 if method == 'poly':
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
95 regressed = _apply_poly_regression(best_match, hit[6], float(hit[3]), model)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
96 if regressed:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
97 hit[3] = regressed
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
98 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
99 return False, False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
100 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
101 hit[3] = _apply_linear_regression(best_match, hit[6], float(hit[3]), model)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
102 match.extend(hit)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
103 return match, hit[6]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
104 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
105
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
106 for col in regress_columns:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
107 if col in available:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
108 hit = list(hits[available.index(col)])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
109 if hit[4] == ctype:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
110 # Perform regression using a column for which regression is possible
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
111 if method == 'poly':
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
112 # Polynomial is only possible within a set border, if the RI falls outside
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
113 # of this border, skip this lookup
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
114 regressed = _apply_poly_regression(pref[0], hit[6], float(hit[3]), model)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
115 if regressed:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
116 hit[3] = regressed
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
117 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
118 return False, False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
119 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
120 hit[3] = _apply_linear_regression(pref[0], hit[6], float(hit[3]), model)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
121 match.extend(hit)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
122 return match, hit[6]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
123
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
124 return False, False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
125
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
126
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
127
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
def default_hit(row, cas_nr, compound_id):
    '''
    Builds the "default"/empty result row that is reported when no retention
    index could be found for the given cas nr (neither directly nor via
    regression).

    The row has 14 fields in this order: CAS, NAME, FORMULA, RI, Column.type,
    Column.phase.type, Column.name, phase.coding, CAS_column.Name, Centrotype,
    Regression.Column.Name, min, max, nr.duplicates.
    @param row: NIST row the hit is created for (not used)
    @param cas_nr: numeric CAS number, stored as 'C' + cas_nr
    @param compound_id: compound id, stored in the Centrotype field
    '''
    # Start from an all-empty row and fill in the few non-empty fields
    empty_hit = [''] * 14
    empty_hit[0] = 'C' + cas_nr   # CAS
    empty_hit[3] = '0.0'          # RI
    empty_hit[7] = ' '            # phase.coding
    # NOTE THAT compound_id is not ALWAYS centrotype...depends on MsClust algorithm
    # used...for now only one MsClust algorithm is used so it is not an issue, but
    # this should be updated/corrected once that changes
    empty_hit[9] = compound_id    # Centrotype
    return empty_hit
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
163
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
164
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
def format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method):
    '''
    Looks up the compounds in the library lookup table and formats the results.

    One result row is produced for every row of the NIST file; rows for which no
    library entry could be found (or regressed) get the row built by default_hit().
    @param lookup_dict: dictionary containing the library to be searched
    @param nist_tabular_filename: NIST output file to be matched
    @param pref: (list of) column-name(s) to look for
    @param ctype: column type of interest
    @param polar: polarity of the used column
    @param model: data loaded from file containing regression models
    @param method: supported regression method (i.e. poly(nomial) or linear)
    @return: list of result rows
    @raise IOError: if the NIST file header has no 'cas' or 'id' column
    '''
    (nist_tabular_list, header_clean) = match_library.read_library(nist_tabular_filename)
    # Retrieve indices of the CAS and compound id columns (exit if not present)
    try:
        casi = header_clean.index("cas")
        idi = header_clean.index("id")
    except ValueError:
        raise IOError("'cas' or 'id' column not found in header of NIST file %s"
                      % nist_tabular_filename)

    data = []
    for row in nist_tabular_list:
        # CAS number without dashes and the part of the id before the first dash
        casf = str(row[casi].replace('-', '').strip())
        compound_id = str(row[idi].split('-')[0])

        found_hit, regress = False, False
        if casf in lookup_dict:
            found_hit, regress = _preferred(lookup_dict[casf], pref, ctype, polar, model, method)

        if not found_hit:
            data.append(default_hit(row, casf, compound_id))
            continue

        # Keep cas nr as 'C'+ numeric part:
        found_hit[0] = 'C' + casf
        # Add compound id
        found_hit.insert(9, compound_id)
        # Add information on regression process
        found_hit.insert(10, regress if regress else 'None')
        # Replace column index references with actual number of duplicates: the last
        # field is split on ','; more than one entry is reported as (count + 1),
        # anything else as '0'.
        # NOTE(review): a single reference is therefore also reported as '0' - confirm
        # this is intended.
        dups = len(found_hit[-1].split(','))
        found_hit[-1] = str(dups + 1) if dups > 1 else '0'
        data.append(found_hit)
    return data
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
215
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
216
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
217 def _save_data(content, outfile):
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
218 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
219 Write to output file
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
220 @param content: content to write
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
221 @param outfile: file to write to
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
222 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
223 # header
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
224 header = ['CAS',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
225 'NAME',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
226 'FORMULA',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
227 'RI',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
228 'Column.type',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
229 'Column.phase.type',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
230 'Column.name',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
231 'phase.coding',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
232 'CAS_column.Name',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
233 'Centrotype',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
234 'Regression.Column.Name',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
235 'min',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
236 'max',
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
237 'nr.duplicates']
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
238 output_handle = csv.writer(open(outfile, 'wb'), delimiter="\t")
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
239 output_handle.writerow(header)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
240 for entry in content:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
241 output_handle.writerow(entry)
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
242
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
243
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
244 def _read_model(model_file):
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
245 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
246 Creates an easy to search dictionary for getting the regression parameters
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
247 for each valid combination of GC-columns
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
248 @param model_file: filename containing the regression models
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
249 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
250 regress = list(csv.reader(open(model_file, 'rU'), delimiter='\t'))
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
251 if len(regress.pop(0)) > 9:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
252 method = 'poly'
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
253 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
254 method = 'linear'
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
255
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
256 model = {}
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
257 # Create new dictionary for each GC-column
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
258 for line in regress:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
259 model[line[0]] = {}
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
260
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
261 # Add data
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
262 for line in regress:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
263 if method == 'poly':
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
264 model[line[0]][line[1]] = [float(col) for col in line[2:11]]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
265 else: # linear
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
266 model[line[0]][line[1]] = [float(col) for col in line[2:9]]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
267
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
268 return model, method
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
269
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
270
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
271 def _apply_poly_regression(column1, column2, retention_index, model):
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
272 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
273 Calculates a new retention index (RI) value using a given 3rd-degree polynomial
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
274 model based on data from GC columns 1 and 2
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
275 @param column1: name of the selected GC-column
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
276 @param column2: name of the GC-column to use for regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
277 @param retention_index: RI to convert
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
278 @param model: dictionary containing model information for all GC-columns
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
279 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
280 coeff = model[column1][column2]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
281 # If the retention index to convert is within range of the data the model is based on, perform regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
282 if coeff[4] < retention_index < coeff[5]:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
283 return (coeff[3] * (retention_index ** 3) + coeff[2] * (retention_index ** 2) +
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
284 (retention_index * coeff[1]) + coeff[0])
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
285 else:
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
286 return False
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
287
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
288
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
289 def _apply_linear_regression(column1, column2, retention_index, model):
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
290 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
291 Calculates a new retention index (RI) value using a given linear model based on data
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
292 from GC columns 1 and 2
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
293 @param column1: name of the selected GC-column
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
294 @param column2: name of the GC-column to use for regression
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
295 @param retention_index: RI to convert
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
296 @param model: dictionary containing model information for all GC-columns
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
297 '''
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
298 # TODO: No use of limits
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
299 coeff = model[column1][column2]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
300 return coeff[1] * retention_index + coeff[0]
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
301
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
302
dffc38727496 initial commit
pieter.lukasse@wur.nl
parents:
diff changeset
def main():
    '''
    Library Lookup main function

    Command line arguments, in this order: library file, NIST tabular file,
    column type, polarity, output file, zero or more preferred column names and
    finally the regression model file (or the literal 'False' to disable
    regression).
    '''
    # Script name + 5 fixed arguments + the regression argument. With fewer
    # arguments the last fixed argument would silently be used as model file.
    if len(sys.argv) < 7:
        sys.exit('Usage: %s <library_file> <nist_tabular_file> <column_type> <polarity> '
                 '<outfile> [<preferred_column> ...] <regression_model_file|False>'
                 % sys.argv[0])

    library_file = sys.argv[1]
    nist_tabular_filename = sys.argv[2]
    ctype = sys.argv[3]
    polar = sys.argv[4]
    outfile = sys.argv[5]
    # Everything between the output file and the last argument is a preferred column
    pref = sys.argv[6:-1]
    regress = sys.argv[-1]

    if regress != 'False':
        model, method = _read_model(regress)
    else:
        model, method = False, None

    lookup_dict = create_lookup_table(library_file, ctype, polar)
    data = format_result(lookup_dict, nist_tabular_filename, pref, ctype, polar, model, method)

    _save_data(data, outfile)


if __name__ == "__main__":
    main()