Mercurial > repos > bgruening > sklearn_data_preprocess
comparison utils.py @ 19:f196d4715cfb draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit d00173591e4a783a4c1cb2664e4bb192ab5414f7
author | bgruening |
---|---|
date | Fri, 17 Aug 2018 12:28:58 -0400 |
parents | |
children | 2bda387c73e4 |
comparison
equal
deleted
inserted
replaced
18:a886cf4c8392 | 19:f196d4715cfb |
---|---|
1 import sys | |
2 import os | |
3 import pandas | |
4 import re | |
5 import pickle | |
6 import warnings | |
7 import numpy as np | |
8 import xgboost | |
9 import scipy | |
10 import sklearn | |
11 import ast | |
12 from asteval import Interpreter, make_symbol_table | |
13 from sklearn import metrics, model_selection, ensemble, svm, linear_model, naive_bayes, tree, neighbors | |
14 | |
# Value handed to estimators/selectors as ``n_jobs``; read from the
# GALAXY_SLOTS environment variable and falling back to 1 when it is unset.
N_JOBS = int(os.environ.get("GALAXY_SLOTS", 1))
16 | |
def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args):
    """Read a delimited file with pandas and return the selected columns.

    Parameters
    ----------
    f : str or file-like
        Handed unchanged to ``pandas.read_csv``.
    c : iterable of int or str, optional
        Column selection. For the ``*_by_index_number`` options it is an
        iterable of 1-based column positions; for the ``*_by_header_name``
        options it is a comma-separated string of header names.
    c_option : str
        One of ``'by_index_number'``, ``'all_but_by_index_number'``,
        ``'by_header_name'`` or ``'all_but_by_header_name'``. Any other
        value leaves the table untouched (all columns are kept).
    return_df : bool
        When true, also return the (filtered) DataFrame.
    **args
        Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns
    -------
    numpy.ndarray, or tuple (numpy.ndarray, pandas.DataFrame)
        ``data.values`` of the selected columns, plus the DataFrame itself
        when ``return_df`` is true.
    """
    data = pandas.read_csv(f, **args)

    # The four options are mutually exclusive, hence a single elif chain.
    if c_option == 'by_index_number':
        # User-facing indices are 1-based; pandas positions are 0-based.
        cols = [idx - 1 for idx in c]
        data = data.iloc[:, cols]
    elif c_option == 'all_but_by_index_number':
        cols = [idx - 1 for idx in c]
        data = data.drop(data.columns[cols], axis=1)
    elif c_option == 'by_header_name':
        cols = [e.strip() for e in c.split(',')]
        data = data[cols]
    elif c_option == 'all_but_by_header_name':
        cols = [e.strip() for e in c.split(',')]
        data = data.drop(cols, axis=1)

    y = data.values
    if return_df:
        return y, data
    return y
37 | |
38 | |
## generate an instance for one of sklearn.feature_selection classes
def feature_selector(inputs):
    """Build an (unfitted) ``sklearn.feature_selection`` selector.

    Parameters
    ----------
    inputs : dict
        Tool parameters. Keys read here:

        * ``'selected_algorithm'`` -- name of a class in
          ``sklearn.feature_selection``.
        * ``'options'`` -- keyword arguments for that class.
        * ``'model_inputter'`` -- only for ``SelectFromModel``; holds
          ``'input_mode'`` and either ``'fitted_estimator'`` (path to a
          pickled model) or ``'estimator_selector'``.
        * ``'estimator_selector'`` -- only for ``RFE`` / ``RFECV``.
        * ``'score_func'`` -- for every other algorithm except
          ``VarianceThreshold``; name of a function in
          ``sklearn.feature_selection``.

    Returns
    -------
    object
        The instantiated selector.
    """
    # A bare ``import sklearn`` does not guarantee that the submodule has
    # been loaded, so import it explicitly before using getattr on it.
    import sklearn.feature_selection

    algorithm = inputs['selected_algorithm']
    selector = getattr(sklearn.feature_selection, algorithm)
    # Work on a shallow copy: the branches below overwrite some entries
    # (threshold, scoring, n_jobs, cv) and the caller's dict must stay as
    # it was so the same ``inputs`` can be processed again.
    options = dict(inputs['options'])

    if algorithm == 'SelectFromModel':
        # An empty value or the literal string 'None' both mean "no threshold".
        if not options['threshold'] or options['threshold'] == 'None':
            options['threshold'] = None
        model_inputter = inputs['model_inputter']
        if model_inputter['input_mode'] == 'prefitted':
            model_file = model_inputter['fitted_estimator']
            # NOTE(security): pickle.load can execute arbitrary code while
            # unpickling; only feed it model files from a trusted source.
            with open(model_file, 'rb') as model_handler:
                fitted_estimator = pickle.load(model_handler)
            new_selector = selector(fitted_estimator, prefit=True, **options)
        else:
            estimator = get_estimator(model_inputter['estimator_selector'])
            new_selector = selector(estimator, **options)

    elif algorithm == 'RFE':
        estimator = get_estimator(inputs['estimator_selector'])
        new_selector = selector(estimator, **options)

    elif algorithm == 'RFECV':
        # Turn the textual/JSON settings into the objects RFECV expects.
        options['scoring'] = get_scoring(options['scoring'])
        options['n_jobs'] = N_JOBS
        options['cv'] = get_cv(options['cv'].strip())
        estimator = get_estimator(inputs['estimator_selector'])
        new_selector = selector(estimator, **options)

    elif algorithm == 'VarianceThreshold':
        new_selector = selector(**options)

    else:
        # Remaining selectors take a scoring function as first argument.
        score_func = getattr(sklearn.feature_selection, inputs['score_func'])
        new_selector = selector(score_func, **options)

    return new_selector
78 | |
79 | |
def get_X_y(params, file1, file2):
    """Load the feature matrix ``X`` and the target vector ``y``.

    Parameters
    ----------
    params : dict
        Tool parameters; everything read here lives under
        ``params['selected_tasks']['selected_algorithms']['input_options']``.
    file1 : str or file-like
        Source of ``X``: a tab-separated table when ``selected_input`` is
        ``'tabular'``, otherwise a file readable by ``scipy.io.mmread``.
    file2 : str or file-like
        Tab-separated table holding the target column(s).

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` has been flattened with ``ravel()``.
    """
    input_options = params["selected_tasks"]["selected_algorithms"]["input_options"]
    # Column-selector modes that come with an explicit column list.
    selecting_options = ["by_index_number", "all_but_by_index_number",
                         "by_header_name", "all_but_by_header_name"]

    if input_options["selected_input"] == "tabular":
        header = 'infer' if input_options["header1"] else None
        selector_1 = input_options["column_selector_options_1"]
        column_option = selector_1["selected_column_selector_option"]
        if column_option in selecting_options:
            c = selector_1["col1"]
        else:
            c = None
        X = read_columns(
            file1,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True
        )
    else:
        # mmread is not imported at module level; import it where it is used
        # so this branch no longer fails with a NameError.
        from scipy.io import mmread
        X = mmread(file1)

    header = 'infer' if input_options["header2"] else None
    selector_2 = input_options["column_selector_options_2"]
    column_option = selector_2["selected_column_selector_option2"]
    if column_option in selecting_options:
        c = selector_2["col2"]
    else:
        c = None
    y = read_columns(
        file2,
        c=c,
        c_option=column_option,
        sep='\t',
        header=header,
        parse_dates=True
    )
    y = y.ravel()
    return X, y
116 | |
117 | |
class SafeEval(Interpreter):
    """asteval ``Interpreter`` configured with a restricted symbol table.

    The symbol table starts from ``make_symbol_table(use_numpy=False)``,
    gains a couple of whitelisted helpers (``np_arange``,
    ``ensemble_ExtraTreesClassifier``) and, on request, the scipy.stats
    distribution objects (prefixed ``scipy_stats_``) and a list of
    ``numpy.random`` functions (prefixed ``np_random_``). A handful of
    builtins listed in ``unwanted`` are then removed.
    """

    def __init__(self, load_scipy=False, load_numpy=False):
        """Build the symbol table and initialise the interpreter.

        Parameters
        ----------
        load_scipy : bool
            Expose every ``rv_continuous`` / ``rv_discrete`` instance found in
            ``scipy.stats.distributions`` as ``scipy_stats_<name>``.
        load_numpy : bool
            Expose the listed ``numpy.random`` members as
            ``np_random_<name>``.
        """
        # File opening and other unneeded functions could be dropped
        unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']

        # Allowed symbol table. Add more if needed.
        new_syms = {
            'np_arange': np.arange,
            'ensemble_ExtraTreesClassifier': ensemble.ExtraTreesClassifier,
        }

        syms = make_symbol_table(use_numpy=False, **new_syms)

        if load_scipy:
            # Import the submodule explicitly; a bare ``import scipy`` does
            # not guarantee that ``scipy.stats`` is loaded.
            from scipy import stats as scipy_stats
            dist_types = (scipy_stats.rv_continuous, scipy_stats.rv_discrete)
            for key, value in scipy_stats.distributions.__dict__.items():
                if isinstance(value, dist_types):
                    syms['scipy_stats_' + key] = value

        if load_numpy:
            from_numpy_random = ['beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division',
                                 'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric',
                                 'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial',
                                 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f',
                                 'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint',
                                 'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh',
                                 'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential',
                                 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
                                 'vonmises', 'wald', 'weibull', 'zipf']
            for name in from_numpy_random:
                # Not every name in this list exists in every NumPy release
                # (e.g. 'division'); skip the missing ones instead of letting
                # getattr raise AttributeError during construction.
                if hasattr(np.random, name):
                    syms['np_random_' + name] = getattr(np.random, name)

        for key in unwanted:
            syms.pop(key, None)

        super(SafeEval, self).__init__(symtable=syms, use_numpy=False, minimal=False,
                                       no_if=True, no_for=True, no_while=True, no_try=True,
                                       no_functiondef=True, no_ifexp=True, no_listcomp=False,
                                       no_augassign=False, no_assert=True, no_delete=True,
                                       no_raise=True, no_print=True)
160 | |
161 | |
def get_search_params(params_builder):
    """Translate the tool's search-parameter entries into a parameter dict.

    Each entry of ``params_builder['param_set']`` carries a
    ``'search_param_selector'`` with a ``'search_p'`` string of the form
    ``"<name>: <expression>"`` and a ``'selected_param_type'``. The
    expression is evaluated with ``SafeEval`` (scipy and numpy symbols
    loaded). Entries whose ``search_p`` is blank are skipped.

    Parameters
    ----------
    params_builder : dict
        Must contain the key ``'param_set'`` (an iterable of dicts).

    Returns
    -------
    dict
        Maps ``"estimator__<name>"`` (for ``final_estimator_p``) or
        ``"preprocessing_<k>__<name>"`` to the evaluated expression, where
        ``<k>`` is the sixth character of ``selected_param_type``.

    Raises
    ------
    AssertionError
        If a non-blank ``search_p`` does not contain exactly one colon.
    """
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)

    for p in params_builder['param_set']:
        selector = p['search_param_selector']
        search_p = selector['search_p']
        if search_p.strip() == '':
            continue
        param_type = selector['selected_param_type']

        lst = search_p.split(":")
        # Explicit raise instead of ``assert`` so the check still runs under
        # ``python -O``; the exception type and message are unchanged.
        if len(lst) != 2:
            raise AssertionError("Error, make sure there is one and only one colon in search parameter input.")
        name = lst[0].strip()
        literal = lst[1].strip()
        ev = safe_eval(literal)
        if param_type == "final_estimator_p":
            search_params["estimator__" + name] = ev
        else:
            search_params["preprocessing_" + param_type[5:6] + "__" + name] = ev

    return search_params
182 | |
183 | |
def get_estimator(estimator_json):
    """Instantiate the estimator described by ``estimator_json``.

    Parameters
    ----------
    estimator_json : dict
        Keys read here:

        * ``'selected_module'`` -- ``"xgboost"`` or the name of an attribute
          of the ``sklearn`` package.
        * ``'selected_estimator'`` -- class name inside that module.
        * ``'text_params'`` -- optional ``key=value, ...`` text that is
          evaluated as ``dict(<text>)`` and passed to ``set_params``.

    Returns
    -------
    object
        The configured estimator. If it exposes an ``n_jobs`` parameter,
        that parameter is set to ``N_JOBS``.
    """
    estimator_module = estimator_json['selected_module']
    estimator_cls = estimator_json['selected_estimator']

    if estimator_module == "xgboost":
        cls = getattr(xgboost, estimator_cls)
    else:
        module = getattr(sklearn, estimator_module)
        cls = getattr(module, estimator_cls)

    estimator = cls()

    estimator_params = estimator_json['text_params'].strip()
    if estimator_params != "":
        # The evaluator has to be created here: no module-level ``safe_eval``
        # exists, so using the bare name raised NameError.
        safe_eval = SafeEval()
        try:
            params = safe_eval('dict(' + estimator_params + ')')
        except ValueError:
            sys.exit("Unsupported parameter input: `%s`" % estimator_params)
        estimator.set_params(**params)
    if 'n_jobs' in estimator.get_params():
        estimator.set_params(n_jobs=N_JOBS)

    return estimator
207 | |
208 | |
def get_cv(literal):
    """Convert the textual CV setting into a value usable as ``cv``.

    Parameters
    ----------
    literal : str
        ``""``, a string of digits, or a call-like expression
        ``Name(arg=value, ...)`` where ``Name`` is an attribute of
        ``sklearn.model_selection``.

    Returns
    -------
    None, int or object
        ``None`` for the empty string, an ``int`` for a digit string,
        otherwise an instance of the named class built from the evaluated
        keyword arguments. Any other input ends the program via
        ``sys.exit``.
    """
    evaluator = SafeEval()
    if literal == "":
        return None
    if literal.isdigit():
        return int(literal)

    matched = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
    if not matched:
        sys.exit("Unsupported CV input: %s" % literal)

    splitter_cls = getattr(model_selection, matched.group('method'))
    # The argument text is evaluated as a dict(...) call to obtain kwargs.
    kwargs = evaluator('dict(' + matched.group('args') + ')')
    return splitter_cls(**kwargs)
221 | |
222 | |
def get_scoring(scoring_json):
    """Resolve the tool's scoring selection into scorer object(s).

    Parameters
    ----------
    scoring_json : dict
        Reads ``'primary_scoring'`` and ``'secondary_scoring'`` (a
        comma-separated string of scorer names, or the string ``'None'``).

    Returns
    -------
    None, scorer or dict
        ``None`` when the primary scoring is ``"default"``. A single scorer
        from ``metrics.SCORERS`` when no distinct secondary scoring is
        given. Otherwise a dict holding the primary scorer under the key
        ``'primary'`` plus one entry per secondary scorer name.
    """
    def balanced_accuracy_score(y_true, y_pred):
        # Mean of the per-class ratios diag(C) / row-sum(C) taken from the
        # confusion matrix; rows that produce NaN are dropped with a warning.
        confusion = metrics.confusion_matrix(y_true, y_pred)
        with np.errstate(divide='ignore', invalid='ignore'):
            per_class = np.diag(confusion) / confusion.sum(axis=1)
        if np.any(np.isnan(per_class)):
            warnings.warn('y_pred contains classes not in y_true')
            per_class = per_class[~np.isnan(per_class)]
        return np.mean(per_class)

    primary_name = scoring_json['primary_scoring']
    if primary_name == "default":
        return None

    # Register the fallback scorer in sklearn's own registry if it is absent.
    my_scorers = metrics.SCORERS
    if 'balanced_accuracy' not in my_scorers:
        my_scorers['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score)

    secondary_names = scoring_json['secondary_scoring']
    if secondary_names == 'None' or secondary_names == primary_name:
        return my_scorers[primary_name]

    scoring = {'primary': my_scorers[primary_name]}
    for scorer in secondary_names.split(','):
        if scorer == primary_name:
            continue
        scoring[scorer] = my_scorers[scorer]
    return scoring
251 |