search_model_validation.py @ 23:d6b8103c909c draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d
author bgruening
date Sun, 30 Dec 2018 01:58:00 -0500
parents
children 9e43ee712723
import json
import os
import pickle
import sys
import warnings

import imblearn
import numpy as np
import pandas
import skrebate
import sklearn
import xgboost
from imblearn import under_sampling, over_sampling, combine
from imblearn.pipeline import Pipeline as imbPipeline
from scipy.io import mmread
from sklearn import (cluster, compose, decomposition, ensemble,
                     feature_extraction, feature_selection, gaussian_process,
                     kernel_approximation, metrics, model_selection,
                     naive_bayes, neighbors, pipeline, preprocessing,
                     svm, linear_model, tree, discriminant_analysis)
from sklearn.exceptions import FitFailedWarning
from sklearn.externals import joblib
from utils import get_cv, get_scoring, get_X_y, load_model, read_columns, SafeEval


# Parallelism: Galaxy reports the number of allotted cores through the
# GALAXY_SLOTS environment variable; fall back to a single worker.
N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))


def get_search_params(params_builder):
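    """Translate the Galaxy tool's parameter-search repeat block into a
    dict understood by sklearn's hyperparameter searchers.

    Illustrative sketch of the expected input (key names follow the code
    below; real values are produced by the Galaxy tool form):

        params_builder = {'param_set': [
            {'search_param_selector': {
                'search_p': 'C: [1, 10, 100]',
                'selected_param_type': 'final_estimator_p'}}]}

    which yields {'estimator__C': [1, 10, 100]}.
    """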
    search_params = {}
    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
    safe_eval_es = SafeEval(load_estimators=True)

    for p in params_builder['param_set']:
        search_p = p['search_param_selector']['search_p']
        if search_p.strip() == '':
            continue
        param_type = p['search_param_selector']['selected_param_type']

        lst = search_p.split(':')
        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
        literal = lst[1].strip()
        param_name = lst[0].strip()
        if param_name:
            if param_name.lower() == 'n_jobs':
                sys.exit("Parameter `%s` is invalid for search." % param_name)
            elif not param_name.endswith('-'):
                ev = safe_eval(literal)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6]
                                  + '__' + param_name] = ev
            else:
                # Only for estimator eval: a trailing `-` on the param name
                # marks the literal as a list of estimator objects.
                # TODO: maybe add a regular expression check
                ev = safe_eval_es(literal)
                for obj in ev:
                    if 'n_jobs' in obj.get_params():
                        obj.set_params(n_jobs=N_JOBS)
                if param_type == 'final_estimator_p':
                    search_params['estimator__' + param_name[:-1]] = ev
                else:
                    search_params['preprocessing_' + param_type[5:6]
                                  + '__' + param_name[:-1]] = ev
            elif param_type != 'final_estimator_p':
                # TODO: regular expression check?
                ev = safe_eval_es(literal)
                preprocessors = [
                    preprocessing.StandardScaler(), preprocessing.Binarizer(),
                    preprocessing.Imputer(), preprocessing.MaxAbsScaler(),
                    preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
                    preprocessing.PolynomialFeatures(),
                    preprocessing.RobustScaler(),
                    feature_selection.SelectKBest(),
                    feature_selection.GenericUnivariateSelect(),
                    feature_selection.SelectPercentile(),
                    feature_selection.SelectFpr(),
                    feature_selection.SelectFdr(),
                    feature_selection.SelectFwe(),
                    feature_selection.VarianceThreshold(),
                    decomposition.FactorAnalysis(random_state=0),
                    decomposition.FastICA(random_state=0),
                    decomposition.IncrementalPCA(),
                    decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS),
                    decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
                    decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
                    decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS),
                    decomposition.NMF(random_state=0),
                    decomposition.PCA(random_state=0),
                    decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
                    decomposition.TruncatedSVD(random_state=0),
                    kernel_approximation.Nystroem(random_state=0),
                    kernel_approximation.RBFSampler(random_state=0),
                    kernel_approximation.AdditiveChi2Sampler(),
                    kernel_approximation.SkewedChi2Sampler(random_state=0),
                    cluster.FeatureAgglomeration(),
                    skrebate.ReliefF(n_jobs=N_JOBS),
                    skrebate.SURF(n_jobs=N_JOBS),
                    skrebate.SURFstar(n_jobs=N_JOBS),
                    skrebate.MultiSURF(n_jobs=N_JOBS),
                    skrebate.MultiSURFstar(n_jobs=N_JOBS),
                    imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
                    imblearn.under_sampling.RandomUnderSampler(random_state=0),
                    imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
                    imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
                    imblearn.over_sampling.RandomOverSampler(random_state=0),
                    imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
                    imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
                    imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
                    imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
                    imblearn.combine.SMOTEENN(random_state=0),
                    imblearn.combine.SMOTETomek(random_state=0)]
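                # The named shortcuts handled below expand to contiguous
                # slices of the `preprocessors` list above:
                #   sk_prep_all -> [0:8]   scalers / binarizer / imputer
                #   fs_all      -> [8:15]  feature selectors
                #   decomp_all  -> [15:26] decompositions
                #   k_appr_all  -> [26:30] kernel approximations
                #   reb_all     -> [31:36] skrebate relief-based selectors
                #   imb_all     -> [36:55] imblearn samplers
                #   all_0       -> [0:36]  everything except imblearn samplers
                # (index 30, cluster.FeatureAgglomeration, is reachable only
                # via all_0 or an explicit integer index)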
                newlist = []
                for obj in ev:
                    if obj is None:
                        newlist.append(None)
                    elif obj == 'all_0':
                        newlist.extend(preprocessors[0:36])
                    elif obj == 'sk_prep_all':    # no KernelCenterer()
                        newlist.extend(preprocessors[0:8])
                    elif obj == 'fs_all':
                        newlist.extend(preprocessors[8:15])
                    elif obj == 'decomp_all':
                        newlist.extend(preprocessors[15:26])
                    elif obj == 'k_appr_all':
                        newlist.extend(preprocessors[26:30])
                    elif obj == 'reb_all':
                        newlist.extend(preprocessors[31:36])
                    elif obj == 'imb_all':
                        newlist.extend(preprocessors[36:55])
                    elif type(obj) is int and -1 < obj < len(preprocessors):
                        newlist.append(preprocessors[obj])
                    elif hasattr(obj, 'get_params'):    # user object
                        if 'n_jobs' in obj.get_params():
                            newlist.append(obj.set_params(n_jobs=N_JOBS))
                        else:
                            newlist.append(obj)
                    else:
                        sys.exit("Unsupported preprocessor type: %r" % obj)
                search_params['preprocessing_' + param_type[5:6]] = newlist
            else:
                sys.exit("Parameter name of the final estimator can't be skipped!")

    return search_params


if __name__ == '__main__':

    warnings.simplefilter('ignore')

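    # Expected invocation (sketch, inferred from the argument parsing below):
    #   python search_model_validation.py inputs.json pipeline_file \
    #       X_file y_file results.tsv [best_estimator_file]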
    input_json_path = sys.argv[1]
    with open(input_json_path, 'r') as param_handler:
        params = json.load(param_handler)

    infile_pipeline = sys.argv[2]
    infile1 = sys.argv[3]
    infile2 = sys.argv[4]
    outfile_result = sys.argv[5]
    if len(sys.argv) > 6:
        outfile_estimator = sys.argv[6]
    else:
        outfile_estimator = None

    params_builder = params['search_schemes']['search_params_builder']

    input_type = params['input_options']['selected_input']
    if input_type == 'tabular':
        header = 'infer' if params['input_options']['header1'] else None
        column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
        if column_option in ['by_index_number', 'all_but_by_index_number',
                             'by_header_name', 'all_but_by_header_name']:
            c = params['input_options']['column_selector_options_1']['col1']
        else:
            c = None
        X = read_columns(
            infile1,
            c=c,
            c_option=column_option,
            sep='\t',
            header=header,
            parse_dates=True)
    else:
        # Sparse input in Matrix Market format
        X = mmread(open(infile1, 'r'))

    header = 'infer' if params['input_options']['header2'] else None
    column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2']
    if column_option in ['by_index_number', 'all_but_by_index_number',
                         'by_header_name', 'all_but_by_header_name']:
        c = params['input_options']['column_selector_options_2']['col2']
    else:
        c = None
    y = read_columns(
        infile2,
        c=c,
        c_option=column_option,
        sep='\t',
        header=header,
        parse_dates=True)
    y = y.ravel()

    optimizer = params['search_schemes']['selected_search_scheme']
    optimizer = getattr(model_selection, optimizer)

    options = params['search_schemes']['options']
    splitter, groups = get_cv(options.pop('cv_selector'))
    if groups is None:
        options['cv'] = splitter
    elif groups == '':
        options['cv'] = list(splitter.split(X, y, groups=None))
    else:
        options['cv'] = list(splitter.split(X, y, groups=groups))
    options['n_jobs'] = N_JOBS
    primary_scoring = options['scoring']['primary_scoring']
    options['scoring'] = get_scoring(options['scoring'])
    if options['error_score']:
        options['error_score'] = 'raise'
    else:
        options['error_score'] = np.NaN
    if options['refit'] and isinstance(options['scoring'], dict):
        options['refit'] = 'primary'
    if 'pre_dispatch' in options and options['pre_dispatch'] == '':
        options['pre_dispatch'] = None

    with open(infile_pipeline, 'rb') as pipeline_handler:
        pipeline = load_model(pipeline_handler)

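    # Sketch of what the construction below amounts to when, for example,
    # GridSearchCV is the selected scheme (parameter values hypothetical):
    #   searcher = model_selection.GridSearchCV(
    #       pipeline, {'estimator__C': [1, 10, 100]},
    #       cv=options['cv'], n_jobs=N_JOBS, scoring=options['scoring'],
    #       error_score=options['error_score'])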
    search_params = get_search_params(params_builder)
    searcher = optimizer(pipeline, search_params, **options)

    if options['error_score'] == 'raise':
        searcher.fit(X, y)
    else:
        # Record fit failures as warnings instead of raising, so the
        # remaining parameter settings are still evaluated.
        warnings.simplefilter('always', FitFailedWarning)
        with warnings.catch_warnings(record=True) as w:
            try:
                searcher.fit(X, y)
            except ValueError:
                pass
            for warning in w:
                print(repr(warning.message))

    cv_result = pandas.DataFrame(searcher.cv_results_)
    cv_result.rename(inplace=True,
                     columns={'mean_test_primary': 'mean_test_' + primary_scoring,
                              'rank_test_primary': 'rank_test_' + primary_scoring})
    cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)

    if outfile_estimator:
        with open(outfile_estimator, 'wb') as output_handler:
            pickle.dump(searcher.best_estimator_, output_handler,
                        pickle.HIGHEST_PROTOCOL)
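
    # To reuse the saved best estimator later (sketch; `X_new` hypothetical):
    #   with open(outfile_estimator, 'rb') as f:
    #       best_estimator = pickle.load(f)
    #   predictions = best_estimator.predict(X_new)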