comparison utils.py @ 24:97b467e06354 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author bgruening
date Tue, 14 May 2019 18:07:39 -0400
parents 4ba68dd788b3
children
comparison
equal deleted inserted replaced
23:4ba68dd788b3 24:97b467e06354
1 import ast
1 import json 2 import json
3 import imblearn
2 import numpy as np 4 import numpy as np
3 import os
4 import pandas 5 import pandas
5 import pickle 6 import pickle
6 import re 7 import re
7 import scipy 8 import scipy
8 import sklearn 9 import sklearn
10 import skrebate
9 import sys 11 import sys
10 import warnings 12 import warnings
11 import xgboost 13 import xgboost
12 14
15 from collections import Counter
13 from asteval import Interpreter, make_symbol_table 16 from asteval import Interpreter, make_symbol_table
14 from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction, 17 from imblearn import under_sampling, over_sampling, combine
15 feature_selection, gaussian_process, kernel_approximation, metrics, 18 from imblearn.pipeline import Pipeline as imbPipeline
16 model_selection, naive_bayes, neighbors, pipeline, preprocessing, 19 from mlxtend import regressor, classifier
17 svm, linear_model, tree, discriminant_analysis) 20 from scipy.io import mmread
21 from sklearn import (
22 cluster, compose, decomposition, ensemble, feature_extraction,
23 feature_selection, gaussian_process, kernel_approximation, metrics,
24 model_selection, naive_bayes, neighbors, pipeline, preprocessing,
25 svm, linear_model, tree, discriminant_analysis)
18 26
19 try: 27 try:
20 import skrebate 28 import iraps_classifier
21 except ModuleNotFoundError: 29 except ImportError:
22 pass 30 pass
23 31
24
25 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
26
27 try: 32 try:
28 sk_whitelist 33 import model_validations
29 except NameError: 34 except ImportError:
30 sk_whitelist = None 35 pass
31 36
32 37 try:
33 class SafePickler(pickle.Unpickler): 38 import feature_selectors
34 """ 39 except ImportError:
35 Used to safely deserialize scikit-learn model objects serialized by cPickle.dump 40 pass
41
42 try:
43 import preprocessors
44 except ImportError:
45 pass
46
47 # handle pickle white list file
48 WL_FILE = __import__('os').path.join(
49 __import__('os').path.dirname(__file__), 'pk_whitelist.json')
50
51 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1))
52
53
class _SafePickler(pickle.Unpickler, object):
    """
    Used to safely deserialize scikit-learn model objects.

    Every global referenced by the pickle stream is resolved through
    `find_class`, which rejects blacklisted names, resolves Galaxy-ML
    custom modules only when they are already imported, and otherwise
    accepts names from a fixed set of library namespaces.

    Usage:
        eg.: _SafePickler(pickled_file_object).load()
    """
    def __init__(self, file):
        super(_SafePickler, self).__init__(file)
        # load global white list
        with open(WL_FILE, 'r') as f:
            self.pk_whitelist = json.load(f)

        # attribute/global names that are never resolved
        self.bad_names = (
            'and', 'as', 'assert', 'break', 'class', 'continue',
            'def', 'del', 'elif', 'else', 'except', 'exec',
            'finally', 'for', 'from', 'global', 'if', 'import',
            'in', 'is', 'lambda', 'not', 'or', 'pass', 'print',
            'raise', 'return', 'try', 'system', 'while', 'with',
            'True', 'False', 'None', 'eval', 'execfile', '__import__',
            '__package__', '__subclasses__', '__bases__', '__globals__',
            '__code__', '__closure__', '__func__', '__self__', '__module__',
            '__dict__', '__class__', '__call__', '__get__',
            '__getattribute__', '__subclasshook__', '__new__',
            '__init__', 'func_globals', 'func_code', 'func_closure',
            'im_class', 'im_func', 'im_self', 'gi_code', 'gi_frame',
            '__asteval__', 'f_locals', '__mro__')

        # unclassified good globals
        self.good_names = [
            'copy_reg._reconstructor', '__builtin__.object',
            '__builtin__.bytearray', 'builtins.object',
            'builtins.bytearray', 'keras.engine.sequential.Sequential',
            'keras.engine.sequential.Model']

        # custom module in Galaxy-ML
        self.custom_modules = [
            '__main__', 'keras_galaxy_models', 'feature_selectors',
            'preprocessors', 'iraps_classifier', 'model_validations']

    # override
    def find_class(self, module, name):
        """Resolve the global `module.name` requested by the pickle stream.

        Parameters
        ----------
        module : str
            Dotted module name recorded in the pickle.
        name : str
            Attribute name inside `module`.

        Returns
        -------
        object
            The attribute `name` of the already-imported `module`.

        Raises
        ------
        pickle.UnpicklingError
            If `name` is blacklisted, if the module is not present in
            `sys.modules`, or if the global is outside the accepted
            namespaces.
        """
        # Bound up front so the final error message can always be built,
        # even when `name` fails the identifier check below.
        fullname = module + '.' + name

        # black list first
        if name in self.bad_names:
            raise pickle.UnpicklingError("global '%s.%s' is forbidden"
                                         % (module, name))

        # custom module in Galaxy-ML: never imported here, only looked up
        if module in self.custom_modules:
            custom_module = sys.modules.get(module, None)
            if custom_module:
                return getattr(custom_module, name)
            raise pickle.UnpicklingError("Module '%s' is not imported"
                                         % module)

        # For objects from outside libraries, it's necessary to verify
        # both module and name. Currently only a blacklist checker
        # is working.
        # TODO: replace with a whitelist checker.
        good_names = self.good_names
        pk_whitelist = self.pk_whitelist
        if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name):
            if (fullname in good_names)\
                or (module.startswith(('sklearn.', 'xgboost.', 'skrebate.',
                                       'imblearn.', 'mlxtend.', 'numpy.'))
                    or module == 'numpy'):
                if fullname not in (pk_whitelist['SK_NAMES'] +
                                    pk_whitelist['SKR_NAMES'] +
                                    pk_whitelist['XGB_NAMES'] +
                                    pk_whitelist['NUMPY_NAMES'] +
                                    pk_whitelist['IMBLEARN_NAMES'] +
                                    pk_whitelist['MLXTEND_NAMES'] +
                                    good_names):
                    # raise pickle.UnpicklingError
                    print("Warning: global %s is not in pickler whitelist "
                          "yet and will loss support soon. Contact tool "
                          "author or leave a message at github.com" % fullname)
                # Report a missing module as an unpickling failure
                # instead of leaking a bare KeyError to the caller.
                mod = sys.modules.get(module)
                if mod is None:
                    raise pickle.UnpicklingError(
                        "Module '%s' is not imported" % module)
                return getattr(mod, name)

        raise pickle.UnpicklingError("global '%s' is forbidden" % fullname)
82 136
83 137
def load_model(file):
    """Load a pickled object through `_SafePickler`.

    `file` is handed unchanged to the `_SafePickler` constructor and the
    result of its `load()` call is returned.
    """
    unpickler = _SafePickler(file)
    return unpickler.load()
88 def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args): 142
143
144 def read_columns(f, c=None, c_option='by_index_number',
145 return_df=False, **args):
146 """Return array from a tabular dataset by various columns selection
147 """
89 data = pandas.read_csv(f, **args) 148 data = pandas.read_csv(f, **args)
90 if c_option == 'by_index_number': 149 if c_option == 'by_index_number':
91 cols = list(map(lambda x: x - 1, c)) 150 cols = list(map(lambda x: x - 1, c))
92 data = data.iloc[:, cols] 151 data = data.iloc[:, cols]
93 if c_option == 'all_but_by_index_number': 152 if c_option == 'all_but_by_index_number':
104 return y, data 163 return y, data
105 else: 164 else:
106 return y 165 return y
107 166
108 167
109 ## generate an instance for one of sklearn.feature_selection classes 168 def feature_selector(inputs, X=None, y=None):
110 def feature_selector(inputs): 169 """generate an instance of sklearn.feature_selection classes
170
171 Parameters
172 ----------
173 inputs : dict
174 From galaxy tool parameters.
175 X : array
176 Containing training features.
177 y : array or list
178 Target values.
179 """
111 selector = inputs['selected_algorithm'] 180 selector = inputs['selected_algorithm']
112 selector = getattr(sklearn.feature_selection, selector) 181 if selector != 'DyRFECV':
182 selector = getattr(sklearn.feature_selection, selector)
113 options = inputs['options'] 183 options = inputs['options']
114 184
115 if inputs['selected_algorithm'] == 'SelectFromModel': 185 if inputs['selected_algorithm'] == 'SelectFromModel':
116 if not options['threshold'] or options['threshold'] == 'None': 186 if not options['threshold'] or options['threshold'] == 'None':
117 options['threshold'] = None 187 options['threshold'] = None
126 fitted_estimator = load_model(model_handler) 196 fitted_estimator = load_model(model_handler)
127 new_selector = selector(fitted_estimator, prefit=True, **options) 197 new_selector = selector(fitted_estimator, prefit=True, **options)
128 else: 198 else:
129 estimator_json = inputs['model_inputter']['estimator_selector'] 199 estimator_json = inputs['model_inputter']['estimator_selector']
130 estimator = get_estimator(estimator_json) 200 estimator = get_estimator(estimator_json)
201 check_feature_importances = try_get_attr(
202 'feature_selectors', 'check_feature_importances')
203 estimator = check_feature_importances(estimator)
131 new_selector = selector(estimator, **options) 204 new_selector = selector(estimator, **options)
132 205
133 elif inputs['selected_algorithm'] == 'RFE': 206 elif inputs['selected_algorithm'] == 'RFE':
134 estimator = get_estimator(inputs['estimator_selector'])
135 step = options.get('step', None) 207 step = options.get('step', None)
136 if step and step >= 1.0: 208 if step and step >= 1.0:
137 options['step'] = int(step) 209 options['step'] = int(step)
210 estimator = get_estimator(inputs["estimator_selector"])
211 check_feature_importances = try_get_attr(
212 'feature_selectors', 'check_feature_importances')
213 estimator = check_feature_importances(estimator)
138 new_selector = selector(estimator, **options) 214 new_selector = selector(estimator, **options)
139 215
140 elif inputs['selected_algorithm'] == 'RFECV': 216 elif inputs['selected_algorithm'] == 'RFECV':
141 options['scoring'] = get_scoring(options['scoring']) 217 options['scoring'] = get_scoring(options['scoring'])
142 options['n_jobs'] = N_JOBS 218 options['n_jobs'] = N_JOBS
143 splitter, groups = get_cv(options.pop('cv_selector')) 219 splitter, groups = get_cv(options.pop('cv_selector'))
144 # TODO support group cv splitters 220 if groups is None:
145 options['cv'] = splitter 221 options['cv'] = splitter
222 else:
223 options['cv'] = list(splitter.split(X, y, groups=groups))
146 step = options.get('step', None) 224 step = options.get('step', None)
147 if step and step >= 1.0: 225 if step and step >= 1.0:
148 options['step'] = int(step) 226 options['step'] = int(step)
149 estimator = get_estimator(inputs['estimator_selector']) 227 estimator = get_estimator(inputs['estimator_selector'])
228 check_feature_importances = try_get_attr(
229 'feature_selectors', 'check_feature_importances')
230 estimator = check_feature_importances(estimator)
150 new_selector = selector(estimator, **options) 231 new_selector = selector(estimator, **options)
232
233 elif inputs['selected_algorithm'] == 'DyRFECV':
234 options['scoring'] = get_scoring(options['scoring'])
235 options['n_jobs'] = N_JOBS
236 splitter, groups = get_cv(options.pop('cv_selector'))
237 if groups is None:
238 options['cv'] = splitter
239 else:
240 options['cv'] = list(splitter.split(X, y, groups=groups))
241 step = options.get('step')
242 if not step or step == 'None':
243 step = None
244 else:
245 step = ast.literal_eval(step)
246 options['step'] = step
247 estimator = get_estimator(inputs["estimator_selector"])
248 check_feature_importances = try_get_attr(
249 'feature_selectors', 'check_feature_importances')
250 estimator = check_feature_importances(estimator)
251 DyRFECV = try_get_attr('feature_selectors', 'DyRFECV')
252
253 new_selector = DyRFECV(estimator, **options)
151 254
152 elif inputs['selected_algorithm'] == 'VarianceThreshold': 255 elif inputs['selected_algorithm'] == 'VarianceThreshold':
153 new_selector = selector(**options) 256 new_selector = selector(**options)
154 257
155 else: 258 else:
159 262
160 return new_selector 263 return new_selector
161 264
162 265
def get_X_y(params, file1, file2):
    """Return machine learning inputs X, y built from the tool inputs.

    Parameters
    ----------
    params : dict
        Tool parameters; the settings are read from
        params['selected_tasks']['selected_algorithms']['input_options'].
    file1 : str or file object
        Source of X: passed to `read_columns` when 'selected_input' is
        'tabular', otherwise to `mmread`.
    file2 : str or file object
        Source of y, always passed to `read_columns`.

    Returns
    -------
    tuple
        (X, y), with y flattened by `ravel()`.
    """
    # column selection modes that come with an explicit column list
    explicit_modes = ('by_index_number', 'all_but_by_index_number',
                      'by_header_name', 'all_but_by_header_name')
    input_options = (params['selected_tasks']['selected_algorithms']
                     ['input_options'])

    if input_options['selected_input'] == 'tabular':
        x_header = 'infer' if input_options['header1'] else None
        x_selector = input_options['column_selector_options_1']
        x_mode = x_selector['selected_column_selector_option']
        x_cols = x_selector['col1'] if x_mode in explicit_modes else None
        X = read_columns(
            file1,
            c=x_cols,
            c_option=x_mode,
            sep='\t',
            header=x_header,
            parse_dates=True)
        X = X.astype(float)
    else:
        X = mmread(file1)

    y_header = 'infer' if input_options['header2'] else None
    y_selector = input_options['column_selector_options_2']
    y_mode = y_selector['selected_column_selector_option2']
    y_cols = y_selector['col2'] if y_mode in explicit_modes else None
    y = read_columns(
        file2,
        c=y_cols,
        c_option=y_mode,
        sep='\t',
        header=y_header,
        parse_dates=True)

    return X, y.ravel()
199 314
200 315
class SafeEval(Interpreter):
    """Customized symbol table for safely literal eval

    Builds an asteval symbol table restricted to a fixed set of names
    and passes it to `Interpreter` with control-flow statements,
    function definitions, assert, delete, raise and print disabled.

    Parameters
    ----------
    load_scipy : bool
        Add every `rv_continuous` / `rv_discrete` instance found in
        `scipy.stats.distributions` as `scipy_stats_<name>`.
    load_numpy : bool
        Add the listed `numpy.random` members as `np_random_<name>`.
    load_estimators : bool
        Add the listed sklearn sub-modules and the skrebate, xgboost and
        imblearn entries under prefixed names.
    """
    def __init__(self, load_scipy=False, load_numpy=False,
                 load_estimators=False):

        # File opening and other unneeded functions could be dropped
        unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']

        # Allowed symbol table. Add more if needed.
        new_syms = {
            'np_arange': getattr(np, 'arange'),
            'ensemble_ExtraTreesClassifier':
                getattr(ensemble, 'ExtraTreesClassifier')
        }

        syms = make_symbol_table(use_numpy=False, **new_syms)

        if load_scipy:
            scipy_distributions = scipy.stats.distributions.__dict__
            for k, v in scipy_distributions.items():
                if isinstance(v, (scipy.stats.rv_continuous,
                                  scipy.stats.rv_discrete)):
                    syms['scipy_stats_' + k] = v

        if load_numpy:
            from_numpy_random = [
                'beta', 'binomial', 'bytes', 'chisquare', 'choice',
                'dirichlet', 'division', 'exponential', 'f', 'gamma',
                'geometric', 'gumbel', 'hypergeometric', 'laplace',
                'logistic', 'lognormal', 'logseries', 'mtrand',
                'multinomial', 'multivariate_normal', 'negative_binomial',
                'noncentral_chisquare', 'noncentral_f', 'normal', 'pareto',
                'permutation', 'poisson', 'power', 'rand', 'randint',
                'randn', 'random', 'random_integers', 'random_sample',
                'ranf', 'rayleigh', 'sample', 'seed', 'set_state',
                'shuffle', 'standard_cauchy', 'standard_exponential',
                'standard_gamma', 'standard_normal', 'standard_t',
                'triangular', 'uniform', 'vonmises', 'wald', 'weibull', 'zipf']
            for f in from_numpy_random:
                # Skip names the installed NumPy does not provide rather
                # than failing construction with AttributeError.
                # NOTE(review): 'division' looks like a `__future__`
                # artefact of older NumPy releases -- confirm.
                member = getattr(np.random, f, None)
                if member is not None:
                    syms['np_random_' + f] = member

        if load_estimators:
            estimator_table = {
                'sklearn_svm': getattr(sklearn, 'svm'),
                'sklearn_tree': getattr(sklearn, 'tree'),
                'sklearn_ensemble': getattr(sklearn, 'ensemble'),
                'sklearn_neighbors': getattr(sklearn, 'neighbors'),
                'sklearn_naive_bayes': getattr(sklearn, 'naive_bayes'),
                'sklearn_linear_model': getattr(sklearn, 'linear_model'),
                'sklearn_cluster': getattr(sklearn, 'cluster'),
                'sklearn_decomposition': getattr(sklearn, 'decomposition'),
                'sklearn_preprocessing': getattr(sklearn, 'preprocessing'),
                'sklearn_feature_selection':
                    getattr(sklearn, 'feature_selection'),
                'sklearn_kernel_approximation':
                    getattr(sklearn, 'kernel_approximation'),
                'skrebate_ReliefF': getattr(skrebate, 'ReliefF'),
                'skrebate_SURF': getattr(skrebate, 'SURF'),
                'skrebate_SURFstar': getattr(skrebate, 'SURFstar'),
                'skrebate_MultiSURF': getattr(skrebate, 'MultiSURF'),
                'skrebate_MultiSURFstar': getattr(skrebate, 'MultiSURFstar'),
                'skrebate_TuRF': getattr(skrebate, 'TuRF'),
                'xgboost_XGBClassifier': getattr(xgboost, 'XGBClassifier'),
                'xgboost_XGBRegressor': getattr(xgboost, 'XGBRegressor'),
                'imblearn_over_sampling': getattr(imblearn, 'over_sampling'),
                'imblearn_combine': getattr(imblearn, 'combine')
            }
            syms.update(estimator_table)

        # drop the unwanted builtins last so nothing above re-adds them
        for key in unwanted:
            syms.pop(key, None)

        super(SafeEval, self).__init__(
            symtable=syms, use_numpy=False, minimal=False,
            no_if=True, no_for=True, no_while=True, no_try=True,
            no_functiondef=True, no_ifexp=True, no_listcomp=False,
            no_augassign=False, no_assert=True, no_delete=True,
            no_raise=True, no_print=True)
268 395
269 396
270 def get_estimator(estimator_json): 397 def get_estimator(estimator_json):
271 398 """Return a sklearn or compatible estimator from Galaxy tool inputs
399 """
272 estimator_module = estimator_json['selected_module'] 400 estimator_module = estimator_json['selected_module']
273 401
274 if estimator_module == 'customer_estimator': 402 if estimator_module == 'custom_estimator':
275 c_estimator = estimator_json['c_estimator'] 403 c_estimator = estimator_json['c_estimator']
276 with open(c_estimator, 'rb') as model_handler: 404 with open(c_estimator, 'rb') as model_handler:
277 new_model = load_model(model_handler) 405 new_model = load_model(model_handler)
278 return new_model 406 return new_model
279 407
408 if estimator_module == "binarize_target":
409 wrapped_estimator = estimator_json['wrapped_estimator']
410 with open(wrapped_estimator, 'rb') as model_handler:
411 wrapped_estimator = load_model(model_handler)
412 options = {}
413 if estimator_json['z_score'] is not None:
414 options['z_score'] = estimator_json['z_score']
415 if estimator_json['value'] is not None:
416 options['value'] = estimator_json['value']
417 options['less_is_positive'] = estimator_json['less_is_positive']
418 if estimator_json['clf_or_regr'] == 'BinarizeTargetClassifier':
419 klass = try_get_attr('iraps_classifier',
420 'BinarizeTargetClassifier')
421 else:
422 klass = try_get_attr('iraps_classifier',
423 'BinarizeTargetRegressor')
424 return klass(wrapped_estimator, **options)
425
280 estimator_cls = estimator_json['selected_estimator'] 426 estimator_cls = estimator_json['selected_estimator']
281 427
282 if estimator_module == 'xgboost': 428 if estimator_module == 'xgboost':
283 cls = getattr(xgboost, estimator_cls) 429 klass = getattr(xgboost, estimator_cls)
284 else: 430 else:
285 module = getattr(sklearn, estimator_module) 431 module = getattr(sklearn, estimator_module)
286 cls = getattr(module, estimator_cls) 432 klass = getattr(module, estimator_cls)
287 433
288 estimator = cls() 434 estimator = klass()
289 435
290 estimator_params = estimator_json['text_params'].strip() 436 estimator_params = estimator_json['text_params'].strip()
291 if estimator_params != '': 437 if estimator_params != '':
292 try: 438 try:
439 safe_eval = SafeEval()
293 params = safe_eval('dict(' + estimator_params + ')') 440 params = safe_eval('dict(' + estimator_params + ')')
294 except ValueError: 441 except ValueError:
295 sys.exit("Unsupported parameter input: `%s`" % estimator_params) 442 sys.exit("Unsupported parameter input: `%s`" % estimator_params)
296 estimator.set_params(**params) 443 estimator.set_params(**params)
297 if 'n_jobs' in estimator.get_params(): 444 if 'n_jobs' in estimator.get_params():
299 446
300 return estimator 447 return estimator
301 448
302 449
303 def get_cv(cv_json): 450 def get_cv(cv_json):
304 """ 451 """ Return CV splitter from Galaxy tool inputs
305 cv_json: 452
306 e.g.: 453 Parameters
454 ----------
455 cv_json : dict
456 From Galaxy tool inputs.
457 e.g.:
307 { 458 {
308 'selected_cv': 'StratifiedKFold', 459 'selected_cv': 'StratifiedKFold',
309 'n_splits': 3, 460 'n_splits': 3,
310 'shuffle': True, 461 'shuffle': True,
311 'random_state': 0 462 'random_state': 0
313 """ 464 """
314 cv = cv_json.pop('selected_cv') 465 cv = cv_json.pop('selected_cv')
315 if cv == 'default': 466 if cv == 'default':
316 return cv_json['n_splits'], None 467 return cv_json['n_splits'], None
317 468
318 groups = cv_json.pop('groups', None) 469 groups = cv_json.pop('groups_selector', None)
319 if groups: 470 if groups is not None:
320 groups = groups.strip() 471 infile_g = groups['infile_g']
321 if groups != '': 472 header = 'infer' if groups['header_g'] else None
322 if groups.startswith('__ob__'): 473 column_option = (groups['column_selector_options_g']
323 groups = groups[6:] 474 ['selected_column_selector_option_g'])
324 if groups.endswith('__cb__'): 475 if column_option in ['by_index_number', 'all_but_by_index_number',
325 groups = groups[:-6] 476 'by_header_name', 'all_but_by_header_name']:
326 groups = [int(x.strip()) for x in groups.split(',')] 477 c = groups['column_selector_options_g']['col_g']
478 else:
479 c = None
480 groups = read_columns(
481 infile_g,
482 c=c,
483 c_option=column_option,
484 sep='\t',
485 header=header,
486 parse_dates=True)
487 groups = groups.ravel()
327 488
328 for k, v in cv_json.items(): 489 for k, v in cv_json.items():
329 if v == '': 490 if v == '':
330 cv_json[k] = None 491 cv_json[k] = None
331 492
339 500
340 test_size = cv_json.get('test_size', None) 501 test_size = cv_json.get('test_size', None)
341 if test_size and test_size > 1.0: 502 if test_size and test_size > 1.0:
342 cv_json['test_size'] = int(test_size) 503 cv_json['test_size'] = int(test_size)
343 504
344 cv_class = getattr(model_selection, cv) 505 if cv == 'OrderedKFold':
506 cv_class = try_get_attr('model_validations', 'OrderedKFold')
507 elif cv == 'RepeatedOrderedKFold':
508 cv_class = try_get_attr('model_validations', 'RepeatedOrderedKFold')
509 else:
510 cv_class = getattr(model_selection, cv)
345 splitter = cv_class(**cv_json) 511 splitter = cv_class(**cv_json)
346 512
347 return splitter, groups 513 return splitter, groups
348 514
349 515
350 # needed when sklearn < v0.20 516 # needed when sklearn < v0.20
351 def balanced_accuracy_score(y_true, y_pred): 517 def balanced_accuracy_score(y_true, y_pred):
518 """Compute balanced accuracy score, which is now available in
519 scikit-learn from v0.20.0.
520 """
352 C = metrics.confusion_matrix(y_true, y_pred) 521 C = metrics.confusion_matrix(y_true, y_pred)
353 with np.errstate(divide='ignore', invalid='ignore'): 522 with np.errstate(divide='ignore', invalid='ignore'):
354 per_class = np.diag(C) / C.sum(axis=1) 523 per_class = np.diag(C) / C.sum(axis=1)
355 if np.any(np.isnan(per_class)): 524 if np.any(np.isnan(per_class)):
356 warnings.warn('y_pred contains classes not in y_true') 525 warnings.warn('y_pred contains classes not in y_true')
358 score = np.mean(per_class) 527 score = np.mean(per_class)
359 return score 528 return score
360 529
361 530
def get_scoring(scoring_json):
    """Return a single sklearn scorer, or several scorers in a dictionary.

    Returns None when 'primary_scoring' is 'default'. When
    'secondary_scoring' is neither 'None' nor equal to the primary name,
    returns a dict mapping the primary name and each comma-separated
    secondary name to its scorer; otherwise returns the primary scorer.

    The lookups go through `metrics.SCORERS`, which is updated in place
    with the two `iraps_classifier` scorers and, when absent, with a
    'balanced_accuracy' scorer.
    """
    primary = scoring_json['primary_scoring']
    if primary == 'default':
        return None

    registry = metrics.SCORERS
    for scorer_name in ('binarize_auc_scorer',
                        'binarize_average_precision_scorer'):
        registry[scorer_name] = try_get_attr('iraps_classifier', scorer_name)
    if 'balanced_accuracy' not in registry:
        registry['balanced_accuracy'] = metrics.make_scorer(
            balanced_accuracy_score)

    secondary = scoring_json['secondary_scoring']
    if secondary in ('None', primary):
        return registry[primary]

    # primary scorer first, then the remaining secondary scorers in order
    chosen = {primary: registry[primary]}
    chosen.update({extra: registry[extra]
                   for extra in secondary.split(',')
                   if extra != primary})
    return chosen
559
560
def get_search_params(estimator):
    """Format the output of `estimator.get_params()`

    Parameters
    ----------
    estimator : object
        Any object exposing `get_params()` that returns a dict.

    Returns
    -------
    list of list
        One `[flag, name, "name: repr(value)"]` row per parameter, where
        flag is '*' for names ending with one of the excluded keywords
        and '@' otherwise, followed by a final explanatory "Note:" row.
    """
    # params below won't be shown for search in the searchcv tool
    # (built once here rather than on every loop iteration)
    keywords = ('n_jobs', 'pre_dispatch', 'memory', 'steps',
                'nthread', 'verbose')

    results = [
        ['*' if k.endswith(keywords) else '@', k, k + ": " + repr(v)]
        for k, v in estimator.get_params().items()
    ]
    results.append(
        ["", "Note:",
         "@, params eligible for search in searchcv tool."])

    return results
579
580
def try_get_attr(module, name):
    """try to get attribute from a custom module

    The module is looked up in `sys.modules` only; it is never imported
    here, so optional modules that failed to import stay unavailable.

    Parameters
    ----------
    module : str
        Module name
    name : str
        Attribute (class/function) name.

    Returns
    -------
    class or function

    Raises
    ------
    ImportError
        If `module` is not present in `sys.modules`.
    AttributeError
        If the module has no attribute `name`.
    """
    mod = sys.modules.get(module, None)
    if mod is None:
        # ImportError is still caught by callers using `except Exception`
        raise ImportError("No module named %s." % module)
    return getattr(mod, name)