Mercurial > repos > bgruening > scipy_sparse
comparison utils.py @ 24:b9ed7b774ba3 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ab963ec9498bd05d2fb2f24f75adb2fccae7958c
| author | bgruening |
|---|---|
| date | Wed, 15 May 2019 07:43:48 -0400 |
| parents | 27c0b1a050df |
| children |
comparison
equal
deleted
inserted
replaced
| 23:27c0b1a050df | 24:b9ed7b774ba3 |
|---|---|
| 1 import ast | |
| 1 import json | 2 import json |
| 3 import imblearn | |
| 2 import numpy as np | 4 import numpy as np |
| 3 import os | |
| 4 import pandas | 5 import pandas |
| 5 import pickle | 6 import pickle |
| 6 import re | 7 import re |
| 7 import scipy | 8 import scipy |
| 8 import sklearn | 9 import sklearn |
| 10 import skrebate | |
| 9 import sys | 11 import sys |
| 10 import warnings | 12 import warnings |
| 11 import xgboost | 13 import xgboost |
| 12 | 14 |
| 15 from collections import Counter | |
| 13 from asteval import Interpreter, make_symbol_table | 16 from asteval import Interpreter, make_symbol_table |
| 14 from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction, | 17 from imblearn import under_sampling, over_sampling, combine |
| 15 feature_selection, gaussian_process, kernel_approximation, metrics, | 18 from imblearn.pipeline import Pipeline as imbPipeline |
| 16 model_selection, naive_bayes, neighbors, pipeline, preprocessing, | 19 from mlxtend import regressor, classifier |
| 17 svm, linear_model, tree, discriminant_analysis) | 20 from scipy.io import mmread |
| 21 from sklearn import ( | |
| 22 cluster, compose, decomposition, ensemble, feature_extraction, | |
| 23 feature_selection, gaussian_process, kernel_approximation, metrics, | |
| 24 model_selection, naive_bayes, neighbors, pipeline, preprocessing, | |
| 25 svm, linear_model, tree, discriminant_analysis) | |
| 18 | 26 |
| 19 try: | 27 try: |
| 20 import skrebate | 28 import iraps_classifier |
| 21 except ModuleNotFoundError: | 29 except ImportError: |
| 22 pass | 30 pass |
| 23 | 31 |
| 24 | |
| 25 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1)) | |
| 26 | |
| 27 try: | 32 try: |
| 28 sk_whitelist | 33 import model_validations |
| 29 except NameError: | 34 except ImportError: |
| 30 sk_whitelist = None | 35 pass |
| 31 | 36 |
| 32 | 37 try: |
| 33 class SafePickler(pickle.Unpickler): | 38 import feature_selectors |
| 34 """ | 39 except ImportError: |
| 35 Used to safely deserialize scikit-learn model objects serialized by cPickle.dump | 40 pass |
| 41 | |
| 42 try: | |
| 43 import preprocessors | |
| 44 except ImportError: | |
| 45 pass | |
| 46 | |
| 47 # handle pickle white list file | |
| 48 WL_FILE = __import__('os').path.join( | |
| 49 __import__('os').path.dirname(__file__), 'pk_whitelist.json') | |
| 50 | |
| 51 N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) | |
| 52 | |
| 53 | |
| 54 class _SafePickler(pickle.Unpickler, object): | |
| 55 """ | |
| 56 Used to safely deserialize scikit-learn model objects | |
| 36 Usage: | 57 Usage: |
| 37 eg.: SafePickler.load(pickled_file_object) | 58 eg.: _SafePickler.load(pickled_file_object) |
| 38 """ | 59 """ |
| 60 def __init__(self, file): | |
| 61 super(_SafePickler, self).__init__(file) | |
| 62 # load global white list | |
| 63 with open(WL_FILE, 'r') as f: | |
| 64 self.pk_whitelist = json.load(f) | |
| 65 | |
| 66 self.bad_names = ( | |
| 67 'and', 'as', 'assert', 'break', 'class', 'continue', | |
| 68 'def', 'del', 'elif', 'else', 'except', 'exec', | |
| 69 'finally', 'for', 'from', 'global', 'if', 'import', | |
| 70 'in', 'is', 'lambda', 'not', 'or', 'pass', 'print', | |
| 71 'raise', 'return', 'try', 'system', 'while', 'with', | |
| 72 'True', 'False', 'None', 'eval', 'execfile', '__import__', | |
| 73 '__package__', '__subclasses__', '__bases__', '__globals__', | |
| 74 '__code__', '__closure__', '__func__', '__self__', '__module__', | |
| 75 '__dict__', '__class__', '__call__', '__get__', | |
| 76 '__getattribute__', '__subclasshook__', '__new__', | |
| 77 '__init__', 'func_globals', 'func_code', 'func_closure', | |
| 78 'im_class', 'im_func', 'im_self', 'gi_code', 'gi_frame', | |
| 79 '__asteval__', 'f_locals', '__mro__') | |
| 80 | |
| 81 # unclassified good globals | |
| 82 self.good_names = [ | |
| 83 'copy_reg._reconstructor', '__builtin__.object', | |
| 84 '__builtin__.bytearray', 'builtins.object', | |
| 85 'builtins.bytearray', 'keras.engine.sequential.Sequential', | |
| 86 'keras.engine.sequential.Model'] | |
| 87 | |
| 88 # custom module in Galaxy-ML | |
| 89 self.custom_modules = [ | |
| 90 '__main__', 'keras_galaxy_models', 'feature_selectors', | |
| 91 'preprocessors', 'iraps_classifier', 'model_validations'] | |
| 92 | |
| 93 # override | |
| 39 def find_class(self, module, name): | 94 def find_class(self, module, name): |
| 40 | 95 # black list first |
| 41 # sk_whitelist could be read from tool | 96 if name in self.bad_names: |
| 42 global sk_whitelist | 97 raise pickle.UnpicklingError("global '%s.%s' is forbidden" |
| 43 if not sk_whitelist: | 98 % (module, name)) |
| 44 whitelist_file = os.path.join(os.path.dirname(__file__), 'sk_whitelist.json') | 99 |
| 45 with open(whitelist_file, 'r') as f: | 100 # custom module in Galaxy-ML |
| 46 sk_whitelist = json.load(f) | 101 if module in self.custom_modules: |
| 47 | 102 cutom_module = sys.modules.get(module, None) |
| 48 bad_names = ('and', 'as', 'assert', 'break', 'class', 'continue', | 103 if cutom_module: |
| 49 'def', 'del', 'elif', 'else', 'except', 'exec', | 104 return getattr(cutom_module, name) |
| 50 'finally', 'for', 'from', 'global', 'if', 'import', | 105 else: |
| 51 'in', 'is', 'lambda', 'not', 'or', 'pass', 'print', | 106 raise pickle.UnpicklingError("Module %s' is not imported" |
| 52 'raise', 'return', 'try', 'system', 'while', 'with', | 107 % module) |
| 53 'True', 'False', 'None', 'eval', 'execfile', '__import__', | 108 |
| 54 '__package__', '__subclasses__', '__bases__', '__globals__', | 109 # For objects from outside libraries, it's necessary to verify |
| 55 '__code__', '__closure__', '__func__', '__self__', '__module__', | 110 # both module and name. Currently only a blacklist checker |
| 56 '__dict__', '__class__', '__call__', '__get__', | 111 # is working. |
| 57 '__getattribute__', '__subclasshook__', '__new__', | 112 # TODO: replace with a whitelist checker. |
| 58 '__init__', 'func_globals', 'func_code', 'func_closure', | 113 good_names = self.good_names |
| 59 'im_class', 'im_func', 'im_self', 'gi_code', 'gi_frame', | 114 pk_whitelist = self.pk_whitelist |
| 60 '__asteval__', 'f_locals', '__mro__') | |
| 61 good_names = ['copy_reg._reconstructor', '__builtin__.object'] | |
| 62 | |
| 63 if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name): | 115 if re.match(r'^[a-zA-Z_][a-zA-Z0-9_]*$', name): |
| 64 fullname = module + '.' + name | 116 fullname = module + '.' + name |
| 65 if (fullname in good_names)\ | 117 if (fullname in good_names)\ |
| 66 or ( ( module.startswith('sklearn.') | 118 or (module.startswith(('sklearn.', 'xgboost.', 'skrebate.', |
| 67 or module.startswith('xgboost.') | 119 'imblearn.', 'mlxtend.', 'numpy.')) |
| 68 or module.startswith('skrebate.') | 120 or module == 'numpy'): |
| 69 or module.startswith('imblearn') | 121 if fullname not in (pk_whitelist['SK_NAMES'] + |
| 70 or module.startswith('numpy.') | 122 pk_whitelist['SKR_NAMES'] + |
| 71 or module == 'numpy' | 123 pk_whitelist['XGB_NAMES'] + |
| 72 ) | 124 pk_whitelist['NUMPY_NAMES'] + |
| 73 and (name not in bad_names) | 125 pk_whitelist['IMBLEARN_NAMES'] + |
| 74 ): | 126 pk_whitelist['MLXTEND_NAMES'] + |
| 75 # TODO: replace with a whitelist checker | 127 good_names): |
| 76 if fullname not in sk_whitelist['SK_NAMES'] + sk_whitelist['SKR_NAMES'] + sk_whitelist['XGB_NAMES'] + sk_whitelist['NUMPY_NAMES'] + sk_whitelist['IMBLEARN_NAMES'] + good_names: | 128 # raise pickle.UnpicklingError |
| 77 print("Warning: global %s is not in pickler whitelist yet and will loss support soon. Contact tool author or leave a message at github.com" % fullname) | 129 print("Warning: global %s is not in pickler whitelist " |
| 130 "yet and will loss support soon. Contact tool " | |
| 131 "author or leave a message at github.com" % fullname) | |
| 78 mod = sys.modules[module] | 132 mod = sys.modules[module] |
| 79 return getattr(mod, name) | 133 return getattr(mod, name) |
| 80 | 134 |
| 81 raise pickle.UnpicklingError("global '%s' is forbidden" % fullname) | 135 raise pickle.UnpicklingError("global '%s' is forbidden" % fullname) |
| 82 | 136 |
| 83 | 137 |
| 84 def load_model(file): | 138 def load_model(file): |
| 85 return SafePickler(file).load() | 139 """Load pickled object with `_SafePickler` |
| 86 | 140 """ |
| 87 | 141 return _SafePickler(file).load() |
| 88 def read_columns(f, c=None, c_option='by_index_number', return_df=False, **args): | 142 |
| 143 | |
| 144 def read_columns(f, c=None, c_option='by_index_number', | |
| 145 return_df=False, **args): | |
| 146 """Return array from a tabular dataset by various columns selection | |
| 147 """ | |
| 89 data = pandas.read_csv(f, **args) | 148 data = pandas.read_csv(f, **args) |
| 90 if c_option == 'by_index_number': | 149 if c_option == 'by_index_number': |
| 91 cols = list(map(lambda x: x - 1, c)) | 150 cols = list(map(lambda x: x - 1, c)) |
| 92 data = data.iloc[:, cols] | 151 data = data.iloc[:, cols] |
| 93 if c_option == 'all_but_by_index_number': | 152 if c_option == 'all_but_by_index_number': |
| 104 return y, data | 163 return y, data |
| 105 else: | 164 else: |
| 106 return y | 165 return y |
| 107 | 166 |
| 108 | 167 |
| 109 ## generate an instance for one of sklearn.feature_selection classes | 168 def feature_selector(inputs, X=None, y=None): |
| 110 def feature_selector(inputs): | 169 """generate an instance of sklearn.feature_selection classes |
| 170 | |
| 171 Parameters | |
| 172 ---------- | |
| 173 inputs : dict | |
| 174 From galaxy tool parameters. | |
| 175 X : array | |
| 176 Containing training features. | |
| 177 y : array or list | |
| 178 Target values. | |
| 179 """ | |
| 111 selector = inputs['selected_algorithm'] | 180 selector = inputs['selected_algorithm'] |
| 112 selector = getattr(sklearn.feature_selection, selector) | 181 if selector != 'DyRFECV': |
| 182 selector = getattr(sklearn.feature_selection, selector) | |
| 113 options = inputs['options'] | 183 options = inputs['options'] |
| 114 | 184 |
| 115 if inputs['selected_algorithm'] == 'SelectFromModel': | 185 if inputs['selected_algorithm'] == 'SelectFromModel': |
| 116 if not options['threshold'] or options['threshold'] == 'None': | 186 if not options['threshold'] or options['threshold'] == 'None': |
| 117 options['threshold'] = None | 187 options['threshold'] = None |
| 126 fitted_estimator = load_model(model_handler) | 196 fitted_estimator = load_model(model_handler) |
| 127 new_selector = selector(fitted_estimator, prefit=True, **options) | 197 new_selector = selector(fitted_estimator, prefit=True, **options) |
| 128 else: | 198 else: |
| 129 estimator_json = inputs['model_inputter']['estimator_selector'] | 199 estimator_json = inputs['model_inputter']['estimator_selector'] |
| 130 estimator = get_estimator(estimator_json) | 200 estimator = get_estimator(estimator_json) |
| 201 check_feature_importances = try_get_attr( | |
| 202 'feature_selectors', 'check_feature_importances') | |
| 203 estimator = check_feature_importances(estimator) | |
| 131 new_selector = selector(estimator, **options) | 204 new_selector = selector(estimator, **options) |
| 132 | 205 |
| 133 elif inputs['selected_algorithm'] == 'RFE': | 206 elif inputs['selected_algorithm'] == 'RFE': |
| 134 estimator = get_estimator(inputs['estimator_selector']) | |
| 135 step = options.get('step', None) | 207 step = options.get('step', None) |
| 136 if step and step >= 1.0: | 208 if step and step >= 1.0: |
| 137 options['step'] = int(step) | 209 options['step'] = int(step) |
| 210 estimator = get_estimator(inputs["estimator_selector"]) | |
| 211 check_feature_importances = try_get_attr( | |
| 212 'feature_selectors', 'check_feature_importances') | |
| 213 estimator = check_feature_importances(estimator) | |
| 138 new_selector = selector(estimator, **options) | 214 new_selector = selector(estimator, **options) |
| 139 | 215 |
| 140 elif inputs['selected_algorithm'] == 'RFECV': | 216 elif inputs['selected_algorithm'] == 'RFECV': |
| 141 options['scoring'] = get_scoring(options['scoring']) | 217 options['scoring'] = get_scoring(options['scoring']) |
| 142 options['n_jobs'] = N_JOBS | 218 options['n_jobs'] = N_JOBS |
| 143 splitter, groups = get_cv(options.pop('cv_selector')) | 219 splitter, groups = get_cv(options.pop('cv_selector')) |
| 144 # TODO support group cv splitters | 220 if groups is None: |
| 145 options['cv'] = splitter | 221 options['cv'] = splitter |
| 222 else: | |
| 223 options['cv'] = list(splitter.split(X, y, groups=groups)) | |
| 146 step = options.get('step', None) | 224 step = options.get('step', None) |
| 147 if step and step >= 1.0: | 225 if step and step >= 1.0: |
| 148 options['step'] = int(step) | 226 options['step'] = int(step) |
| 149 estimator = get_estimator(inputs['estimator_selector']) | 227 estimator = get_estimator(inputs['estimator_selector']) |
| 228 check_feature_importances = try_get_attr( | |
| 229 'feature_selectors', 'check_feature_importances') | |
| 230 estimator = check_feature_importances(estimator) | |
| 150 new_selector = selector(estimator, **options) | 231 new_selector = selector(estimator, **options) |
| 232 | |
| 233 elif inputs['selected_algorithm'] == 'DyRFECV': | |
| 234 options['scoring'] = get_scoring(options['scoring']) | |
| 235 options['n_jobs'] = N_JOBS | |
| 236 splitter, groups = get_cv(options.pop('cv_selector')) | |
| 237 if groups is None: | |
| 238 options['cv'] = splitter | |
| 239 else: | |
| 240 options['cv'] = list(splitter.split(X, y, groups=groups)) | |
| 241 step = options.get('step') | |
| 242 if not step or step == 'None': | |
| 243 step = None | |
| 244 else: | |
| 245 step = ast.literal_eval(step) | |
| 246 options['step'] = step | |
| 247 estimator = get_estimator(inputs["estimator_selector"]) | |
| 248 check_feature_importances = try_get_attr( | |
| 249 'feature_selectors', 'check_feature_importances') | |
| 250 estimator = check_feature_importances(estimator) | |
| 251 DyRFECV = try_get_attr('feature_selectors', 'DyRFECV') | |
| 252 | |
| 253 new_selector = DyRFECV(estimator, **options) | |
| 151 | 254 |
| 152 elif inputs['selected_algorithm'] == 'VarianceThreshold': | 255 elif inputs['selected_algorithm'] == 'VarianceThreshold': |
| 153 new_selector = selector(**options) | 256 new_selector = selector(**options) |
| 154 | 257 |
| 155 else: | 258 else: |
| 159 | 262 |
| 160 return new_selector | 263 return new_selector |
| 161 | 264 |
| 162 | 265 |
| 163 def get_X_y(params, file1, file2): | 266 def get_X_y(params, file1, file2): |
| 164 input_type = params['selected_tasks']['selected_algorithms']['input_options']['selected_input'] | 267 """Return machine learning inputs X, y from tabular inputs |
| 268 """ | |
| 269 input_type = (params['selected_tasks']['selected_algorithms'] | |
| 270 ['input_options']['selected_input']) | |
| 165 if input_type == 'tabular': | 271 if input_type == 'tabular': |
| 166 header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header1'] else None | 272 header = 'infer' if (params['selected_tasks']['selected_algorithms'] |
| 167 column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['selected_column_selector_option'] | 273 ['input_options']['header1']) else None |
| 168 if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: | 274 column_option = (params['selected_tasks']['selected_algorithms'] |
| 169 c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['col1'] | 275 ['input_options']['column_selector_options_1'] |
| 276 ['selected_column_selector_option']) | |
| 277 if column_option in ['by_index_number', 'all_but_by_index_number', | |
| 278 'by_header_name', 'all_but_by_header_name']: | |
| 279 c = (params['selected_tasks']['selected_algorithms'] | |
| 280 ['input_options']['column_selector_options_1']['col1']) | |
| 170 else: | 281 else: |
| 171 c = None | 282 c = None |
| 172 X = read_columns( | 283 X = read_columns( |
| 173 file1, | 284 file1, |
| 174 c=c, | 285 c=c, |
| 175 c_option=column_option, | 286 c_option=column_option, |
| 176 sep='\t', | 287 sep='\t', |
| 177 header=header, | 288 header=header, |
| 178 parse_dates=True | 289 parse_dates=True).astype(float) |
| 179 ) | |
| 180 else: | 290 else: |
| 181 X = mmread(file1) | 291 X = mmread(file1) |
| 182 | 292 |
| 183 header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header2'] else None | 293 header = 'infer' if (params['selected_tasks']['selected_algorithms'] |
| 184 column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['selected_column_selector_option2'] | 294 ['input_options']['header2']) else None |
| 185 if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: | 295 column_option = (params['selected_tasks']['selected_algorithms'] |
| 186 c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['col2'] | 296 ['input_options']['column_selector_options_2'] |
| 297 ['selected_column_selector_option2']) | |
| 298 if column_option in ['by_index_number', 'all_but_by_index_number', | |
| 299 'by_header_name', 'all_but_by_header_name']: | |
| 300 c = (params['selected_tasks']['selected_algorithms'] | |
| 301 ['input_options']['column_selector_options_2']['col2']) | |
| 187 else: | 302 else: |
| 188 c = None | 303 c = None |
| 189 y = read_columns( | 304 y = read_columns( |
| 190 file2, | 305 file2, |
| 191 c=c, | 306 c=c, |
| 192 c_option=column_option, | 307 c_option=column_option, |
| 193 sep='\t', | 308 sep='\t', |
| 194 header=header, | 309 header=header, |
| 195 parse_dates=True | 310 parse_dates=True) |
| 196 ) | |
| 197 y = y.ravel() | 311 y = y.ravel() |
| 312 | |
| 198 return X, y | 313 return X, y |
| 199 | 314 |
| 200 | 315 |
| 201 class SafeEval(Interpreter): | 316 class SafeEval(Interpreter): |
| 202 | 317 """Customized symbol table for safe literal eval |
| 203 def __init__(self, load_scipy=False, load_numpy=False, load_estimators=False): | 318 """ |
| 319 def __init__(self, load_scipy=False, load_numpy=False, | |
| 320 load_estimators=False): | |
| 204 | 321 |
| 205 # File opening and other unneeded functions could be dropped | 322 # File opening and other unneeded functions could be dropped |
| 206 unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr'] | 323 unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr'] |
| 207 | 324 |
| 208 # Allowed symbol table. Add more if needed. | 325 # Allowed symbol table. Add more if needed. |
| 209 new_syms = { | 326 new_syms = { |
| 210 'np_arange': getattr(np, 'arange'), | 327 'np_arange': getattr(np, 'arange'), |
| 211 'ensemble_ExtraTreesClassifier': getattr(ensemble, 'ExtraTreesClassifier') | 328 'ensemble_ExtraTreesClassifier': |
| 329 getattr(ensemble, 'ExtraTreesClassifier') | |
| 212 } | 330 } |
| 213 | 331 |
| 214 syms = make_symbol_table(use_numpy=False, **new_syms) | 332 syms = make_symbol_table(use_numpy=False, **new_syms) |
| 215 | 333 |
| 216 if load_scipy: | 334 if load_scipy: |
| 217 scipy_distributions = scipy.stats.distributions.__dict__ | 335 scipy_distributions = scipy.stats.distributions.__dict__ |
| 218 for k, v in scipy_distributions.items(): | 336 for k, v in scipy_distributions.items(): |
| 219 if isinstance(v, (scipy.stats.rv_continuous, scipy.stats.rv_discrete)): | 337 if isinstance(v, (scipy.stats.rv_continuous, |
| 338 scipy.stats.rv_discrete)): | |
| 220 syms['scipy_stats_' + k] = v | 339 syms['scipy_stats_' + k] = v |
| 221 | 340 |
| 222 if load_numpy: | 341 if load_numpy: |
| 223 from_numpy_random = ['beta', 'binomial', 'bytes', 'chisquare', 'choice', 'dirichlet', 'division', | 342 from_numpy_random = [ |
| 224 'exponential', 'f', 'gamma', 'geometric', 'gumbel', 'hypergeometric', | 343 'beta', 'binomial', 'bytes', 'chisquare', 'choice', |
| 225 'laplace', 'logistic', 'lognormal', 'logseries', 'mtrand', 'multinomial', | 344 'dirichlet', 'division', 'exponential', 'f', 'gamma', |
| 226 'multivariate_normal', 'negative_binomial', 'noncentral_chisquare', 'noncentral_f', | 345 'geometric', 'gumbel', 'hypergeometric', 'laplace', |
| 227 'normal', 'pareto', 'permutation', 'poisson', 'power', 'rand', 'randint', | 346 'logistic', 'lognormal', 'logseries', 'mtrand', |
| 228 'randn', 'random', 'random_integers', 'random_sample', 'ranf', 'rayleigh', | 347 'multinomial', 'multivariate_normal', 'negative_binomial', |
| 229 'sample', 'seed', 'set_state', 'shuffle', 'standard_cauchy', 'standard_exponential', | 348 'noncentral_chisquare', 'noncentral_f', 'normal', 'pareto', |
| 230 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform', | 349 'permutation', 'poisson', 'power', 'rand', 'randint', |
| 231 'vonmises', 'wald', 'weibull', 'zipf'] | 350 'randn', 'random', 'random_integers', 'random_sample', |
| 351 'ranf', 'rayleigh', 'sample', 'seed', 'set_state', | |
| 352 'shuffle', 'standard_cauchy', 'standard_exponential', | |
| 353 'standard_gamma', 'standard_normal', 'standard_t', | |
| 354 'triangular', 'uniform', 'vonmises', 'wald', 'weibull', 'zipf'] | |
| 232 for f in from_numpy_random: | 355 for f in from_numpy_random: |
| 233 syms['np_random_' + f] = getattr(np.random, f) | 356 syms['np_random_' + f] = getattr(np.random, f) |
| 234 | 357 |
| 235 if load_estimators: | 358 if load_estimators: |
| 236 estimator_table = { | 359 estimator_table = { |
| 237 'sklearn_svm' : getattr(sklearn, 'svm'), | 360 'sklearn_svm': getattr(sklearn, 'svm'), |
| 238 'sklearn_tree' : getattr(sklearn, 'tree'), | 361 'sklearn_tree': getattr(sklearn, 'tree'), |
| 239 'sklearn_ensemble' : getattr(sklearn, 'ensemble'), | 362 'sklearn_ensemble': getattr(sklearn, 'ensemble'), |
| 240 'sklearn_neighbors' : getattr(sklearn, 'neighbors'), | 363 'sklearn_neighbors': getattr(sklearn, 'neighbors'), |
| 241 'sklearn_naive_bayes' : getattr(sklearn, 'naive_bayes'), | 364 'sklearn_naive_bayes': getattr(sklearn, 'naive_bayes'), |
| 242 'sklearn_linear_model' : getattr(sklearn, 'linear_model'), | 365 'sklearn_linear_model': getattr(sklearn, 'linear_model'), |
| 243 'sklearn_cluster' : getattr(sklearn, 'cluster'), | 366 'sklearn_cluster': getattr(sklearn, 'cluster'), |
| 244 'sklearn_decomposition' : getattr(sklearn, 'decomposition'), | 367 'sklearn_decomposition': getattr(sklearn, 'decomposition'), |
| 245 'sklearn_preprocessing' : getattr(sklearn, 'preprocessing'), | 368 'sklearn_preprocessing': getattr(sklearn, 'preprocessing'), |
| 246 'sklearn_feature_selection' : getattr(sklearn, 'feature_selection'), | 369 'sklearn_feature_selection': |
| 247 'sklearn_kernel_approximation' : getattr(sklearn, 'kernel_approximation'), | 370 getattr(sklearn, 'feature_selection'), |
| 371 'sklearn_kernel_approximation': | |
| 372 getattr(sklearn, 'kernel_approximation'), | |
| 248 'skrebate_ReliefF': getattr(skrebate, 'ReliefF'), | 373 'skrebate_ReliefF': getattr(skrebate, 'ReliefF'), |
| 249 'skrebate_SURF': getattr(skrebate, 'SURF'), | 374 'skrebate_SURF': getattr(skrebate, 'SURF'), |
| 250 'skrebate_SURFstar': getattr(skrebate, 'SURFstar'), | 375 'skrebate_SURFstar': getattr(skrebate, 'SURFstar'), |
| 251 'skrebate_MultiSURF': getattr(skrebate, 'MultiSURF'), | 376 'skrebate_MultiSURF': getattr(skrebate, 'MultiSURF'), |
| 252 'skrebate_MultiSURFstar': getattr(skrebate, 'MultiSURFstar'), | 377 'skrebate_MultiSURFstar': getattr(skrebate, 'MultiSURFstar'), |
| 253 'skrebate_TuRF': getattr(skrebate, 'TuRF'), | 378 'skrebate_TuRF': getattr(skrebate, 'TuRF'), |
| 254 'xgboost_XGBClassifier' : getattr(xgboost, 'XGBClassifier'), | 379 'xgboost_XGBClassifier': getattr(xgboost, 'XGBClassifier'), |
| 255 'xgboost_XGBRegressor' : getattr(xgboost, 'XGBRegressor') | 380 'xgboost_XGBRegressor': getattr(xgboost, 'XGBRegressor'), |
| 381 'imblearn_over_sampling': getattr(imblearn, 'over_sampling'), | |
| 382 'imblearn_combine': getattr(imblearn, 'combine') | |
| 256 } | 383 } |
| 257 syms.update(estimator_table) | 384 syms.update(estimator_table) |
| 258 | 385 |
| 259 for key in unwanted: | 386 for key in unwanted: |
| 260 syms.pop(key, None) | 387 syms.pop(key, None) |
| 261 | 388 |
| 262 super(SafeEval, self).__init__(symtable=syms, use_numpy=False, minimal=False, | 389 super(SafeEval, self).__init__( |
| 263 no_if=True, no_for=True, no_while=True, no_try=True, | 390 symtable=syms, use_numpy=False, minimal=False, |
| 264 no_functiondef=True, no_ifexp=True, no_listcomp=False, | 391 no_if=True, no_for=True, no_while=True, no_try=True, |
| 265 no_augassign=False, no_assert=True, no_delete=True, | 392 no_functiondef=True, no_ifexp=True, no_listcomp=False, |
| 266 no_raise=True, no_print=True) | 393 no_augassign=False, no_assert=True, no_delete=True, |
| 267 | 394 no_raise=True, no_print=True) |
| 268 | 395 |
| 269 | 396 |
| 270 def get_estimator(estimator_json): | 397 def get_estimator(estimator_json): |
| 271 | 398 """Return a sklearn or compatible estimator from Galaxy tool inputs |
| 399 """ | |
| 272 estimator_module = estimator_json['selected_module'] | 400 estimator_module = estimator_json['selected_module'] |
| 273 | 401 |
| 274 if estimator_module == 'customer_estimator': | 402 if estimator_module == 'custom_estimator': |
| 275 c_estimator = estimator_json['c_estimator'] | 403 c_estimator = estimator_json['c_estimator'] |
| 276 with open(c_estimator, 'rb') as model_handler: | 404 with open(c_estimator, 'rb') as model_handler: |
| 277 new_model = load_model(model_handler) | 405 new_model = load_model(model_handler) |
| 278 return new_model | 406 return new_model |
| 279 | 407 |
| 408 if estimator_module == "binarize_target": | |
| 409 wrapped_estimator = estimator_json['wrapped_estimator'] | |
| 410 with open(wrapped_estimator, 'rb') as model_handler: | |
| 411 wrapped_estimator = load_model(model_handler) | |
| 412 options = {} | |
| 413 if estimator_json['z_score'] is not None: | |
| 414 options['z_score'] = estimator_json['z_score'] | |
| 415 if estimator_json['value'] is not None: | |
| 416 options['value'] = estimator_json['value'] | |
| 417 options['less_is_positive'] = estimator_json['less_is_positive'] | |
| 418 if estimator_json['clf_or_regr'] == 'BinarizeTargetClassifier': | |
| 419 klass = try_get_attr('iraps_classifier', | |
| 420 'BinarizeTargetClassifier') | |
| 421 else: | |
| 422 klass = try_get_attr('iraps_classifier', | |
| 423 'BinarizeTargetRegressor') | |
| 424 return klass(wrapped_estimator, **options) | |
| 425 | |
| 280 estimator_cls = estimator_json['selected_estimator'] | 426 estimator_cls = estimator_json['selected_estimator'] |
| 281 | 427 |
| 282 if estimator_module == 'xgboost': | 428 if estimator_module == 'xgboost': |
| 283 cls = getattr(xgboost, estimator_cls) | 429 klass = getattr(xgboost, estimator_cls) |
| 284 else: | 430 else: |
| 285 module = getattr(sklearn, estimator_module) | 431 module = getattr(sklearn, estimator_module) |
| 286 cls = getattr(module, estimator_cls) | 432 klass = getattr(module, estimator_cls) |
| 287 | 433 |
| 288 estimator = cls() | 434 estimator = klass() |
| 289 | 435 |
| 290 estimator_params = estimator_json['text_params'].strip() | 436 estimator_params = estimator_json['text_params'].strip() |
| 291 if estimator_params != '': | 437 if estimator_params != '': |
| 292 try: | 438 try: |
| 439 safe_eval = SafeEval() | |
| 293 params = safe_eval('dict(' + estimator_params + ')') | 440 params = safe_eval('dict(' + estimator_params + ')') |
| 294 except ValueError: | 441 except ValueError: |
| 295 sys.exit("Unsupported parameter input: `%s`" % estimator_params) | 442 sys.exit("Unsupported parameter input: `%s`" % estimator_params) |
| 296 estimator.set_params(**params) | 443 estimator.set_params(**params) |
| 297 if 'n_jobs' in estimator.get_params(): | 444 if 'n_jobs' in estimator.get_params(): |
| 299 | 446 |
| 300 return estimator | 447 return estimator |
| 301 | 448 |
| 302 | 449 |
| 303 def get_cv(cv_json): | 450 def get_cv(cv_json): |
| 304 """ | 451 """ Return CV splitter from Galaxy tool inputs |
| 305 cv_json: | 452 |
| 306 e.g.: | 453 Parameters |
| 454 ---------- | |
| 455 cv_json : dict | |
| 456 From Galaxy tool inputs. | |
| 457 e.g.: | |
| 307 { | 458 { |
| 308 'selected_cv': 'StratifiedKFold', | 459 'selected_cv': 'StratifiedKFold', |
| 309 'n_splits': 3, | 460 'n_splits': 3, |
| 310 'shuffle': True, | 461 'shuffle': True, |
| 311 'random_state': 0 | 462 'random_state': 0 |
| 313 """ | 464 """ |
| 314 cv = cv_json.pop('selected_cv') | 465 cv = cv_json.pop('selected_cv') |
| 315 if cv == 'default': | 466 if cv == 'default': |
| 316 return cv_json['n_splits'], None | 467 return cv_json['n_splits'], None |
| 317 | 468 |
| 318 groups = cv_json.pop('groups', None) | 469 groups = cv_json.pop('groups_selector', None) |
| 319 if groups: | 470 if groups is not None: |
| 320 groups = groups.strip() | 471 infile_g = groups['infile_g'] |
| 321 if groups != '': | 472 header = 'infer' if groups['header_g'] else None |
| 322 if groups.startswith('__ob__'): | 473 column_option = (groups['column_selector_options_g'] |
| 323 groups = groups[6:] | 474 ['selected_column_selector_option_g']) |
| 324 if groups.endswith('__cb__'): | 475 if column_option in ['by_index_number', 'all_but_by_index_number', |
| 325 groups = groups[:-6] | 476 'by_header_name', 'all_but_by_header_name']: |
| 326 groups = [int(x.strip()) for x in groups.split(',')] | 477 c = groups['column_selector_options_g']['col_g'] |
| 478 else: | |
| 479 c = None | |
| 480 groups = read_columns( | |
| 481 infile_g, | |
| 482 c=c, | |
| 483 c_option=column_option, | |
| 484 sep='\t', | |
| 485 header=header, | |
| 486 parse_dates=True) | |
| 487 groups = groups.ravel() | |
| 327 | 488 |
| 328 for k, v in cv_json.items(): | 489 for k, v in cv_json.items(): |
| 329 if v == '': | 490 if v == '': |
| 330 cv_json[k] = None | 491 cv_json[k] = None |
| 331 | 492 |
| 339 | 500 |
| 340 test_size = cv_json.get('test_size', None) | 501 test_size = cv_json.get('test_size', None) |
| 341 if test_size and test_size > 1.0: | 502 if test_size and test_size > 1.0: |
| 342 cv_json['test_size'] = int(test_size) | 503 cv_json['test_size'] = int(test_size) |
| 343 | 504 |
| 344 cv_class = getattr(model_selection, cv) | 505 if cv == 'OrderedKFold': |
| 506 cv_class = try_get_attr('model_validations', 'OrderedKFold') | |
| 507 elif cv == 'RepeatedOrderedKFold': | |
| 508 cv_class = try_get_attr('model_validations', 'RepeatedOrderedKFold') | |
| 509 else: | |
| 510 cv_class = getattr(model_selection, cv) | |
| 345 splitter = cv_class(**cv_json) | 511 splitter = cv_class(**cv_json) |
| 346 | 512 |
| 347 return splitter, groups | 513 return splitter, groups |
| 348 | 514 |
| 349 | 515 |
| 350 # needed when sklearn < v0.20 | 516 # needed when sklearn < v0.20 |
| 351 def balanced_accuracy_score(y_true, y_pred): | 517 def balanced_accuracy_score(y_true, y_pred): |
| 518 """Compute balanced accuracy score, which is now available in | |
| 519 scikit-learn from v0.20.0. | |
| 520 """ | |
| 352 C = metrics.confusion_matrix(y_true, y_pred) | 521 C = metrics.confusion_matrix(y_true, y_pred) |
| 353 with np.errstate(divide='ignore', invalid='ignore'): | 522 with np.errstate(divide='ignore', invalid='ignore'): |
| 354 per_class = np.diag(C) / C.sum(axis=1) | 523 per_class = np.diag(C) / C.sum(axis=1) |
| 355 if np.any(np.isnan(per_class)): | 524 if np.any(np.isnan(per_class)): |
| 356 warnings.warn('y_pred contains classes not in y_true') | 525 warnings.warn('y_pred contains classes not in y_true') |
| 358 score = np.mean(per_class) | 527 score = np.mean(per_class) |
| 359 return score | 528 return score |
| 360 | 529 |
| 361 | 530 |
| 362 def get_scoring(scoring_json): | 531 def get_scoring(scoring_json): |
| 363 | 532 """Return single sklearn scorer class |
| 533 or multiple scorers in dictionary | |
| 534 """ | |
| 364 if scoring_json['primary_scoring'] == 'default': | 535 if scoring_json['primary_scoring'] == 'default': |
| 365 return None | 536 return None |
| 366 | 537 |
| 367 my_scorers = metrics.SCORERS | 538 my_scorers = metrics.SCORERS |
| 539 my_scorers['binarize_auc_scorer'] =\ | |
| 540 try_get_attr('iraps_classifier', 'binarize_auc_scorer') | |
| 541 my_scorers['binarize_average_precision_scorer'] =\ | |
| 542 try_get_attr('iraps_classifier', 'binarize_average_precision_scorer') | |
| 368 if 'balanced_accuracy' not in my_scorers: | 543 if 'balanced_accuracy' not in my_scorers: |
| 369 my_scorers['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score) | 544 my_scorers['balanced_accuracy'] =\ |
| 545 metrics.make_scorer(balanced_accuracy_score) | |
| 370 | 546 |
| 371 if scoring_json['secondary_scoring'] != 'None'\ | 547 if scoring_json['secondary_scoring'] != 'None'\ |
| 372 and scoring_json['secondary_scoring'] != scoring_json['primary_scoring']: | 548 and scoring_json['secondary_scoring'] !=\ |
| 373 scoring = {} | 549 scoring_json['primary_scoring']: |
| 374 scoring['primary'] = my_scorers[scoring_json['primary_scoring']] | 550 return_scoring = {} |
| 551 primary_scoring = scoring_json['primary_scoring'] | |
| 552 return_scoring[primary_scoring] = my_scorers[primary_scoring] | |
| 375 for scorer in scoring_json['secondary_scoring'].split(','): | 553 for scorer in scoring_json['secondary_scoring'].split(','): |
| 376 if scorer != scoring_json['primary_scoring']: | 554 if scorer != scoring_json['primary_scoring']: |
| 377 scoring[scorer] = my_scorers[scorer] | 555 return_scoring[scorer] = my_scorers[scorer] |
| 378 return scoring | 556 return return_scoring |
| 379 | 557 |
| 380 return my_scorers[scoring_json['primary_scoring']] | 558 return my_scorers[scoring_json['primary_scoring']] |
| 559 | |
| 560 | |
| 561 def get_search_params(estimator): | |
| 562 """Format the output of `estimator.get_params()` | |
| 563 """ | |
| 564 params = estimator.get_params() | |
| 565 results = [] | |
| 566 for k, v in params.items(): | |
| 567 # params below won't be shown for search in the searchcv tool | |
| 568 keywords = ('n_jobs', 'pre_dispatch', 'memory', 'steps', | |
| 569 'nthread', 'verbose') | |
| 570 if k.endswith(keywords): | |
| 571 results.append(['*', k, k+": "+repr(v)]) | |
| 572 else: | |
| 573 results.append(['@', k, k+": "+repr(v)]) | |
| 574 results.append( | |
| 575 ["", "Note:", | |
| 576 "@, params eligible for search in searchcv tool."]) | |
| 577 | |
| 578 return results | |
| 579 | |
| 580 | |
| 581 def try_get_attr(module, name): | |
| 582 """try to get attribute from a custom module | |
| 583 | |
| 584 Parameters | |
| 585 ---------- | |
| 586 module : str | |
| 587 Module name | |
| 588 name : str | |
| 589 Attribute (class/function) name. | |
| 590 | |
| 591 Returns | |
| 592 ------- | |
| 593 class or function | |
| 594 """ | |
| 595 mod = sys.modules.get(module, None) | |
| 596 if mod: | |
| 597 return getattr(mod, name) | |
| 598 else: | |
| 599 raise Exception("No module named %s." % module) |
