sklearn_feature_selection: utils.py comparison

comparison utils.py @ 17:2bbbac61e48d draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d

author	bgruening
date	Sun, 30 Dec 2018 01:57:11 -0500
parents	026667802750
children	ec25331946b8

comparison

equal deleted inserted replaced

-:328a8d547ca2
+:2bbbac61e48d
-import sys
+import json
+import numpy as np
 import os
 import pandas
+import pickle
 import re
-import pickle
-import warnings
-import numpy as np
-import xgboost
 import scipy
 import sklearn
+import sys
+import warnings
+import xgboost
 from asteval import Interpreter, make_symbol_table
-from sklearn import (cluster, decomposition, ensemble, feature_extraction, feature_selection,
+from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,
-gaussian_process, kernel_approximation, metrics,
+feature_selection, gaussian_process, kernel_approximation, metrics,
 model_selection, naive_bayes, neighbors, pipeline, preprocessing,
 svm, linear_model, tree, discriminant_analysis)
+try:
+import skrebate
+except ModuleNotFoundError:
+pass
 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
+try:
+sk_whitelist
+except NameError:
+sk_whitelist = None
 class SafePickler(pickle.Unpickler):
 """
 Used to safely deserialize scikit-learn model objects serialized by cPickle.dump
 Usage:
 eg.: SafePickler.load(pickled_file_object)
 """
 def find_class(self, module, name):
+# sk_whitelist could be read from tool
+global sk_whitelist
+if not sk_whitelist:
+whitelist_file = os.path.join(os.path.dirname(__file__), 'sk_whitelist.json')
+with open(whitelist_file, 'r') as f:
+sk_whitelist = json.load(f)
 bad_names = ('and', 'as', 'assert', 'break', 'class', 'continue',
 'def', 'del', 'elif', 'else', 'except', 'exec',
 'finally', 'for', 'from', 'global', 'if', 'import',
 'in', 'is', 'lambda', 'not', 'or', 'pass', 'print',
 fullname = module + '.' + name
 if (fullname in good_names)\
 or  (   (   module.startswith('sklearn.')
 or module.startswith('xgboost.')
 or module.startswith('skrebate.')
+or module.startswith('imblearn')
 or module.startswith('numpy.')
 or module == 'numpy'
 )
 and (name not in bad_names)
 ):
 # TODO: replace with a whitelist checker
-if fullname not in sk_whitelist['SK_NAMES'] + sk_whitelist['SKR_NAMES'] + sk_whitelist['XGB_NAMES'] + sk_whitelist['NUMPY_NAMES'] + good_names:
+if fullname not in sk_whitelist['SK_NAMES'] + sk_whitelist['SKR_NAMES'] + sk_whitelist['XGB_NAMES'] + sk_whitelist['NUMPY_NAMES'] + sk_whitelist['IMBLEARN_NAMES'] + good_names:
 print("Warning: global %s is not in pickler whitelist yet and will loss support soon. Contact tool author or leave a message at github.com" % fullname)
 mod = sys.modules[module]
 return getattr(mod, name)
 raise pickle.UnpicklingError("global '%s' is forbidden" % fullname)
 y = data.values
 if return_df:
 return y, data
 else:
 return y
-return y
 ## generate an instance for one of sklearn.feature_selection classes
 def feature_selector(inputs):
-selector = inputs["selected_algorithm"]
+selector = inputs['selected_algorithm']
 selector = getattr(sklearn.feature_selection, selector)
-options = inputs["options"]
+options = inputs['options']
 if inputs['selected_algorithm'] == 'SelectFromModel':
 if not options['threshold'] or options['threshold'] == 'None':
 options['threshold'] = None
+else:
+try:
+options['threshold'] = float(options['threshold'])
+except ValueError:
+pass
 if inputs['model_inputter']['input_mode'] == 'prefitted':
 model_file = inputs['model_inputter']['fitted_estimator']
 with open(model_file, 'rb') as model_handler:
 fitted_estimator = load_model(model_handler)
 new_selector = selector(fitted_estimator, prefit=True, **options)
 else:
-estimator_json = inputs['model_inputter']["estimator_selector"]
+estimator_json = inputs['model_inputter']['estimator_selector']
 estimator = get_estimator(estimator_json)
 new_selector = selector(estimator, **options)
 elif inputs['selected_algorithm'] == 'RFE':
-estimator = get_estimator(inputs["estimator_selector"])
+estimator = get_estimator(inputs['estimator_selector'])
+step = options.get('step', None)
+if step and step >= 1.0:
+options['step'] = int(step)
 new_selector = selector(estimator, **options)
 elif inputs['selected_algorithm'] == 'RFECV':
 options['scoring'] = get_scoring(options['scoring'])
 options['n_jobs'] = N_JOBS
-options['cv'] = get_cv(options['cv'].strip())
+splitter, groups = get_cv(options.pop('cv_selector'))
-estimator = get_estimator(inputs["estimator_selector"])
+# TODO support group cv splitters
+options['cv'] = splitter
+step = options.get('step', None)
+if step and step >= 1.0:
+options['step'] = int(step)
+estimator = get_estimator(inputs['estimator_selector'])
 new_selector = selector(estimator, **options)
-elif inputs['selected_algorithm'] == "VarianceThreshold":
+elif inputs['selected_algorithm'] == 'VarianceThreshold':
 new_selector = selector(**options)
 else:
-score_func = inputs["score_func"]
+score_func = inputs['score_func']
 score_func = getattr(sklearn.feature_selection, score_func)
 new_selector = selector(score_func, **options)
 return new_selector
 def get_X_y(params, file1, file2):
-input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]
+input_type = params['selected_tasks']['selected_algorithms']['input_options']['selected_input']
-if input_type == "tabular":
+if input_type == 'tabular':
-header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None
+header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header1'] else None
-column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
+column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['selected_column_selector_option']
-if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
+if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
-c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"]
+c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['col1']
 else:
 c = None
 X = read_columns(
 file1,
 c=c,
 parse_dates=True
 )
 else:
 X = mmread(file1)
-header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None
+header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header2'] else None
-column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
+column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['selected_column_selector_option2']
-if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
+if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
-c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["col2"]
+c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['col2']
 else:
 c = None
 y = read_columns(
 file2,
 c=c,
 return X, y
 class SafeEval(Interpreter):
-def __init__(self, load_scipy=False, load_numpy=False):
+def __init__(self, load_scipy=False, load_numpy=False, load_estimators=False):
 # File opening and other unneeded functions could be dropped
 unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']
 # Allowed symbol table. Add more if needed.
 'standard_gamma', 'standard_normal', 'standard_t', 'triangular', 'uniform',
 'vonmises', 'wald', 'weibull', 'zipf']
 for f in from_numpy_random:
 syms['np_random_' + f] = getattr(np.random, f)
+if load_estimators:
+estimator_table = {
+'sklearn_svm' : getattr(sklearn, 'svm'),
+'sklearn_tree' : getattr(sklearn, 'tree'),
+'sklearn_ensemble' : getattr(sklearn, 'ensemble'),
+'sklearn_neighbors' : getattr(sklearn, 'neighbors'),
+'sklearn_naive_bayes' : getattr(sklearn, 'naive_bayes'),
+'sklearn_linear_model' : getattr(sklearn, 'linear_model'),
+'sklearn_cluster' : getattr(sklearn, 'cluster'),
+'sklearn_decomposition' : getattr(sklearn, 'decomposition'),
+'sklearn_preprocessing' : getattr(sklearn, 'preprocessing'),
+'sklearn_feature_selection' : getattr(sklearn, 'feature_selection'),
+'sklearn_kernel_approximation' : getattr(sklearn, 'kernel_approximation'),
+'skrebate_ReliefF': getattr(skrebate, 'ReliefF'),
+'skrebate_SURF': getattr(skrebate, 'SURF'),
+'skrebate_SURFstar': getattr(skrebate, 'SURFstar'),
+'skrebate_MultiSURF': getattr(skrebate, 'MultiSURF'),
+'skrebate_MultiSURFstar': getattr(skrebate, 'MultiSURFstar'),
+'skrebate_TuRF': getattr(skrebate, 'TuRF'),
+'xgboost_XGBClassifier' : getattr(xgboost, 'XGBClassifier'),
+'xgboost_XGBRegressor' : getattr(xgboost, 'XGBRegressor')
+}
+syms.update(estimator_table)
 for key in unwanted:
 syms.pop(key, None)
 super(SafeEval, self).__init__(symtable=syms, use_numpy=False, minimal=False,
 no_if=True, no_for=True, no_while=True, no_try=True,
 no_functiondef=True, no_ifexp=True, no_listcomp=False,
 no_augassign=False, no_assert=True, no_delete=True,
 no_raise=True, no_print=True)
-def get_search_params(params_builder):
-search_params = {}
-safe_eval = SafeEval(load_scipy=True, load_numpy=True)
-for p in params_builder['param_set']:
-search_p = p['search_param_selector']['search_p']
-if search_p.strip() == '':
-continue
-param_type = p['search_param_selector']['selected_param_type']
-lst = search_p.split(":")
-assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
-literal = lst[1].strip()
-ev = safe_eval(literal)
-if param_type == "final_estimator_p":
-search_params["estimator__" + lst[0].strip()] = ev
-else:
-search_params["preprocessing_" + param_type[5:6] + "__" + lst[0].strip()] = ev
-return search_params
 def get_estimator(estimator_json):
 estimator_module = estimator_json['selected_module']
+if estimator_module == 'customer_estimator':
+c_estimator = estimator_json['c_estimator']
+with open(c_estimator, 'rb') as model_handler:
+new_model = load_model(model_handler)
+return new_model
 estimator_cls = estimator_json['selected_estimator']
-if estimator_module == "xgboost":
+if estimator_module == 'xgboost':
 cls = getattr(xgboost, estimator_cls)
 else:
 module = getattr(sklearn, estimator_module)
 cls = getattr(module, estimator_cls)
 estimator = cls()
 estimator_params = estimator_json['text_params'].strip()
-if estimator_params != "":
+if estimator_params != '':
 try:
 params = safe_eval('dict(' + estimator_params + ')')
 except ValueError:
 sys.exit("Unsupported parameter input: `%s`" % estimator_params)
 estimator.set_params(**params)
 estimator.set_params(n_jobs=N_JOBS)
 return estimator
-def get_cv(literal):
+def get_cv(cv_json):
-safe_eval = SafeEval()
+"""
-if literal == "":
+cv_json:
-return None
+e.g.:
-if literal.isdigit():
+{
-return int(literal)
+'selected_cv': 'StratifiedKFold',
-m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
+'n_splits': 3,
-if m:
+'shuffle': True,
-my_class = getattr(model_selection, m.group('method'))
+'random_state': 0
-args = safe_eval('dict('+ m.group('args') + ')')
+}
-return my_class(**args)
+"""
-sys.exit("Unsupported CV input: %s" % literal)
+cv = cv_json.pop('selected_cv')
+if cv == 'default':
+return cv_json['n_splits'], None
+groups = cv_json.pop('groups', None)
+if groups:
+groups = groups.strip()
+if groups != '':
+if groups.startswith('__ob__'):
+groups = groups[6:]
+if groups.endswith('__cb__'):
+groups = groups[:-6]
+groups = [int(x.strip()) for x in groups.split(',')]
+for k, v in cv_json.items():
+if v == '':
+cv_json[k] = None
+test_fold = cv_json.get('test_fold', None)
+if test_fold:
+if test_fold.startswith('__ob__'):
+test_fold = test_fold[6:]
+if test_fold.endswith('__cb__'):
+test_fold = test_fold[:-6]
+cv_json['test_fold'] = [int(x.strip()) for x in test_fold.split(',')]
+test_size = cv_json.get('test_size', None)
+if test_size and test_size > 1.0:
+cv_json['test_size'] = int(test_size)
+cv_class = getattr(model_selection, cv)
+splitter = cv_class(**cv_json)
+return splitter, groups
+# needed when sklearn < v0.20
+def balanced_accuracy_score(y_true, y_pred):
+C = metrics.confusion_matrix(y_true, y_pred)
+with np.errstate(divide='ignore', invalid='ignore'):
+per_class = np.diag(C) / C.sum(axis=1)
+if np.any(np.isnan(per_class)):
+warnings.warn('y_pred contains classes not in y_true')
+per_class = per_class[~np.isnan(per_class)]
+score = np.mean(per_class)
+return score
 def get_scoring(scoring_json):
-def balanced_accuracy_score(y_true, y_pred):
-C = metrics.confusion_matrix(y_true, y_pred)
+if scoring_json['primary_scoring'] == 'default':
-with np.errstate(divide='ignore', invalid='ignore'):
-per_class = np.diag(C) / C.sum(axis=1)
-if np.any(np.isnan(per_class)):
-warnings.warn('y_pred contains classes not in y_true')
-per_class = per_class[~np.isnan(per_class)]
-score = np.mean(per_class)
-return score
-if scoring_json['primary_scoring'] == "default":
 return None
 my_scorers = metrics.SCORERS
 if 'balanced_accuracy' not in my_scorers:
 my_scorers['balanced_accuracy'] = metrics.make_scorer(balanced_accuracy_score)

Mercurial > repos > bgruening > sklearn_feature_selection

comparison utils.py @ 17:2bbbac61e48d draft