bgruening/sklearn_data_preprocess: diff utils.py @ 23:d6b8103c909c (draft)
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d
| author   | bgruening                       |
| -------- | ------------------------------- |
| date     | Sun, 30 Dec 2018 01:58:00 -0500 |
| parents  | f156acc7239b                    |
| children | 9e43ee712723                    |
--- a/utils.py	Thu Oct 11 03:35:14 2018 -0400
+++ b/utils.py	Sun Dec 30 01:58:00 2018 -0500
@@ -1,21 +1,34 @@
-import sys
+import json
+import numpy as np
 import os
 import pandas
+import pickle
 import re
-import pickle
-import warnings
-import numpy as np
-import xgboost
 import scipy
 import sklearn
+import sys
+import warnings
+import xgboost
+
 from asteval import Interpreter, make_symbol_table
-from sklearn import (cluster, decomposition, ensemble, feature_extraction, feature_selection,
-                     gaussian_process, kernel_approximation, metrics,
+from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,
+                     feature_selection, gaussian_process, kernel_approximation, metrics,
                      model_selection, naive_bayes, neighbors, pipeline, preprocessing,
                      svm, linear_model, tree, discriminant_analysis)
 
+try:
+    import skrebate
+except ModuleNotFoundError:
+    pass
+
+
 N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
 
+try:
+    sk_whitelist
+except NameError:
+    sk_whitelist = None
+
 
 class SafePickler(pickle.Unpickler):
     """
@@ -25,6 +38,13 @@
     """
     def find_class(self, module, name):
+        # sk_whitelist could be read from tool
+        global sk_whitelist
+        if not sk_whitelist:
+            whitelist_file = os.path.join(os.path.dirname(__file__), 'sk_whitelist.json')
+            with open(whitelist_file, 'r') as f:
+                sk_whitelist = json.load(f)
+
         bad_names = ('and', 'as', 'assert', 'break', 'class', 'continue',
                      'def', 'del', 'elif', 'else', 'except', 'exec',
                      'finally', 'for', 'from', 'global', 'if', 'import',
@@ -46,13 +66,14 @@
                 or  (   (   module.startswith('sklearn.')
                             or module.startswith('xgboost.')
                             or module.startswith('skrebate.')
+                            or module.startswith('imblearn')
                             or module.startswith('numpy.')
                             or module == 'numpy'
                         )
                         and (name not in bad_names)
                     ):
             # TODO: replace with a whitelist checker
-            if fullname not in sk_whitelist['SK_NAMES'] + sk_whitelist['SKR_NAMES'] + sk_whitelist['XGB_NAMES'] + sk_whitelist['NUMPY_NAMES'] + good_names:
+            if fullname not in sk_whitelist['SK_NAMES'] + sk_whitelist['SKR_NAMES'] + sk_whitelist['XGB_NAMES'] + sk_whitelist['NUMPY_NAMES'] + sk_whitelist['IMBLEARN_NAMES'] + good_names:
                 print("Warning: global %s is not in pickler whitelist yet and will lose support soon. Contact tool author or leave a message at github.com" % fullname)
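In use, this unpickler stands in for a bare `pickle.load()` when the tool reads a fitted model back from disk, so every global hit by deserialization passes through the whitelist check above. A minimal sketch of the intended call pattern (the file name is a placeholder):

```python
# Illustrative only: route deserialization through the whitelist check in
# SafePickler.find_class instead of calling pickle.load() directly.
with open('fitted_model.pkl', 'rb') as handler:
    model = SafePickler(handler).load()
```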
                 mod = sys.modules[module]
                 return getattr(mod, name)
@@ -83,44 +104,56 @@
         return y, data
     else:
         return y
-    return y
 
 
 ## generate an instance for one of sklearn.feature_selection classes
 def feature_selector(inputs):
-    selector = inputs["selected_algorithm"]
+    selector = inputs['selected_algorithm']
     selector = getattr(sklearn.feature_selection, selector)
-    options = inputs["options"]
+    options = inputs['options']
 
     if inputs['selected_algorithm'] == 'SelectFromModel':
         if not options['threshold'] or options['threshold'] == 'None':
             options['threshold'] = None
+        else:
+            try:
+                options['threshold'] = float(options['threshold'])
+            except ValueError:
+                pass
         if inputs['model_inputter']['input_mode'] == 'prefitted':
             model_file = inputs['model_inputter']['fitted_estimator']
            with open(model_file, 'rb') as model_handler:
                 fitted_estimator = load_model(model_handler)
             new_selector = selector(fitted_estimator, prefit=True, **options)
         else:
-            estimator_json = inputs['model_inputter']["estimator_selector"]
+            estimator_json = inputs['model_inputter']['estimator_selector']
             estimator = get_estimator(estimator_json)
             new_selector = selector(estimator, **options)
 
     elif inputs['selected_algorithm'] == 'RFE':
-        estimator = get_estimator(inputs["estimator_selector"])
+        estimator = get_estimator(inputs['estimator_selector'])
+        step = options.get('step', None)
+        if step and step >= 1.0:
+            options['step'] = int(step)
         new_selector = selector(estimator, **options)
 
     elif inputs['selected_algorithm'] == 'RFECV':
         options['scoring'] = get_scoring(options['scoring'])
         options['n_jobs'] = N_JOBS
-        options['cv'] = get_cv(options['cv'].strip())
-        estimator = get_estimator(inputs["estimator_selector"])
+        splitter, groups = get_cv(options.pop('cv_selector'))
+        # TODO support group cv splitters
+        options['cv'] = splitter
+        step = options.get('step', None)
+        if step and step >= 1.0:
+            options['step'] = int(step)
+        estimator = get_estimator(inputs['estimator_selector'])
         new_selector = selector(estimator, **options)
 
-    elif inputs['selected_algorithm'] == "VarianceThreshold":
+    elif inputs['selected_algorithm'] == 'VarianceThreshold':
         new_selector = selector(**options)
 
     else:
-        score_func = inputs["score_func"]
+        score_func = inputs['score_func']
         score_func = getattr(sklearn.feature_selection, score_func)
         new_selector = selector(score_func, **options)
@@ -128,12 +161,12 @@
 
 
 def get_X_y(params, file1, file2):
-    input_type = params["selected_tasks"]["selected_algorithms"]["input_options"]["selected_input"]
-    if input_type == "tabular":
-        header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header1"] else None
-        column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
-        if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
-            c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_1"]["col1"]
+    input_type = params['selected_tasks']['selected_algorithms']['input_options']['selected_input']
+    if input_type == 'tabular':
+        header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header1'] else None
+        column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['selected_column_selector_option']
+        if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name',
+                             'all_but_by_header_name']:
+            c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_1']['col1']
         else:
             c = None
         X = read_columns(
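For orientation, the RFE and RFECV branches above coerce a float `step` of 1.0 or more to an `int`, since scikit-learn treats an integer step as the number of features to remove per iteration and a float in (0.0, 1.0) as a fraction. A hypothetical `inputs` dict exercising the RFE branch (key names follow the code above; the estimator block and option values are invented for illustration):

```python
# Hypothetical Galaxy tool parameters for the RFE branch.
inputs = {
    'selected_algorithm': 'RFE',
    'estimator_selector': {
        'selected_module': 'svm',
        'selected_estimator': 'SVC',
        'text_params': "kernel='linear'",
    },
    'options': {'n_features_to_select': 5, 'step': 2.0},
}
new_selector = feature_selector(inputs)  # 'step' is coerced to int(2) before RFE is built
```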
@@ -147,10 +180,10 @@
     else:
         X = mmread(file1)
 
-    header = 'infer' if params["selected_tasks"]["selected_algorithms"]["input_options"]["header2"] else None
-    column_option = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
-    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
-        c = params["selected_tasks"]["selected_algorithms"]["input_options"]["column_selector_options_2"]["col2"]
+    header = 'infer' if params['selected_tasks']['selected_algorithms']['input_options']['header2'] else None
+    column_option = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['selected_column_selector_option2']
+    if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
+        c = params['selected_tasks']['selected_algorithms']['input_options']['column_selector_options_2']['col2']
     else:
         c = None
     y = read_columns(
@@ -167,7 +200,7 @@
 
 class SafeEval(Interpreter):
 
-    def __init__(self, load_scipy=False, load_numpy=False):
+    def __init__(self, load_scipy=False, load_numpy=False, load_estimators=False):
 
         # File opening and other unneeded functions could be dropped
         unwanted = ['open', 'type', 'dir', 'id', 'str', 'repr']
@@ -199,6 +232,30 @@
             for f in from_numpy_random:
                 syms['np_random_' + f] = getattr(np.random, f)
 
+        if load_estimators:
+            estimator_table = {
+                'sklearn_svm' : getattr(sklearn, 'svm'),
+                'sklearn_tree' : getattr(sklearn, 'tree'),
+                'sklearn_ensemble' : getattr(sklearn, 'ensemble'),
+                'sklearn_neighbors' : getattr(sklearn, 'neighbors'),
+                'sklearn_naive_bayes' : getattr(sklearn, 'naive_bayes'),
+                'sklearn_linear_model' : getattr(sklearn, 'linear_model'),
+                'sklearn_cluster' : getattr(sklearn, 'cluster'),
+                'sklearn_decomposition' : getattr(sklearn, 'decomposition'),
+                'sklearn_preprocessing' : getattr(sklearn, 'preprocessing'),
+                'sklearn_feature_selection' : getattr(sklearn, 'feature_selection'),
+                'sklearn_kernel_approximation' : getattr(sklearn, 'kernel_approximation'),
+                'skrebate_ReliefF': getattr(skrebate, 'ReliefF'),
+                'skrebate_SURF': getattr(skrebate, 'SURF'),
+                'skrebate_SURFstar': getattr(skrebate, 'SURFstar'),
+                'skrebate_MultiSURF': getattr(skrebate, 'MultiSURF'),
+                'skrebate_MultiSURFstar': getattr(skrebate, 'MultiSURFstar'),
+                'skrebate_TuRF': getattr(skrebate, 'TuRF'),
+                'xgboost_XGBClassifier' : getattr(xgboost, 'XGBClassifier'),
+                'xgboost_XGBRegressor' : getattr(xgboost, 'XGBRegressor')
+            }
+            syms.update(estimator_table)
+
         for key in unwanted:
             syms.pop(key, None)
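With `load_estimators=True`, expressions evaluated inside the sandbox can reach whole modules and classes through the prefixed names registered in `estimator_table`, without any import happening in the evaluated string. A brief sketch of what that enables (the expression itself is illustrative):

```python
# Sketch: 'sklearn_ensemble' resolves to the sklearn.ensemble module inside
# the asteval symbol table, so the string below can build an estimator.
safe_eval = SafeEval(load_estimators=True)
clf = safe_eval("sklearn_ensemble.RandomForestClassifier(n_estimators=100)")
```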
@@ -209,33 +266,20 @@
                          no_raise=True,
                          no_print=True)
 
 
-def get_search_params(params_builder):
-    search_params = {}
-    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
-
-    for p in params_builder['param_set']:
-        search_p = p['search_param_selector']['search_p']
-        if search_p.strip() == '':
-            continue
-        param_type = p['search_param_selector']['selected_param_type']
-
-        lst = search_p.split(":")
-        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
-        literal = lst[1].strip()
-        ev = safe_eval(literal)
-        if param_type == "final_estimator_p":
-            search_params["estimator__" + lst[0].strip()] = ev
-        else:
-            search_params["preprocessing_" + param_type[5:6] + "__" + lst[0].strip()] = ev
-
-    return search_params
-
 def get_estimator(estimator_json):
+    estimator_module = estimator_json['selected_module']
+
+    if estimator_module == 'customer_estimator':
+        c_estimator = estimator_json['c_estimator']
+        with open(c_estimator, 'rb') as model_handler:
+            new_model = load_model(model_handler)
+        return new_model
+
     estimator_cls = estimator_json['selected_estimator']
-    if estimator_module == "xgboost":
+    if estimator_module == 'xgboost':
         cls = getattr(xgboost, estimator_cls)
     else:
         module = getattr(sklearn, estimator_module)
@@ -244,7 +288,7 @@
     estimator = cls()
 
     estimator_params = estimator_json['text_params'].strip()
-    if estimator_params != "":
+    if estimator_params != '':
         try:
             params = safe_eval('dict(' + estimator_params + ')')
         except ValueError:
@@ -256,32 +300,68 @@
     return estimator
 
 
-def get_cv(literal):
-    safe_eval = SafeEval()
-    if literal == "":
-        return None
-    if literal.isdigit():
-        return int(literal)
-    m = re.match(r'^(?P<method>\w+)\((?P<args>.*)\)$', literal)
-    if m:
-        my_class = getattr(model_selection, m.group('method'))
-        args = safe_eval('dict(' + m.group('args') + ')')
-        return my_class(**args)
-    sys.exit("Unsupported CV input: %s" % literal)
+def get_cv(cv_json):
+    """
+    cv_json:
+        e.g.:
+        {
+            'selected_cv': 'StratifiedKFold',
+            'n_splits': 3,
+            'shuffle': True,
+            'random_state': 0
+        }
+    """
+    cv = cv_json.pop('selected_cv')
+    if cv == 'default':
+        return cv_json['n_splits'], None
+
+    groups = cv_json.pop('groups', None)
+    if groups:
+        groups = groups.strip()
+        if groups != '':
+            if groups.startswith('__ob__'):
+                groups = groups[6:]
+            if groups.endswith('__cb__'):
+                groups = groups[:-6]
+            groups = [int(x.strip()) for x in groups.split(',')]
+
+    for k, v in cv_json.items():
+        if v == '':
+            cv_json[k] = None
+
+    test_fold = cv_json.get('test_fold', None)
+    if test_fold:
+        if test_fold.startswith('__ob__'):
+            test_fold = test_fold[6:]
+        if test_fold.endswith('__cb__'):
+            test_fold = test_fold[:-6]
+        cv_json['test_fold'] = [int(x.strip()) for x in test_fold.split(',')]
+
+    test_size = cv_json.get('test_size', None)
+    if test_size and test_size > 1.0:
+        cv_json['test_size'] = int(test_size)
+
+    cv_class = getattr(model_selection, cv)
+    splitter = cv_class(**cv_json)
+
+    return splitter, groups
+
+
+# needed when sklearn < v0.20
+def balanced_accuracy_score(y_true, y_pred):
+    C = metrics.confusion_matrix(y_true, y_pred)
+    with np.errstate(divide='ignore', invalid='ignore'):
+        per_class = np.diag(C) / C.sum(axis=1)
+    if np.any(np.isnan(per_class)):
+        warnings.warn('y_pred contains classes not in y_true')
+        per_class = per_class[~np.isnan(per_class)]
+    score = np.mean(per_class)
+    return score
 
 
 def get_scoring(scoring_json):
-    def balanced_accuracy_score(y_true, y_pred):
-        C = metrics.confusion_matrix(y_true, y_pred)
-        with np.errstate(divide='ignore', invalid='ignore'):
-            per_class = np.diag(C) / C.sum(axis=1)
-        if np.any(np.isnan(per_class)):
-            warnings.warn('y_pred contains classes not in y_true')
-            per_class = per_class[~np.isnan(per_class)]
-        score = np.mean(per_class)
-        return score
 
-    if scoring_json['primary_scoring'] == "default":
+    if scoring_json['primary_scoring'] == 'default':
         return None
 
     my_scorers = metrics.SCORERS
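The docstring's example maps directly onto the new contract: `get_cv` now returns a `(splitter, groups)` pair instead of the old single value. A quick sketch using the docstring's own values:

```python
# StratifiedKFold is instantiated from the remaining keys; groups comes back
# as None because no 'groups' entry was supplied.
splitter, groups = get_cv({
    'selected_cv': 'StratifiedKFold',
    'n_splits': 3,
    'shuffle': True,
    'random_state': 0,
})
```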