Mercurial > repos > bgruening > sklearn_data_preprocess

diff search_model_validation.py @ 23:d6b8103c909c draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 57f4407e278a615f47a377a3328782b1d8e0b54d
author: bgruening
date: Sun, 30 Dec 2018 01:58:00 -0500
children: 9e43ee712723
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/search_model_validation.py	Sun Dec 30 01:58:00 2018 -0500
@@ -0,0 +1,234 @@
+import imblearn
+import json
+import numpy as np
+import os
+import pandas
+import pickle
+import skrebate
+import sklearn
+import sys
+import xgboost
+import warnings
+from imblearn import under_sampling, over_sampling, combine
+from imblearn.pipeline import Pipeline as imbPipeline
+from sklearn import (cluster, compose, decomposition, ensemble, feature_extraction,
+                    feature_selection, gaussian_process, kernel_approximation, metrics,
+                    model_selection, naive_bayes, neighbors, pipeline, preprocessing,
+                    svm, linear_model, tree, discriminant_analysis)
+from sklearn.exceptions import FitFailedWarning
+from sklearn.externals import joblib
+from utils import get_cv, get_scoring, get_X_y, load_model, read_columns, SafeEval
+
+
+N_JOBS = int(os.environ.get('GALAXY_SLOTS', 1))
+
+
+def get_search_params(params_builder):
+    search_params = {}
+    safe_eval = SafeEval(load_scipy=True, load_numpy=True)
+    safe_eval_es = SafeEval(load_estimators=True)
+
+    for p in params_builder['param_set']:
+        search_p = p['search_param_selector']['search_p']
+        if search_p.strip() == '':
+            continue
+        param_type = p['search_param_selector']['selected_param_type']
+
+        lst = search_p.split(':')
+        assert (len(lst) == 2), "Error, make sure there is one and only one colon in search parameter input."
+        literal = lst[1].strip()
+        param_name = lst[0].strip()
+        if param_name:
+            if param_name.lower() == 'n_jobs':
+                sys.exit("Parameter `%s` is invalid for search." %param_name)
+            elif not param_name.endswith('-'):
+                ev = safe_eval(literal)
+                if param_type == 'final_estimator_p':
+                    search_params['estimator__' + param_name] = ev
+                else:
+                    search_params['preprocessing_' + param_type[5:6] + '__' + param_name] = ev
+            else:
+                # only for estimator eval, add `-` to the end of param
+                #TODO maybe add regular express check
+                ev = safe_eval_es(literal)
+                for obj in ev:
+                    if 'n_jobs' in obj.get_params():
+                        obj.set_params( n_jobs=N_JOBS )
+                if param_type == 'final_estimator_p':
+                    search_params['estimator__' + param_name[:-1]] = ev
+                else:
+                    search_params['preprocessing_' + param_type[5:6] + '__' + param_name[:-1]] = ev
+        elif param_type != 'final_estimator_p':
+            #TODO regular express check ?
+            ev = safe_eval_es(literal)
+            preprocessors = [preprocessing.StandardScaler(), preprocessing.Binarizer(), preprocessing.Imputer(),
+                            preprocessing.MaxAbsScaler(), preprocessing.Normalizer(), preprocessing.MinMaxScaler(),
+                            preprocessing.PolynomialFeatures(),preprocessing.RobustScaler(),
+                            feature_selection.SelectKBest(), feature_selection.GenericUnivariateSelect(),
+                            feature_selection.SelectPercentile(), feature_selection.SelectFpr(), feature_selection.SelectFdr(),
+                            feature_selection.SelectFwe(), feature_selection.VarianceThreshold(),
+                            decomposition.FactorAnalysis(random_state=0), decomposition.FastICA(random_state=0), decomposition.IncrementalPCA(),
+                            decomposition.KernelPCA(random_state=0, n_jobs=N_JOBS), decomposition.LatentDirichletAllocation(random_state=0, n_jobs=N_JOBS),
+                            decomposition.MiniBatchDictionaryLearning(random_state=0, n_jobs=N_JOBS),
+                            decomposition.MiniBatchSparsePCA(random_state=0, n_jobs=N_JOBS), decomposition.NMF(random_state=0),
+                            decomposition.PCA(random_state=0), decomposition.SparsePCA(random_state=0, n_jobs=N_JOBS),
+                            decomposition.TruncatedSVD(random_state=0),
+                            kernel_approximation.Nystroem(random_state=0), kernel_approximation.RBFSampler(random_state=0),
+                            kernel_approximation.AdditiveChi2Sampler(), kernel_approximation.SkewedChi2Sampler(random_state=0),
+                            cluster.FeatureAgglomeration(),
+                            skrebate.ReliefF(n_jobs=N_JOBS), skrebate.SURF(n_jobs=N_JOBS), skrebate.SURFstar(n_jobs=N_JOBS),
+                            skrebate.MultiSURF(n_jobs=N_JOBS), skrebate.MultiSURFstar(n_jobs=N_JOBS),
+                            imblearn.under_sampling.ClusterCentroids(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.CondensedNearestNeighbour(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.EditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.RepeatedEditedNearestNeighbours(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.AllKNN(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.InstanceHardnessThreshold(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.NearMiss(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.NeighbourhoodCleaningRule(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.OneSidedSelection(random_state=0, n_jobs=N_JOBS),
+                            imblearn.under_sampling.RandomUnderSampler(random_state=0),
+                            imblearn.under_sampling.TomekLinks(random_state=0, n_jobs=N_JOBS),
+                            imblearn.over_sampling.ADASYN(random_state=0, n_jobs=N_JOBS),
+                            imblearn.over_sampling.RandomOverSampler(random_state=0),
+                            imblearn.over_sampling.SMOTE(random_state=0, n_jobs=N_JOBS),
+                            imblearn.over_sampling.SVMSMOTE(random_state=0, n_jobs=N_JOBS),
+                            imblearn.over_sampling.BorderlineSMOTE(random_state=0, n_jobs=N_JOBS),
+                            imblearn.over_sampling.SMOTENC(categorical_features=[], random_state=0, n_jobs=N_JOBS),
+                            imblearn.combine.SMOTEENN(random_state=0), imblearn.combine.SMOTETomek(random_state=0)]
+            newlist = []
+            for obj in ev:
+                if obj is None:
+                    newlist.append(None)
+                elif obj == 'all_0':
+                    newlist.extend(preprocessors[0:36])
+                elif obj == 'sk_prep_all':      # no KernalCenter()
+                    newlist.extend(preprocessors[0:8])
+                elif obj == 'fs_all':
+                    newlist.extend(preprocessors[8:15])
+                elif obj == 'decomp_all':
+                    newlist.extend(preprocessors[15:26])
+                elif obj == 'k_appr_all':
+                    newlist.extend(preprocessors[26:30])
+                elif obj == 'reb_all':
+                    newlist.extend(preprocessors[31:36])
+                elif obj == 'imb_all':
+                    newlist.extend(preprocessors[36:55])
+                elif  type(obj) is int and -1 < obj < len(preprocessors):
+                    newlist.append(preprocessors[obj])
+                elif hasattr(obj, 'get_params'):       # user object
+                    if 'n_jobs' in obj.get_params():
+                        newlist.append( obj.set_params(n_jobs=N_JOBS) )
+                    else:
+                        newlist.append(obj)
+                else:
+                    sys.exit("Unsupported preprocessor type: %r" %(obj))
+            search_params['preprocessing_' + param_type[5:6]] = newlist
+        else:
+            sys.exit("Parameter name of the final estimator can't be skipped!")
+
+    return search_params
+
+
+if __name__ == '__main__':
+
+    warnings.simplefilter('ignore')
+
+    input_json_path = sys.argv[1]
+    with open(input_json_path, 'r') as param_handler:
+        params = json.load(param_handler)
+
+    infile_pipeline = sys.argv[2]
+    infile1 = sys.argv[3]
+    infile2 = sys.argv[4]
+    outfile_result = sys.argv[5]
+    if len(sys.argv) > 6:
+        outfile_estimator = sys.argv[6]
+    else:
+        outfile_estimator = None
+
+    params_builder = params['search_schemes']['search_params_builder']
+
+    input_type = params['input_options']['selected_input']
+    if input_type == 'tabular':
+        header = 'infer' if params['input_options']['header1'] else None
+        column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option']
+        if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
+            c = params['input_options']['column_selector_options_1']['col1']
+        else:
+            c = None
+        X = read_columns(
+                infile1,
+                c = c,
+                c_option = column_option,
+                sep='\t',
+                header=header,
+                parse_dates=True
+        )
+    else:
+        X = mmread(open(infile1, 'r'))
+
+    header = 'infer' if params['input_options']['header2'] else None
+    column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2']
+    if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']:
+        c = params['input_options']['column_selector_options_2']['col2']
+    else:
+        c = None
+    y = read_columns(
+            infile2,
+            c = c,
+            c_option = column_option,
+            sep='\t',
+            header=header,
+            parse_dates=True
+    )
+    y = y.ravel()
+
+    optimizer = params['search_schemes']['selected_search_scheme']
+    optimizer = getattr(model_selection, optimizer)
+
+    options = params['search_schemes']['options']
+    splitter, groups = get_cv(options.pop('cv_selector'))
+    if groups is None:
+        options['cv'] = splitter
+    elif groups == '':
+        options['cv'] = list( splitter.split(X, y, groups=None) )
+    else:
+        options['cv'] = list( splitter.split(X, y, groups=groups) )
+    options['n_jobs'] = N_JOBS
+    primary_scoring = options['scoring']['primary_scoring']
+    options['scoring'] = get_scoring(options['scoring'])
+    if options['error_score']:
+        options['error_score'] = 'raise'
+    else:
+        options['error_score'] = np.NaN
+    if options['refit'] and isinstance(options['scoring'], dict):
+        options['refit'] = 'primary'
+    if 'pre_dispatch' in options and options['pre_dispatch'] == '':
+        options['pre_dispatch'] = None
+
+    with open(infile_pipeline, 'rb') as pipeline_handler:
+        pipeline = load_model(pipeline_handler)
+
+    search_params = get_search_params(params_builder)
+    searcher = optimizer(pipeline, search_params, **options)
+
+    if options['error_score'] == 'raise':
+        searcher.fit(X, y)
+    else:
+        warnings.simplefilter('always', FitFailedWarning)
+        with warnings.catch_warnings(record=True) as w:
+            try:
+                searcher.fit(X, y)
+            except ValueError:
+                pass
+            for warning in w:
+                print(repr(warning.message))
+
+    cv_result = pandas.DataFrame(searcher.cv_results_)
+    cv_result.rename(inplace=True, columns={'mean_test_primary': 'mean_test_'+primary_scoring, 'rank_test_primary': 'rank_test_'+primary_scoring})
+    cv_result.to_csv(path_or_buf=outfile_result, sep='\t', header=True, index=False)
+
+    if outfile_estimator:
+        with open(outfile_estimator, 'wb') as output_handler:
+            pickle.dump(searcher.best_estimator_, output_handler, pickle.HIGHEST_PROTOCOL)
author	bgruening
date	Sun, 30 Dec 2018 01:58:00 -0500
parents
children	9e43ee712723