Mercurial > repos > bgruening > sklearn_model_validation
diff model_validation.xml @ 17:cf9aa11b91c8 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit ab963ec9498bd05d2fb2f24f75adb2fccae7958c
author | bgruening |
---|---|
date | Wed, 15 May 2019 07:42:07 -0400 |
parents | 86e1e2874460 |
children | efbec977a47d |
line wrap: on
line diff
--- a/model_validation.xml Sun Dec 30 02:02:32 2018 -0500 +++ b/model_validation.xml Wed May 15 07:42:07 2019 -0400 @@ -15,15 +15,27 @@ <inputs name="inputs" /> <configfile name="sklearn_model_validation_script"> <![CDATA[ -import sys -import os +import imblearn import json -import pandas import numpy as np -from sklearn import preprocessing, model_selection, svm, linear_model, ensemble, naive_bayes, tree, neighbors -from sklearn.pipeline import Pipeline +import pandas as pd +import pickle +import pprint +import skrebate +import sys +import warnings +import xgboost +from mlxtend import classifier, regressor +from sklearn import ( + cluster, compose, decomposition, ensemble, feature_extraction, + feature_selection, gaussian_process, kernel_approximation, metrics, + model_selection, naive_bayes, neighbors, pipeline, preprocessing, + svm, linear_model, tree, discriminant_analysis) -exec(open('$__tool_directory__/utils.py').read(), globals()) +sys.path.insert(0, '$__tool_directory__') +from utils import SafeEval, get_cv, get_scoring, load_model, read_columns + +N_JOBS = int(__import__('os').environ.get('GALAXY_SLOTS', 1)) warnings.filterwarnings('ignore') @@ -33,6 +45,12 @@ with open(input_json_path, 'r') as param_handler: params = json.load(param_handler) +#if $model_validation_functions.options.cv_selector.selected_cv\ + in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']: +params['model_validation_functions']['options']['cv_selector']['groups_selector']['infile_g'] =\ + '$model_validation_functions.options.cv_selector.groups_selector.infile_g' +#end if + input_type = params['input_options']['selected_input'] if input_type == 'tabular': header = 'infer' if params['input_options']['header1'] else None @@ -47,8 +65,7 @@ c_option = column_option, sep='\t', header=header, - parse_dates=True - ) + parse_dates=True).astype(float) else: X = mmread('$input_options.infile1') @@ -64,45 +81,24 @@ c_option = column_option, sep='\t', header=header, - parse_dates=True -) -y=y.ravel() + parse_dates=True) +y = y.ravel() +## handle options options = params['model_validation_functions']['options'] splitter, groups = get_cv( options.pop('cv_selector') ) -if groups is None: - options['cv'] = splitter -elif groups == '': - options['cv'] = list( splitter.split(X, y, groups=None) ) -else: - options['cv'] = list( splitter.split(X, y, groups=groups) ) +options['cv'] = splitter +options['groups'] = groups options['n_jobs'] = N_JOBS if 'scoring' in options: + primary_scoring = options['scoring']['primary_scoring'] options['scoring'] = get_scoring(options['scoring']) if 'pre_dispatch' in options and options['pre_dispatch'] == '': options['pre_dispatch'] = None -pipeline_steps = [] - -## Set up pre_processor and add to pipeline steps. -if params['pre_processing']['do_pre_processing'] == 'Yes': - preprocessor = params['pre_processing']['pre_processors']['selected_pre_processor'] - pre_processor_options = params['pre_processing']['pre_processors']['options'] - my_class = getattr(preprocessing, preprocessor) - pipeline_steps.append( ('pre_processor', my_class(**pre_processor_options)) ) - -## Set up feature selector and add to pipeline steps. -if params['feature_selection']['do_feature_selection'] == 'Yes': - feature_selector = feature_selector(params['feature_selection']['fs_algorithm_selector']) - pipeline_steps.append( ('feature_selector', feature_selector) ) - -## Set up estimator and add to pipeline. -estimator_json = params['model_validation_functions']['estimator_selector'] -estimator = get_estimator(estimator_json) - -pipeline_steps.append( ('estimator', estimator) ) - -pipeline = Pipeline(pipeline_steps) +## load pipeline +with open('$infile_pipeline', 'rb') as pipeline_handler: + pipeline = load_model(pipeline_handler) ## Set up validator, run pipeline through validator and return results. @@ -110,87 +106,75 @@ validator = getattr(model_selection, validator) selected_function = params['model_validation_functions']['selected_function'] -rval_type = params['model_validation_functions'].get('return_type', None) if selected_function == 'cross_validate': res = validator(pipeline, X, y, **options) - rval = res[rval_type] + rval = pd.DataFrame(res) + col_rename = {} + for col in rval.columns: + if col.endswith('_primary'): + col_rename[col] = col[:-7] + primary_scoring + rval.rename(inplace=True, columns=col_rename) +elif selected_function == 'cross_val_predict': + predicted = validator(pipeline, X, y, **options) + if len(predicted.shape) == 1: + rval = pd.DataFrame(predicted, columns=['Predicted']) + else: + rval = pd.DataFrame(predicted) elif selected_function == 'learning_curve': - options['train_sizes'] = eval(options['train_sizes']) + try: + train_sizes = safe_eval(options['train_sizes']) + except: + sys.exit("Unsupported train_sizes input! Supports int/float in tuple and array-like structure.") + if type(train_sizes) is tuple: + train_sizes = np.linspace(*train_sizes) + options['train_sizes'] = train_sizes train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options) - rval = eval(rval_type) + rval = pd.DataFrame(dict( + train_sizes_abs = train_sizes_abs, + mean_train_scores = np.mean(train_scores, axis=1), + std_train_scores = np.std(train_scores, axis=1), + mean_test_scores = np.mean(test_scores, axis=1), + std_test_scores = np.std(test_scores, axis=1))) + rval = rval[['train_sizes_abs', 'mean_train_scores', 'std_train_scores', + 'mean_test_scores', 'std_test_scores']] elif selected_function == 'permutation_test_score': score, permutation_scores, pvalue = validator(pipeline, X, y, **options) - rval = eval(rval_type) - if rval_type in ['score', 'pvalue']: - rval = [rval] -elif selected_function == 'validation_curve': - options['param_name'] = 'estimator__' + options['param_name'] - options['param_range'] = eval(options['param_range']) - train_scores, test_scores = validator(pipeline, X, y, **options) - rval = eval(rval_type) -else: - rval = validator(pipeline, X, y, **options) + permutation_scores_df = pd.DataFrame(dict( + permutation_scores = permutation_scores)) + score_df = pd.DataFrame(dict( + score = [score], + pvalue = [pvalue])) + rval = pd.concat([score_df[['score', 'pvalue']], permutation_scores_df], axis=1) -rval = pandas.DataFrame(rval) -rval.to_csv(path_or_buf='$outfile', sep='\t', header=False, index=False) +rval.to_csv(path_or_buf='$outfile', sep='\t', header=True, index=False) ]]> </configfile> </configfiles> <inputs> - <conditional name="pre_processing"> - <param name="do_pre_processing" type="select" label="Do pre_processing?"> - <option value="No" selected="true"/> - <option value="Yes"/> - </param> - <when value="No"/> - <when value="Yes"> - <conditional name="pre_processors"> - <expand macro="sparse_preprocessors_ext" /> - <expand macro="sparse_preprocessor_options_ext" /> - </conditional> - </when> - </conditional> - <conditional name="feature_selection"> - <param name="do_feature_selection" type="select" label="Do feature selection?"> - <option value="No" selected="true"/> - <option value="Yes"/> - </param> - <when value="No"/> - <when value="Yes"> - <expand macro="feature_selection_pipeline"/> - </when> - </conditional> + <param name="infile_pipeline" type="data" format="zip" label="Choose the dataset containing model/pipeline object"/> <conditional name="model_validation_functions"> <param name="selected_function" type="select" label="Select a model validation function"> <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option> <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option> - <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option> <option value="learning_curve">learning_curve - Learning curve</option> <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option> - <option value="validation_curve">validation_curve - Validation curve</option> + <option value="validation_curve">validation_curve - Use grid search with one parameter instead</option> </param> <when value="cross_validate"> - <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> - <!--groups--> + <expand macro="scoring_selection"/> <expand macro="model_validation_common_options"/> - <expand macro="scoring_selection"/> + <!--param argument="return_train_score" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" help="Whether to include train scores."/> --> + <!--param argument="return_estimator" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to return the estimators fitted on each split."/> --> + <!--param argument="error_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Raise fit error:" help="If false, the metric score is assigned to NaN if an error occurs in estimator fitting and FitFailedWarning is raised."/> --> <!--fit_params--> <expand macro="pre_dispatch"/> </section> - <param name="return_type" type="select" label="Select a return type"> - <option value="test_score" selected="true">test_score</option> - <option value="train_score">train_score</option> - <option value="fit_time">fit_time</option> - <option value="score_time">score_time</option> - </param> </when> <when value="cross_val_predict"> - <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> - <!--groups--> <expand macro="model_validation_common_options" /> <!--fit_params--> <expand macro="pre_dispatch" value="2*n_jobs’" help="Controls the number of jobs that get dispatched during parallel execution"/> @@ -200,64 +184,34 @@ </param> </section> </when> - <when value="cross_val_score"> - <expand macro="estimator_selector_all" /> + <when value="learning_curve"> <section name="options" title="Other Options" expanded="false"> - <!--groups--> + <expand macro="scoring_selection"/> <expand macro="model_validation_common_options"/> - <expand macro="scoring_selection"/> - <!--fit_params--> + <param argument="train_sizes" type="text" value="(0.1, 1.0, 5)" label="train_sizes" + help="Relative or absolute numbers of training examples that will be used to generate the learning curve. Supports 1) tuple, to be evaled by np.linspace, e.g. (0.1, 1.0, 5); 2) array-like, e.g. [0.1 , 0.325, 0.55 , 0.775, 1.]"> + <sanitizer> + <valid initial="default"> + <add value="["/> + <add value="]"/> + </valid> + </sanitizer> + </param> + <param argument="exploit_incremental_learning" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" help="Whether to apply incremental learning to speed up fitting of the estimator if supported"/> <expand macro="pre_dispatch"/> + <expand macro="shuffle" checked="false" label="shuffle" help="Whether to shuffle training data before taking prefixes"/> + <expand macro="random_state" help_text="If int, the seed used by the random number generator. Used when `shuffle` is True"/> </section> </when> - <when value="learning_curve"> - <expand macro="estimator_selector_all" /> + <when value="permutation_test_score"> <section name="options" title="Other Options" expanded="false"> - <!--groups--> - <expand macro="model_validation_common_options"/> - <param argument="train_sizes" type="text" value="np.linspace(0.1, 1.0, 5)" label="train_sizes" help="Relative or absolute numbers of training examples that will be used to generate the learning curve"/> <expand macro="scoring_selection"/> - <param argument="exploit_incremental_learning" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="exploit_incremental_learning" help="Whether to apply incremental learning to speed up fitting of the estimator if supported"/> - <expand macro="pre_dispatch"/> - <expand macro="shuffle" checked="false" label="shuffle" help="Whether to shuffle training data before taking prefixes"/> - <expand macro="random_state"/> - </section> - <param name="return_type" type="select" label="Select a return type"> - <option value="train_sizes_abs" selected="true">train_sizes_abs</option> - <option value="train_scores">train_scores</option> - <option value="test_scores">test_scores</option> - </param> - </when> - <when value="permutation_test_score"> - <expand macro="estimator_selector_all" /> - <section name="options" title="Other Options" expanded="false"> - <!--groups--> <expand macro="model_validation_common_options"/> - <expand macro="scoring_selection"/> <param name="n_permutations" type="integer" value="100" optional="true" label="n_permutations" help="Number of times to permute y"/> <expand macro="random_state"/> </section> - <param name="return_type" type="select" label="Select a return type"> - <option value="score" selected="true">score</option> - <option value="permutation_scores">permutation_scores</option> - <option value="pvalue">pvalue</option> - </param> </when> - <when value="validation_curve"> - <expand macro="estimator_selector_all" /> - <section name="options" title="Other Options" expanded="false"> - <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/> - <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/> - <!--groups--> - <expand macro="model_validation_common_options"/> - <expand macro="scoring_selection"/> - <expand macro="pre_dispatch"/> - </section> - <param name="return_type" type="select" label="Select a return type"> - <option value="train_scores" selected="true">train_scores</option> - <option value="test_scores">test_scores</option> - </param> - </when> + <when value="validation_curve"/> </conditional> <expand macro="sl_mixed_input"/> </inputs> @@ -266,70 +220,72 @@ </outputs> <tests> <test> + <param name="infile_pipeline" value="pipeline02"/> <param name="selected_function" value="cross_validate"/> - <param name="selected_module" value="linear_model"/> - <param name="selected_estimator" value="LassoCV"/> - <param name="infile1" value="regression_train.tabular" ftype="tabular"/> - <param name="col1" value="1,2,3,4,5"/> - <param name="infile2" value="regression_train.tabular" ftype="tabular"/> - <param name="col2" value="6"/> - <output name="outfile" file="mv_result01.tabular"/> - </test> - <test> - <param name="selected_function" value="cross_val_predict"/> - <param name="selected_module" value="linear_model"/> - <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> <param name="col2" value="6"/> - <output name="outfile" file="mv_result02.tabular"/> + <output name="outfile"> + <assert_contents> + <has_n_columns n="4"/> + <has_text text="0.9999961390418067"/> + <has_text text="0.9944541531269271"/> + <has_text text="0.9999193322454393"/> + </assert_contents> + </output> </test> <test> - <param name="selected_function" value="cross_val_score"/> - <param name="selected_module" value="linear_model"/> - <param name="selected_estimator" value="LassoCV"/> + <param name="infile_pipeline" value="pipeline02"/> + <param name="selected_function" value="cross_val_predict"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> <param name="col2" value="6"/> - <output name="outfile" file="mv_result03.tabular"/> + <output name="outfile" file="mv_result02.tabular" lines_diff="4"/> </test> <test> + <param name="infile_pipeline" value="pipeline05"/> <param name="selected_function" value="learning_curve"/> - <param name="selected_module" value="linear_model"/> - <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> <param name="header2" value="true" /> <param name="col2" value="1"/> - <output name="outfile" file="mv_result04.tabular"/> + <output name="outfile" file="mv_result03.tabular"/> </test> <test> + <param name="infile_pipeline" value="pipeline05"/> <param name="selected_function" value="permutation_test_score"/> - <param name="selected_module" value="linear_model"/> - <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> <param name="col2" value="6"/> - <output name="outfile" file="mv_result05.tabular"/> + <output name="outfile"> + <assert_contents> + <has_n_columns n="3"/> + <has_text text="0.25697059258228816"/> + </assert_contents> + </output> </test> <test> - <param name="selected_function" value="validation_curve"/> - <param name="selected_module" value="svm"/> - <param name="selected_estimator" value="SVC"/> - <param name="text_params" value="kernel='linear'"/> + <param name="infile_pipeline" value="pipeline05"/> + <param name="selected_function" value="cross_val_predict"/> + <section name="groups_selector"> + <param name="infile_groups" value="regression_y.tabular" ftype="tabular"/> + <param name="header_g" value="true"/> + <param name="selected_column_selector_option_g" value="by_index_number"/> + <param name="col_g" value="1"/> + </section> + <param name="selected_cv" value="GroupKFold"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> - <param name="header1" value="true" /> - <param name="selected_column_selector_option" value="all_columns"/> + <param name="header1" value="true"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> <param name="infile2" value="regression_y.tabular" ftype="tabular"/> - <param name="header2" value="true" /> + <param name="header2" value="true"/> <param name="col2" value="1"/> - <param name="return_type" value="test_scores"/> - <output name="outfile" file="mv_result06.tabular"/> + <output name="outfile" file="mv_result05.tabular"/> </test> </tests> <help>