Mercurial > repos > bgruening > sklearn_model_validation
diff model_validation.xml @ 9:c6b3efcba7bd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 76583c1fcd9d06a4679cc46ffaee44117b9e22cd
author | bgruening |
---|---|
date | Sat, 04 Aug 2018 12:35:35 -0400 |
parents | fd7a054ffdbd |
children | e4ab6b0bdf37 |
line wrap: on
line diff
--- a/model_validation.xml Fri Jul 13 03:56:45 2018 -0400 +++ b/model_validation.xml Sat Aug 04 12:35:35 2018 -0400 @@ -21,13 +21,14 @@ import ast import pickle import numpy as np -import sklearn.model_selection -from sklearn import svm, linear_model, ensemble, preprocessing +import sklearn.feature_selection +from sklearn import preprocessing, model_selection, svm, linear_model, ensemble, naive_bayes, tree, neighbors from sklearn.pipeline import Pipeline @COLUMNS_FUNCTION@ +@GET_ESTIMATOR_FUNCTION@ +@FEATURE_SELECTOR_FUNCTION@ -@FEATURE_SELECTOR_FUNCTION@ input_json_path = sys.argv[1] with open(input_json_path, "r") as param_handler: @@ -85,14 +86,12 @@ ## Set up feature selector and add to pipeline steps. if params['feature_selection']['do_feature_selection'] == 'Yes': - feature_selector = feature_selector(params['feature_selection']['feature_selection_algorithms']) + feature_selector = feature_selector(params['feature_selection']['fs_algorithm_selector']) pipeline_steps.append( ('feature_selector', feature_selector) ) ## Set up estimator and add to pipeline. -estimator=params["model_validation_functions"]["estimator"] -if params["model_validation_functions"]["extra_estimator"]["has_estimator"] == 'no': - estimator = params["model_validation_functions"]["extra_estimator"]["new_estimator"] -estimator = eval(estimator.replace('__dq__', '"').replace("__sq__","'")) +estimator_json = params["model_validation_functions"]['estimator_selector'] +estimator = get_estimator(estimator_json) pipeline_steps.append( ('estimator', estimator) ) @@ -101,7 +100,7 @@ ## Set up validator, run pipeline through validator and return results. validator = params["model_validation_functions"]["selected_function"] -validator = getattr(sklearn.model_selection, validator) +validator = getattr(model_selection, validator) selected_function = params["model_validation_functions"]["selected_function"] rval_type = params["model_validation_functions"].get("return_type", None) @@ -123,24 +122,11 @@ options['param_range'] = eval(options['param_range']) train_scores, test_scores = validator(pipeline, X, y, **options) rval = eval(rval_type) -elif selected_function == 'GridSearchCV': - param_grid = params["model_validation_functions"]["param_grid"].replace("__sq__","'")\ - .replace('__dq__','"').replace("__oc__", "{").replace("__cc__", "}")\ - .replace("__ob__", "[").replace("__cb__", "]") - param_grid = ast.literal_eval(param_grid) - grid = validator(pipeline, param_grid, **options) - grid.fit(X, y) - rval = getattr(grid, rval_type) - if rval_type in ["best_estimator_", "best_score_", "best_index_"]: - rval = [rval] else: rval = validator(pipeline, X, y, **options) rval = pandas.DataFrame(rval) -if rval_type and rval_type == "cv_results_": - rval.to_csv(path_or_buf="$outfile", sep='\t', header=True, index=False) -else: - rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False) +rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False) ]]> </configfile> @@ -166,12 +152,13 @@ </param> <when value="No"/> <when value="Yes"> - <expand macro="feature_selection_all"/> + <expand macro="feature_selection_all"> + <expand macro="fs_selectfrommodel_no_prefitted"/> + </expand> </when> </conditional> <conditional name="model_validation_functions"> <param name="selected_function" type="select" label="Select a model validation function"> - <option value="GridSearchCV">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option> <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option> <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option> <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option> @@ -179,28 +166,8 @@ <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option> <option value="validation_curve">validation_curve - Validation curve</option> </param> - <when value="GridSearchCV"> - <expand macro="estimator_input_no_fit" /> - <param argument="param_grid" type="text" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]" label="param_grid" help="Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored"/> - <section name="options" title="Other Options" expanded="false"> - <expand macro="scoring"/> - <expand macro="model_validation_common_options"/> - <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/> - <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="Data is identically distributed?"/> - <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/> - <!--error_score--> - <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/> - </section> - <param name="return_type" type="select" label="Select a return type"> - <option value="cv_results_" selected="true">cv_results_</option> - <option value="best_estimator_">best_estimator_</option> - <option value="best_score_">best_score_</option> - <option value="best_params_">best_params_</option> - <option value="best_index_">best_index_</option> - </param> - </when> <when value="cross_validate"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <!--groups--> <expand macro="model_validation_common_options"/> @@ -216,7 +183,7 @@ </param> </when> <when value="cross_val_predict"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <!--groups--> <expand macro="model_validation_common_options" /> @@ -229,7 +196,7 @@ </section> </when> <when value="cross_val_score"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <!--groups--> <expand macro="model_validation_common_options"/> @@ -239,7 +206,7 @@ </section> </when> <when value="learning_curve"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <!--groups--> <expand macro="model_validation_common_options"/> @@ -257,7 +224,7 @@ </param> </when> <when value="permutation_test_score"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <!--groups--> <expand macro="model_validation_common_options"/> @@ -272,7 +239,7 @@ </param> </when> <when value="validation_curve"> - <expand macro="estimator_input_no_fit" /> + <expand macro="estimator_selector_all" /> <section name="options" title="Other Options" expanded="false"> <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/> <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/> @@ -295,8 +262,8 @@ <tests> <test> <param name="selected_function" value="cross_validate"/> - <param name="estimator" value="linear_model.LassoCV()"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="linear_model"/> + <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> @@ -305,8 +272,8 @@ </test> <test> <param name="selected_function" value="cross_val_predict"/> - <param name="estimator" value="linear_model.LassoCV()"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="linear_model"/> + <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> @@ -315,8 +282,8 @@ </test> <test> <param name="selected_function" value="cross_val_score"/> - <param name="estimator" value="linear_model.LassoCV()"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="linear_model"/> + <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> @@ -325,8 +292,8 @@ </test> <test> <param name="selected_function" value="learning_curve"/> - <param name="estimator" value="linear_model.LassoCV()"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="linear_model"/> + <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> @@ -337,8 +304,8 @@ </test> <test> <param name="selected_function" value="permutation_test_score"/> - <param name="estimator" value="linear_model.LassoCV()"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="linear_model"/> + <param name="selected_estimator" value="LassoCV"/> <param name="infile1" value="regression_train.tabular" ftype="tabular"/> <param name="col1" value="1,2,3,4,5"/> <param name="infile2" value="regression_train.tabular" ftype="tabular"/> @@ -347,8 +314,9 @@ </test> <test> <param name="selected_function" value="validation_curve"/> - <param name="estimator" value="svm.SVC(kernel="linear")"/> - <param name="has_estimator" value="yes"/> + <param name="selected_module" value="svm"/> + <param name="selected_estimator" value="SVC"/> + <param name="text_params" value="'kernel': 'linear'"/> <param name="infile1" value="regression_X.tabular" ftype="tabular"/> <param name="header1" value="true" /> <param name="selected_column_selector_option" value="all_columns"/> @@ -358,79 +326,15 @@ <param name="return_type" value="test_scores"/> <output name="outfile" file="mv_result06.tabular"/> </test> - <test> - <param name="do_feature_selection" value="Yes"/> - <param name="selected_algorithm" value="SelectKBest"/> - <param name="score_func" value="chi2"/> - <param name="selected_function" value="GridSearchCV"/> - <param name="estimator" value="svm.SVR(kernel="linear")"/> - <param name="has_estimator" value="yes"/> - <param name="param_grid" value="[{'feature_selector__k': [3, 7], 'estimator__C': [1, 100]}]"/> - <param name="return_type" value="best_score_"/> - <param name="infile1" value="regression_X.tabular" ftype="tabular"/> - <param name="header1" value="true" /> - <param name="selected_column_selector_option" value="all_columns"/> - <param name="infile2" value="regression_y.tabular" ftype="tabular"/> - <param name="header2" value="true" /> - <param name="selected_column_selector_option2" value="all_columns"/> - <output name="outfile" > - <assert_contents> - <has_line line="0.7824428015300172" /> - </assert_contents> - </output> - </test> - <test> - <param name="do_pre_processing" value="Yes"/> - <param name="selected_pre_processor" value="RobustScaler"/> - <param name="do_feature_selection" value="Yes"/> - <param name="selected_algorithm" value="SelectKBest"/> - <param name="score_func" value="f_classif"/> - <param name="selected_function" value="GridSearchCV"/> - <param name="estimator" value="svm.SVR(kernel="linear")"/> - <param name="has_estimator" value="yes"/> - <param name="param_grid" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]"/> - <param name="return_type" value="best_score_"/> - <param name="infile1" value="regression_X.tabular" ftype="tabular"/> - <param name="header1" value="true" /> - <param name="selected_column_selector_option" value="all_columns"/> - <param name="infile2" value="regression_y.tabular" ftype="tabular"/> - <param name="header2" value="true" /> - <param name="selected_column_selector_option2" value="all_columns"/> - <output name="outfile" > - <assert_contents> - <has_line line="0.7938837807353147" /> - </assert_contents> - </output> - </test> - <test> - <param name="do_pre_processing" value="Yes"/> - <param name="selected_pre_processor" value="RobustScaler"/> - <param name="selected_function" value="GridSearchCV"/> - <param name="estimator" value="svm.SVR(kernel="linear")"/> - <param name="has_estimator" value="yes"/> - <param name="param_grid" value="[{'estimator__C': [1, 10, 100, 1000]}]"/> - <param name="return_type" value="best_score_"/> - <param name="infile1" value="regression_X.tabular" ftype="tabular"/> - <param name="header1" value="true" /> - <param name="selected_column_selector_option" value="all_columns"/> - <param name="infile2" value="regression_y.tabular" ftype="tabular"/> - <param name="header2" value="true" /> - <param name="selected_column_selector_option2" value="all_columns"/> - <output name="outfile" > - <assert_contents> - <has_line line="0.7904476204861263" /> - </assert_contents> - </output> - </test> </tests> <help> <![CDATA[ **What it does** This tool includes model validation functions to evaluate estimator performance in the cross-validation approach. This tool is based on sklearn.model_selection package. -For information about classification metric functions and their parameter settings please refer to `Scikit-learn classification metrics`_. +For information about model validation functions and their parameter settings please refer to `Scikit-learn model_selection`_. -.. _`Scikit-learn classification metrics`: http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics +.. _`Scikit-learn model_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection ]]> </help> <expand macro="sklearn_citation"/>