diff model_validation.xml @ 9:c6b3efcba7bd draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 76583c1fcd9d06a4679cc46ffaee44117b9e22cd
author bgruening
date Sat, 04 Aug 2018 12:35:35 -0400
parents fd7a054ffdbd
children e4ab6b0bdf37
line wrap: on
line diff
--- a/model_validation.xml	Fri Jul 13 03:56:45 2018 -0400
+++ b/model_validation.xml	Sat Aug 04 12:35:35 2018 -0400
@@ -21,13 +21,14 @@
 import ast
 import pickle
 import numpy as np
-import sklearn.model_selection
-from sklearn import svm, linear_model, ensemble, preprocessing
+import sklearn.feature_selection
+from sklearn import preprocessing, model_selection, svm, linear_model, ensemble, naive_bayes, tree, neighbors
 from sklearn.pipeline import Pipeline
 
 @COLUMNS_FUNCTION@
+@GET_ESTIMATOR_FUNCTION@
+@FEATURE_SELECTOR_FUNCTION@
 
-@FEATURE_SELECTOR_FUNCTION@
 
 input_json_path = sys.argv[1]
 with open(input_json_path, "r") as param_handler:
@@ -85,14 +86,12 @@
 
 ## Set up feature selector and add to pipeline steps.
 if params['feature_selection']['do_feature_selection'] == 'Yes':
-    feature_selector = feature_selector(params['feature_selection']['feature_selection_algorithms'])
+    feature_selector = feature_selector(params['feature_selection']['fs_algorithm_selector'])
     pipeline_steps.append( ('feature_selector', feature_selector) )
 
 ## Set up estimator and add to pipeline.
-estimator=params["model_validation_functions"]["estimator"]
-if params["model_validation_functions"]["extra_estimator"]["has_estimator"] == 'no':
-    estimator = params["model_validation_functions"]["extra_estimator"]["new_estimator"]
-estimator = eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
+estimator_json = params["model_validation_functions"]['estimator_selector']
+estimator = get_estimator(estimator_json)
 
 pipeline_steps.append( ('estimator', estimator) )
 
@@ -101,7 +100,7 @@
 ## Set up validator, run pipeline through validator and return results.
 
 validator = params["model_validation_functions"]["selected_function"]
-validator = getattr(sklearn.model_selection, validator)
+validator = getattr(model_selection, validator)
 
 selected_function = params["model_validation_functions"]["selected_function"]
 rval_type = params["model_validation_functions"].get("return_type", None)
@@ -123,24 +122,11 @@
     options['param_range'] = eval(options['param_range'])
     train_scores, test_scores = validator(pipeline, X, y, **options)
     rval = eval(rval_type)
-elif selected_function == 'GridSearchCV':
-    param_grid = params["model_validation_functions"]["param_grid"].replace("__sq__","'")\
-        .replace('__dq__','"').replace("__oc__", "{").replace("__cc__", "}")\
-        .replace("__ob__", "[").replace("__cb__", "]")
-    param_grid = ast.literal_eval(param_grid)
-    grid = validator(pipeline, param_grid, **options)
-    grid.fit(X, y)
-    rval = getattr(grid, rval_type)
-    if rval_type in ["best_estimator_", "best_score_", "best_index_"]:
-        rval = [rval]     
 else:
     rval = validator(pipeline, X, y, **options)
 
 rval = pandas.DataFrame(rval)
-if rval_type and rval_type == "cv_results_":
-    rval.to_csv(path_or_buf="$outfile", sep='\t', header=True, index=False)
-else:
-    rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
+rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)
 
             ]]>
         </configfile>
@@ -166,12 +152,13 @@
             </param>
             <when value="No"/>
             <when value="Yes">
-                <expand macro="feature_selection_all"/>
+                <expand macro="feature_selection_all">
+                    <expand macro="fs_selectfrommodel_no_prefitted"/>
+                </expand>
             </when>
         </conditional>
         <conditional name="model_validation_functions">
             <param name="selected_function" type="select" label="Select a model validation function">
-                <option value="GridSearchCV">GridSearchCV - Exhaustive search over specified parameter values for an estimator </option>
                 <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
                 <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option>
                 <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option>
@@ -179,28 +166,8 @@
                 <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option>
                 <option value="validation_curve">validation_curve - Validation curve</option>
             </param>
-            <when value="GridSearchCV">
-                <expand macro="estimator_input_no_fit" />
-                <param argument="param_grid" type="text" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]" label="param_grid" help="Dictionary with parameters names (string) as keys and lists of parameter settings to try as values, or a list of such dictionaries, in which case the grids spanned by each dictionary in the list are explored"/>
-                <section name="options" title="Other Options" expanded="false">
-                    <expand macro="scoring"/>
-                    <expand macro="model_validation_common_options"/>
-                    <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
-                    <param argument="iid" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="iid" help="Data is identically distributed?"/>
-                    <param argument="refit" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="refit" help="Refit an estimator using the best found parameters on the whole dataset."/>
-                    <!--error_score-->
-                    <param argument="return_train_score" type="boolean" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="return_train_score" help=""/>
-                </section>
-                <param name="return_type" type="select" label="Select a return type">
-                    <option value="cv_results_" selected="true">cv_results_</option>
-                    <option value="best_estimator_">best_estimator_</option>
-                    <option value="best_score_">best_score_</option>
-                    <option value="best_params_">best_params_</option>
-                    <option value="best_index_">best_index_</option>
-                </param>
-            </when>
             <when value="cross_validate">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -216,7 +183,7 @@
                 </param>
             </when>
             <when value="cross_val_predict">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options" />
@@ -229,7 +196,7 @@
                 </section>
             </when>
             <when value="cross_val_score">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -239,7 +206,7 @@
                 </section>
             </when>
             <when value="learning_curve">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -257,7 +224,7 @@
                 </param>
             </when>
             <when value="permutation_test_score">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <!--groups-->
                     <expand macro="model_validation_common_options"/>
@@ -272,7 +239,7 @@
                 </param>
             </when>
             <when value="validation_curve">
-                <expand macro="estimator_input_no_fit" />
+                <expand macro="estimator_selector_all" />
                 <section name="options" title="Other Options" expanded="false">
                     <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/>
                     <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/>
@@ -295,8 +262,8 @@
     <tests>
         <test>
             <param name="selected_function" value="cross_validate"/>
-            <param name="estimator" value="linear_model.LassoCV()"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
             <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
@@ -305,8 +272,8 @@
         </test>
         <test>
             <param name="selected_function" value="cross_val_predict"/>
-            <param name="estimator" value="linear_model.LassoCV()"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
             <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
@@ -315,8 +282,8 @@
         </test>
         <test>
             <param name="selected_function" value="cross_val_score"/>
-            <param name="estimator" value="linear_model.LassoCV()"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
             <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
@@ -325,8 +292,8 @@
         </test>
         <test>
             <param name="selected_function" value="learning_curve"/>
-            <param name="estimator" value="linear_model.LassoCV()"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
             <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
             <param name="header1" value="true" />
             <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
@@ -337,8 +304,8 @@
         </test>
         <test>
             <param name="selected_function" value="permutation_test_score"/>
-            <param name="estimator" value="linear_model.LassoCV()"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="linear_model"/>
+            <param name="selected_estimator" value="LassoCV"/>
             <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
             <param name="col1" value="1,2,3,4,5"/>
             <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
@@ -347,8 +314,9 @@
         </test>
         <test>
             <param name="selected_function" value="validation_curve"/>
-            <param name="estimator" value="svm.SVC(kernel=&quot;linear&quot;)"/>
-            <param name="has_estimator" value="yes"/>
+            <param name="selected_module" value="svm"/>
+            <param name="selected_estimator" value="SVC"/>
+            <param name="text_params" value="'kernel': 'linear'"/>
             <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
             <param name="header1" value="true" />
             <param name="selected_column_selector_option" value="all_columns"/>
@@ -358,79 +326,15 @@
             <param name="return_type" value="test_scores"/>
             <output name="outfile" file="mv_result06.tabular"/>
         </test>
-        <test>
-            <param name="do_feature_selection" value="Yes"/>
-            <param name="selected_algorithm" value="SelectKBest"/>
-            <param name="score_func" value="chi2"/>
-            <param name="selected_function" value="GridSearchCV"/>
-            <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
-            <param name="has_estimator" value="yes"/>
-            <param name="param_grid" value="[{'feature_selector__k': [3, 7], 'estimator__C': [1, 100]}]"/>
-            <param name="return_type" value="best_score_"/>
-            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header1" value="true" />
-            <param name="selected_column_selector_option" value="all_columns"/>
-            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
-            <param name="header2" value="true" />
-            <param name="selected_column_selector_option2" value="all_columns"/>
-            <output name="outfile" >
-                <assert_contents>
-                    <has_line line="0.7824428015300172" />
-                </assert_contents>
-            </output>
-        </test>
-        <test>
-            <param name="do_pre_processing" value="Yes"/>
-            <param name="selected_pre_processor" value="RobustScaler"/>
-            <param name="do_feature_selection" value="Yes"/>
-            <param name="selected_algorithm" value="SelectKBest"/>
-            <param name="score_func" value="f_classif"/>
-            <param name="selected_function" value="GridSearchCV"/>
-            <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
-            <param name="has_estimator" value="yes"/>
-            <param name="param_grid" value="[{'feature_selector__k': [3, 5, 7, 9], 'estimator__C': [1, 10, 100, 1000]}]"/>
-            <param name="return_type" value="best_score_"/>
-            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header1" value="true" />
-            <param name="selected_column_selector_option" value="all_columns"/>
-            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
-            <param name="header2" value="true" />
-            <param name="selected_column_selector_option2" value="all_columns"/>
-            <output name="outfile" >
-                <assert_contents>
-                    <has_line line="0.7938837807353147" />
-                </assert_contents>
-            </output>
-        </test>
-         <test>
-            <param name="do_pre_processing" value="Yes"/>
-            <param name="selected_pre_processor" value="RobustScaler"/>
-            <param name="selected_function" value="GridSearchCV"/>
-            <param name="estimator" value="svm.SVR(kernel=&quot;linear&quot;)"/>
-            <param name="has_estimator" value="yes"/>
-            <param name="param_grid" value="[{'estimator__C': [1, 10, 100, 1000]}]"/>
-            <param name="return_type" value="best_score_"/>
-            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
-            <param name="header1" value="true" />
-            <param name="selected_column_selector_option" value="all_columns"/>
-            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
-            <param name="header2" value="true" />
-            <param name="selected_column_selector_option2" value="all_columns"/>
-            <output name="outfile" >
-                <assert_contents>
-                    <has_line line="0.7904476204861263" />
-                </assert_contents>
-            </output>
-        </test>
     </tests>
     <help>
         <![CDATA[
 **What it does**
 This tool includes model validation functions to evaluate estimator performance in the cross-validation approach. This tool is based on
 sklearn.model_selection package.
-For information about classification metric functions and their parameter settings please refer to `Scikit-learn classification metrics`_.
+For information about model validation functions and their parameter settings please refer to `Scikit-learn model_selection`_.
 
-.. _`Scikit-learn classification metrics`: http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
+.. _`Scikit-learn model_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
         ]]>
     </help>
     <expand macro="sklearn_citation"/>