Mercurial > repos > bgruening > sklearn_model_validation
view model_validation.xml @ 13:badd86b9ce24 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 8cf3d813ec755166ee0bd517b4ecbbd4f84d4df1
author | bgruening |
---|---|
date | Thu, 23 Aug 2018 16:21:01 -0400 |
parents | 2c1851992069 |
children | e244d6f2df1a |
line wrap: on
line source
<tool id="sklearn_model_validation" name="Model Validation" version="@VERSION@">
    <!--
        Galaxy wrapper around the sklearn.model_selection validation functions
        (cross_validate, cross_val_predict, cross_val_score, learning_curve,
        permutation_test_score, validation_curve).

        The embedded script reads the tool parameters from the JSON file that
        Galaxy generates for the "inputs" configfile, builds an sklearn Pipeline
        (optional pre-processor, optional feature selector, estimator), runs the
        selected validation function on it and writes the selected result to
        the tabular output without header or index.
    -->
    <description>evaluates estimator performance by cross-validation</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <command>
        <![CDATA[
        python "$sklearn_model_validation_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <inputs name="inputs" />
        <!-- The configfile body is a Cheetah template: lines starting with a double hash are template comments and are not emitted into the generated script. -->
        <configfile name="sklearn_model_validation_script">
            <![CDATA[
import sys
import os
import json
import pandas
import numpy as np
from sklearn import preprocessing, model_selection, svm, linear_model, ensemble, naive_bayes, tree, neighbors
from sklearn.pipeline import Pipeline

## Helper names used below (read_columns, mmread, get_cv, get_scoring,
## get_estimator, feature_selector, SafeEval, N_JOBS) are not defined in this
## script; they are expected to come from utils.py.
execfile("$__tool_directory__/utils.py")

## NOTE(review): safe_eval is instantiated but never referenced below; kept so
## that loading behaviour is unchanged.
safe_eval = SafeEval()

input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

## Load the feature matrix X: selected columns of a tabular file, or a matrix
## read with mmread for any other input type.
input_type = params["input_options"]["selected_input"]
if input_type=="tabular":
    header = 'infer' if params["input_options"]["header1"] else None
    column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
        c = params["input_options"]["column_selector_options_1"]["col1"]
    else:
        c = None
    X = read_columns(
            "$input_options.infile1",
            c = c,
            c_option = column_option,
            sep='\t',
            header=header,
            parse_dates=True
    )
else:
    X = mmread("$input_options.infile1")

## Load the target vector y; it is always read from a tabular file.
header = 'infer' if params["input_options"]["header2"] else None
column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
    c = params["input_options"]["column_selector_options_2"]["col2"]
else:
    c = None
y = read_columns(
        "$input_options.infile2",
        c = c,
        c_option = column_option,
        sep='\t',
        header=header,
        parse_dates=True
)
y=y.ravel()

## Turn the raw option values into the keyword arguments that are passed to
## the validation function.
options = params["model_validation_functions"]["options"]
options['cv'] = get_cv( options['cv'] )
options['n_jobs'] = N_JOBS
if 'scoring' in options:
    options['scoring'] = get_scoring(options['scoring'])
## An empty pre_dispatch text field is passed on as None.
if options.get('pre_dispatch') == '':
    options['pre_dispatch'] = None

pipeline_steps = []

## Set up pre_processor and add to pipeline steps.
if params['pre_processing']['do_pre_processing'] == 'Yes':
    preprocessor = params["pre_processing"]["pre_processors"]["selected_pre_processor"]
    pre_processor_options = params["pre_processing"]["pre_processors"]["options"]
    my_class = getattr(preprocessing, preprocessor)
    pipeline_steps.append( ('pre_processor', my_class(**pre_processor_options)) )

## Set up feature selector and add to pipeline steps.
## The result is bound to its own name so the feature_selector helper is not
## overwritten by its return value.
if params['feature_selection']['do_feature_selection'] == 'Yes':
    fs_step = feature_selector(params['feature_selection']['fs_algorithm_selector'])
    pipeline_steps.append( ('feature_selector', fs_step) )

## Set up estimator and add to pipeline.
estimator_json = params["model_validation_functions"]['estimator_selector']
estimator = get_estimator(estimator_json)

pipeline_steps.append( ('estimator', estimator) )

pipeline = Pipeline(pipeline_steps)

## Set up validator, run pipeline through validator and return results.
selected_function = params["model_validation_functions"]["selected_function"]
validator = getattr(model_selection, selected_function)
## return_type is only defined for some functions, hence the None fallback.
rval_type = params["model_validation_functions"].get("return_type", None)

if selected_function == 'cross_validate':
    res = validator(pipeline, X, y, **options)
    rval = res[rval_type]
elif selected_function == 'learning_curve':
    ## NOTE(review): eval on a free-text tool parameter executes arbitrary
    ## Python; restrict or sanitize this input.
    options['train_sizes'] = eval(options['train_sizes'])
    train_sizes_abs, train_scores, test_scores = validator(pipeline, X, y, **options)
    ## Explicit lookup instead of eval(rval_type): same result for every
    ## declared return type, and no code execution.
    rval = {
        'train_sizes_abs': train_sizes_abs,
        'train_scores': train_scores,
        'test_scores': test_scores,
    }[rval_type]
elif selected_function == 'permutation_test_score':
    score, permutation_scores, pvalue = validator(pipeline, X, y, **options)
    rval = {
        'score': score,
        'permutation_scores': permutation_scores,
        'pvalue': pvalue,
    }[rval_type]
    ## score and pvalue are wrapped in a list before the DataFrame conversion
    ## below.
    if rval_type in ["score", "pvalue"]:
        rval = [rval]
elif selected_function == 'validation_curve':
    ## The varied parameter belongs to the pipeline step named estimator.
    options['param_name'] = 'estimator__' + options['param_name']
    ## NOTE(review): eval on a free-text tool parameter executes arbitrary
    ## Python; restrict or sanitize this input.
    options['param_range'] = eval(options['param_range'])
    train_scores, test_scores = validator(pipeline, X, y, **options)
    rval = {
        'train_scores': train_scores,
        'test_scores': test_scores,
    }[rval_type]
else:
    ## cross_val_predict and cross_val_score: the return value is written as is.
    rval = validator(pipeline, X, y, **options)

rval = pandas.DataFrame(rval)
rval.to_csv(path_or_buf="$outfile", sep='\t', header=False, index=False)

            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <conditional name="pre_processing">
            <param name="do_pre_processing" type="select" label="Do pre_processing?">
                <option value="No" selected="true"/>
                <option value="Yes"/>
            </param>
            <when value="No"/>
            <when value="Yes">
                <conditional name="pre_processors">
                    <expand macro="sparse_preprocessors_ext" />
                    <expand macro="sparse_preprocessor_options_ext" />
                </conditional>
            </when>
        </conditional>
        <conditional name="feature_selection">
            <param name="do_feature_selection" type="select" label="Do feature selection?">
                <option value="No" selected="true"/>
                <option value="Yes"/>
            </param>
            <when value="No"/>
            <when value="Yes">
                <expand macro="feature_selection_all">
                    <expand macro="fs_selectfrommodel_no_prefitted"/>
                </expand>
            </when>
        </conditional>
        <!-- Each "when" branch exposes the options of one sklearn.model_selection function; the script passes the "options" section to that function as keyword arguments. -->
        <conditional name="model_validation_functions">
            <param name="selected_function" type="select" label="Select a model validation function">
                <option value="cross_validate">cross_validate - Evaluate metric(s) by cross-validation and also record fit/score times</option>
                <option value="cross_val_predict">cross_val_predict - Generate cross-validated estimates for each input data point</option>
                <option value="cross_val_score">cross_val_score - Evaluate a score by cross-validation</option>
                <option value="learning_curve">learning_curve - Learning curve</option>
                <option value="permutation_test_score">permutation_test_score - Evaluate the significance of a cross-validated score with permutations</option>
                <option value="validation_curve">validation_curve - Validation curve</option>
            </param>
            <when value="cross_validate">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <!--groups-->
                    <expand macro="model_validation_common_options"/>
                    <expand macro="scoring_selection"/>
                    <!--fit_params-->
                    <expand macro="pre_dispatch"/>
                </section>
                <param name="return_type" type="select" label="Select a return type">
                    <option value="test_score" selected="true">test_score</option>
                    <option value="train_score">train_score</option>
                    <option value="fit_time">fit_time</option>
                    <option value="score_time">score_time</option>
                </param>
            </when>
            <when value="cross_val_predict">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <!--groups-->
                    <expand macro="model_validation_common_options" />
                    <!--fit_params-->
                    <expand macro="pre_dispatch" value="2*n_jobs" help="Controls the number of jobs that get dispatched during parallel execution"/>
                    <param argument="method" type="select" label="Invokes the passed method name of the passed estimator">
                        <option value="predict" selected="true">predict</option>
                        <option value="predict_proba">predict_proba</option>
                    </param>
                </section>
            </when>
            <when value="cross_val_score">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <!--groups-->
                    <expand macro="model_validation_common_options"/>
                    <expand macro="scoring_selection"/>
                    <!--fit_params-->
                    <expand macro="pre_dispatch"/>
                </section>
            </when>
            <when value="learning_curve">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <!--groups-->
                    <expand macro="model_validation_common_options"/>
                    <param argument="train_sizes" type="text" value="np.linspace(0.1, 1.0, 5)" label="train_sizes" help="Relative or absolute numbers of training examples that will be used to generate the learning curve"/>
                    <expand macro="scoring_selection"/>
                    <param argument="exploit_incremental_learning" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="false" label="exploit_incremental_learning" help="Whether to apply incremental learning to speed up fitting of the estimator if supported"/>
                    <expand macro="pre_dispatch"/>
                    <expand macro="shuffle" checked="false" label="shuffle" help="Whether to shuffle training data before taking prefixes"/>
                    <expand macro="random_state"/>
                </section>
                <param name="return_type" type="select" label="Select a return type">
                    <option value="train_sizes_abs" selected="true">train_sizes_abs</option>
                    <option value="train_scores">train_scores</option>
                    <option value="test_scores">test_scores</option>
                </param>
            </when>
            <when value="permutation_test_score">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <!--groups-->
                    <expand macro="model_validation_common_options"/>
                    <expand macro="scoring_selection"/>
                    <param name="n_permutations" type="integer" value="100" optional="true" label="n_permutations" help="Number of times to permute y"/>
                    <expand macro="random_state"/>
                </section>
                <param name="return_type" type="select" label="Select a return type">
                    <option value="score" selected="true">score</option>
                    <option value="permutation_scores">permutation_scores</option>
                    <option value="pvalue">pvalue</option>
                </param>
            </when>
            <when value="validation_curve">
                <expand macro="estimator_selector_all" />
                <section name="options" title="Other Options" expanded="false">
                    <param name="param_name" type="text" value="gamma" label="param_name" help="Name of the parameter that will be varied"/>
                    <param name="param_range" type="text" value="np.logspace(-6, -1, 5)" label="param_range" help="The values of the parameter that will be evaluated."/>
                    <!--groups-->
                    <expand macro="model_validation_common_options"/>
                    <expand macro="scoring_selection"/>
                    <expand macro="pre_dispatch"/>
                </section>
                <param name="return_type" type="select" label="Select a return type">
                    <option value="train_scores" selected="true">train_scores</option>
                    <option value="test_scores">test_scores</option>
                </param>
            </when>
        </conditional>
        <expand macro="sl_mixed_input"/>
    </inputs>
    <outputs>
        <data format="tabular" name="outfile"/>
    </outputs>
    <!-- One test per validation function. -->
    <tests>
        <test>
            <param name="selected_function" value="cross_validate"/>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile" file="mv_result01.tabular"/>
        </test>
        <test>
            <param name="selected_function" value="cross_val_predict"/>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile" file="mv_result02.tabular"/>
        </test>
        <test>
            <param name="selected_function" value="cross_val_score"/>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile" file="mv_result03.tabular"/>
        </test>
        <test>
            <param name="selected_function" value="learning_curve"/>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true" />
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true" />
            <param name="col2" value="1"/>
            <output name="outfile" file="mv_result04.tabular"/>
        </test>
        <test>
            <param name="selected_function" value="permutation_test_score"/>
            <param name="selected_module" value="linear_model"/>
            <param name="selected_estimator" value="LassoCV"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <output name="outfile" file="mv_result05.tabular"/>
        </test>
        <test>
            <param name="selected_function" value="validation_curve"/>
            <param name="selected_module" value="svm"/>
            <param name="selected_estimator" value="SVC"/>
            <param name="text_params" value="kernel='linear'"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="true" />
            <param name="selected_column_selector_option" value="all_columns"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="header2" value="true" />
            <param name="col2" value="1"/>
            <param name="return_type" value="test_scores"/>
            <output name="outfile" file="mv_result06.tabular"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool includes model validation functions to evaluate estimator performance in the cross-validation approach. This tool is based on
sklearn.model_selection package.
For information about model validation functions and their parameter settings please refer to `Scikit-learn model_selection`_.

.. _`Scikit-learn model_selection`: http://scikit-learn.org/stable/modules/classes.html#module-sklearn.model_selection
        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
    </expand>
</tool>