Mercurial > repos > bgruening > sklearn_feature_selection
diff feature_selection.xml @ 0:092199a095dd draft
planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 7a31960686122d7e53054fef4996525f04ebd254
author | bgruening |
---|---|
date | Thu, 12 Apr 2018 08:23:30 -0400 |
parents | |
children | 2eb90e73f0d5 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/feature_selection.xml Thu Apr 12 08:23:30 2018 -0400 @@ -0,0 +1,359 @@ +<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@"> + <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description> + <macros> + <import>main_macros.xml</import> + </macros> + <expand macro="python_requirements"/> + <expand macro="macro_stdio"/> + <version_command>echo "@VERSION@"</version_command> + <command> + <![CDATA[ + python "$feature_selection_script" '$inputs' + ]]> + </command> + <configfiles> + <inputs name="inputs" /> + <configfile name="feature_selection_script"> + <![CDATA[ +import sys +import json +import pandas +import pickle +import numpy as np +import sklearn.feature_selection +from sklearn import svm, linear_model, ensemble + +@COLUMNS_FUNCTION@ + +input_json_path = sys.argv[1] +params = json.load(open(input_json_path, "r")) + +input_type = params["input_options"]["selected_input"] +if input_type=="tabular": + header = 'infer' if params["input_options"]["header1"] else None + X = read_columns( + "$input_options.infile1", + "$input_options.col1", + sep='\t', + header=header, + parse_dates=True + ) +else: + X = mmread(open("$input_options.infile1", 'r')) + +header = 'infer' if params["input_options"]["header2"] else None +y = read_columns( + "$input_options.infile2", + "$input_options.col2", + sep='\t', + header=header, + parse_dates=True +) +y=y.ravel() + +selector = params["feature_selection_algorithms"]["selected_algorithm"] +selector = getattr(sklearn.feature_selection, selector) +options = params["feature_selection_algorithms"]["options"] + +#if $feature_selection_algorithms.selected_algorithm == 'SelectFromModel': +if not options['threshold'] or options['threshold'] == 'None': + options['threshold'] = None +#if $feature_selection_algorithms.extra_estimator.has_estimator == 'no_load': +fitted_estimator = pickle.load(open("$feature_selection_algorithms.extra_estimator.fitted_estimator", 'r')) +new_selector = selector(fitted_estimator, prefit=True, **options) +#else: +estimator=params["feature_selection_algorithms"]["estimator"] +if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': + estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] +estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) +new_selector = selector(estimator, **options) +new_selector.fit(X, y) +#end if + +#elif $feature_selection_algorithms.selected_algorithm in ['RFE', 'RFECV']: +if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'): + options['scoring'] = None +estimator=params["feature_selection_algorithms"]["estimator"] +if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': + estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] +estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) +new_selector = selector(estimator, **options) +new_selector.fit(X, y) + +#elif $feature_selection_algorithms.selected_algorithm == "VarianceThreshold": +new_selector = selector(**options) +new_selector.fit(X, y) + +#else: +score_func = params["feature_selection_algorithms"]["score_func"] +score_func = getattr(sklearn.feature_selection, score_func) +new_selector = selector(score_func, **options) +new_selector.fit(X, y) +#end if + +#if $select_methods.selected_method == "fit_transform": +res = new_selector.transform(X) + +#else: +res = new_selector.get_support(params["select_methods"]["indices"]) +#end if + +res = pandas.DataFrame(res) +res.to_csv(path_or_buf="$outfile", sep='\t', index=False) + + + ]]> + </configfile> + </configfiles> + <inputs> + <conditional name="feature_selection_algorithms"> + <param name="selected_algorithm" type="select" label="Select a feature selection algorithm"> + <option value="SelectFromModel" selected="true">SelectFromModel - Meta-transformer for selecting features based on importance weights</option> + <option value="GenericUnivariateSelect" selected="true">GenericUnivariateSelect - Univariate feature selector with configurable strategy</option> + <option value="SelectPercentile">SelectPercentile - Select features according to a percentile of the highest scores</option> + <option value="SelectKBest">SelectKBest - Select features according to the k highest scores</option> + <option value="SelectFpr">SelectFpr - Filter: Select the p-values below alpha based on a FPR test</option> + <option value="SelectFdr">SelectFdr - Filter: Select the p-values for an estimated false discovery rate</option> + <option value="SelectFwe">SelectFwe - Filter: Select the p-values corresponding to Family-wise error rate</option> + <option value="RFE">RFE - Feature ranking with recursive feature elimination</option> + <option value="RFECV">RFECV - Feature ranking with recursive feature elimination and cross-validated selection of the best number of features</option> + <option value="VarianceThreshold">VarianceThreshold - Feature selector that removes all low-variance features</option> + <!--option value="chi2">Compute chi-squared stats between each non-negative feature and class</option--> + <!--option value="f_classif">Compute the ANOVA F-value for the provided sample</option--> + <!--option value="f_regression">Univariate linear regression tests</option--> + <!--option value="mutual_info_classif">Estimate mutual information for a discrete target variable</option--> + <!--option value="mutual_info_regression">Estimate mutual information for a continuous target variable</option--> + </param> + <when value="SelectFromModel"> + <expand macro="feature_selection_estimator" /> + <conditional name="extra_estimator"> + <expand macro="feature_selection_extra_estimator" > + <option value="no_load">No, I will load a prefitted estimator</option> + </expand> + <expand macro="feature_selection_estimator_choices" > + <when value="no_load"> + <param name="fitted_estimator" type="data" format='zip' label="Load a prefitted estimator" /> + </when> + </expand> + </conditional> + <section name="options" title="Other Options" expanded="True"> + <param argument="threshold" type="text" value="" optional="true" label="threshold" help="The threshold value to use for feature selection. e.g. 'mean', 'median', '1.25*mean'." /> + <param argument="norm_order" type="integer" value="1" label="norm_order" help="Order of the norm used to filter the vectors of coefficients below threshold in the case where the coef_ attribute of the estimator is of dimension 2. " /> + </section> + </when> + <when value="GenericUnivariateSelect"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="mode" type="select" label="Feature selection mode"> + <option value="percentile">percentile</option> + <option value="k_best">k_best</option> + <option value="fpr">fpr</option> + <option value="fdr">fdr</option> + <option value="fwe">fwe</option> + </param> + <param argument="param" type="float" value="" optional="true" label="Parameter of the corresponding mode" help="float or int depending on the feature selection mode" /> + </section> + </when> + <when value="SelectPercentile"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="percentile" type="integer" value="10" optional="True" label="Percent of features to keep" /> + </section> + </when> + <when value="SelectKBest"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="k" type="integer" value="10" optional="True" label="Number of top features to select" help="No 'all' option is supported." /> + </section> + </when> + <when value="SelectFpr"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest p-value for features to be kept."/> + </section> + </when> + <when value="SelectFdr"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/> + </section> + </when> + <when value="SelectFwe"> + <expand macro="feature_selection_score_function" /> + <section name="options" title="Other Options" expanded="True"> + <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/> + </section> + </when> + <when value="RFE"> + <expand macro="feature_selection_estimator" /> + <conditional name="extra_estimator"> + <expand macro="feature_selection_extra_estimator" /> + <expand macro="feature_selection_estimator_choices" /> + </conditional> + <section name="options" title="Other Options" expanded="True"> + <param argument="n_features_to_select" type="integer" value="" optional="true" label="n_features_to_select" help="The number of features to select. If None, half of the features are selected." /> + <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " /> + <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." /> + </section> + </when> + <when value="RFECV"> + <expand macro="feature_selection_estimator" /> + <conditional name="extra_estimator"> + <expand macro="feature_selection_extra_estimator" /> + <expand macro="feature_selection_estimator_choices" /> + </conditional> + <section name="options" title="Other Options" expanded="True"> + <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " /> + <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" /> + <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/> + <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." /> + <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/> + </section> + </when> + <when value="VarianceThreshold"> + <section name="options" title="Options" expanded="True"> + <param argument="threshold" type="float" value="" optional="True" label="Threshold" help="Features with a training-set variance lower than this threshold will be removed."/> + </section> + </when> + <!--when value="chi2"> + </when> + <when value="f_classif"> + </when> + <when value="f_regression"> + </when> + <when value="mutual_info_classif"> + </when> + <when value="mutual_info_regression"> + </when--> + </conditional> + <expand macro="feature_selection_methods" /> + <expand macro="sl_mixed_input"/> + </inputs> + <outputs> + <data format="txt" name="outfile"/> + </outputs> + <tests> + <test> + <param name="selected_algorithm" value="SelectFromModel"/> + <param name="has_estimator" value="no"/> + <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result01"/> + </test> + <test> + <param name="selected_algorithm" value="GenericUnivariateSelect"/> + <param name="param" value="20"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result02"/> + </test> + <test> + <param name="selected_algorithm" value="SelectPercentile"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result03"/> + </test> + <test> + <param name="selected_algorithm" value="SelectKBest"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result04"/> + </test> + <test> + <param name="selected_algorithm" value="SelectFpr"/> + <param name="alpha" value="0.05"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result05"/> + </test> + <test> + <param name="selected_algorithm" value="SelectFdr"/> + <param name="alpha" value="0.05"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result06"/> + </test> + <test> + <param name="selected_algorithm" value="SelectFwe"/> + <param name="alpha" value="0.05"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result07"/> + </test> + <test> + <param name="selected_algorithm" value="RFE"/> + <param name="has_estimator" value="no"/> + <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result08"/> + </test> + <test> + <param name="selected_algorithm" value="RFECV"/> + <param name="has_estimator" value="no"/> + <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result09"/> + </test> + <test> + <param name="selected_algorithm" value="VarianceThreshold"/> + <param name="threshold" value="0.1"/> + <param name="infile1" value="regression_X.tabular" ftype="tabular"/> + <param name="header1" value="True"/> + <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> + <param name="infile2" value="regression_y.tabular" ftype="tabular"/> + <param name="col2" value="1"/> + <param name="header2" value="True"/> + <output name="outfile" file="feature_selection_result10"/> + </test> + </tests> + <help> + <![CDATA[ +**What it does** +This tool provides several loss, score, and utility functions to measure classification performance. Some metrics might require probability estimates of the positive class, confidence values, or binary decisions values. This tool is based on +sklearn.metrics package. +For information about classification metric functions and their parameter settings please refer to `Scikit-learn classification metrics`_. + +.. _`Scikit-learn classification metrics`: http://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics + ]]> + </help> + <expand macro="sklearn_citation"/> +</tool>