Mercurial > repos > bgruening > sklearn_feature_selection
diff feature_selection.xml @ 2:2eb90e73f0d5 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 79fe42239dcf077b13f85cbcd6c6e30d7e1e4832
author | bgruening |
---|---|
date | Tue, 22 May 2018 19:31:59 -0400 |
parents | 092199a095dd |
children | 3a1acc39b39b |
line wrap: on
line diff
--- a/feature_selection.xml Sat Apr 28 18:08:48 2018 -0400 +++ b/feature_selection.xml Tue May 22 19:31:59 2018 -0400 @@ -1,4 +1,4 @@ -<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@"> +<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@.1"> <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description> <macros> <import>main_macros.xml</import> @@ -28,12 +28,16 @@ input_json_path = sys.argv[1] params = json.load(open(input_json_path, "r")) +## Read features +features_has_header = params["input_options"]["header1"] input_type = params["input_options"]["selected_input"] if input_type=="tabular": + header = 'infer' if features_has_header else None header = 'infer' if params["input_options"]["header1"] else None - X = read_columns( + X, input_df = read_columns( "$input_options.infile1", "$input_options.col1", + return_df = True, sep='\t', header=header, parse_dates=True @@ -41,6 +45,7 @@ else: X = mmread(open("$input_options.infile1", 'r')) +## Read labels header = 'infer' if params["input_options"]["header2"] else None y = read_columns( "$input_options.infile2", @@ -51,54 +56,55 @@ ) y=y.ravel() +## Create feature selector selector = params["feature_selection_algorithms"]["selected_algorithm"] selector = getattr(sklearn.feature_selection, selector) options = params["feature_selection_algorithms"]["options"] -#if $feature_selection_algorithms.selected_algorithm == 'SelectFromModel': -if not options['threshold'] or options['threshold'] == 'None': - options['threshold'] = None -#if $feature_selection_algorithms.extra_estimator.has_estimator == 'no_load': -fitted_estimator = pickle.load(open("$feature_selection_algorithms.extra_estimator.fitted_estimator", 'r')) -new_selector = selector(fitted_estimator, prefit=True, **options) -#else: -estimator=params["feature_selection_algorithms"]["estimator"] -if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': - estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] -estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) -new_selector = selector(estimator, **options) -new_selector.fit(X, y) -#end if +if params['feature_selection_algorithms']['selected_algorithm'] == 'SelectFromModel': + if not options['threshold'] or options['threshold'] == 'None': + options['threshold'] = None + if 'extra_estimator' in params['feature_selection_algorithms'] and params['feature_selection_algorithms']['extra_estimator']['has_estimator'] == 'no_load': + fitted_estimator = pickle.load(open("params['feature_selection_algorithms']['extra_estimator']['fitted_estimator']", 'r')) + new_selector = selector(fitted_estimator, prefit=True, **options) + else: + estimator=params["feature_selection_algorithms"]["estimator"] + if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': + estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] + estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) + new_selector = selector(estimator, **options) + new_selector.fit(X, y) -#elif $feature_selection_algorithms.selected_algorithm in ['RFE', 'RFECV']: -if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'): - options['scoring'] = None -estimator=params["feature_selection_algorithms"]["estimator"] -if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': - estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] -estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) -new_selector = selector(estimator, **options) -new_selector.fit(X, y) +elif params['feature_selection_algorithms']['selected_algorithm'] in ['RFE', 'RFECV']: + if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'): + options['scoring'] = None + estimator=params["feature_selection_algorithms"]["estimator"] + if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': + estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] + estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) + new_selector = selector(estimator, **options) + new_selector.fit(X, y) -#elif $feature_selection_algorithms.selected_algorithm == "VarianceThreshold": -new_selector = selector(**options) -new_selector.fit(X, y) +elif params['feature_selection_algorithms']['selected_algorithm'] == "VarianceThreshold": + new_selector = selector(**options) + new_selector.fit(X, y) -#else: -score_func = params["feature_selection_algorithms"]["score_func"] -score_func = getattr(sklearn.feature_selection, score_func) -new_selector = selector(score_func, **options) -new_selector.fit(X, y) -#end if +else: + score_func = params["feature_selection_algorithms"]["score_func"] + score_func = getattr(sklearn.feature_selection, score_func) + new_selector = selector(score_func, **options) + new_selector.fit(X, y) -#if $select_methods.selected_method == "fit_transform": -res = new_selector.transform(X) +## Transform to select features +selected_names = None +if "$select_methods.selected_method" == "fit_transform": + res = new_selector.transform(X) + if features_has_header: + selected_names = input_df.columns[new_selector.get_support(indices=True)] +else: + res = new_selector.get_support(params["select_methods"]["indices"]) -#else: -res = new_selector.get_support(params["select_methods"]["indices"]) -#end if - -res = pandas.DataFrame(res) +res = pandas.DataFrame(res, columns = selected_names) res.to_csv(path_or_buf="$outfile", sep='\t', index=False) @@ -106,131 +112,12 @@ </configfile> </configfiles> <inputs> - <conditional name="feature_selection_algorithms"> - <param name="selected_algorithm" type="select" label="Select a feature selection algorithm"> - <option value="SelectFromModel" selected="true">SelectFromModel - Meta-transformer for selecting features based on importance weights</option> - <option value="GenericUnivariateSelect" selected="true">GenericUnivariateSelect - Univariate feature selector with configurable strategy</option> - <option value="SelectPercentile">SelectPercentile - Select features according to a percentile of the highest scores</option> - <option value="SelectKBest">SelectKBest - Select features according to the k highest scores</option> - <option value="SelectFpr">SelectFpr - Filter: Select the p-values below alpha based on a FPR test</option> - <option value="SelectFdr">SelectFdr - Filter: Select the p-values for an estimated false discovery rate</option> - <option value="SelectFwe">SelectFwe - Filter: Select the p-values corresponding to Family-wise error rate</option> - <option value="RFE">RFE - Feature ranking with recursive feature elimination</option> - <option value="RFECV">RFECV - Feature ranking with recursive feature elimination and cross-validated selection of the best number of features</option> - <option value="VarianceThreshold">VarianceThreshold - Feature selector that removes all low-variance features</option> - <!--option value="chi2">Compute chi-squared stats between each non-negative feature and class</option--> - <!--option value="f_classif">Compute the ANOVA F-value for the provided sample</option--> - <!--option value="f_regression">Univariate linear regression tests</option--> - <!--option value="mutual_info_classif">Estimate mutual information for a discrete target variable</option--> - <!--option value="mutual_info_regression">Estimate mutual information for a continuous target variable</option--> - </param> - <when value="SelectFromModel"> - <expand macro="feature_selection_estimator" /> - <conditional name="extra_estimator"> - <expand macro="feature_selection_extra_estimator" > - <option value="no_load">No, I will load a prefitted estimator</option> - </expand> - <expand macro="feature_selection_estimator_choices" > - <when value="no_load"> - <param name="fitted_estimator" type="data" format='zip' label="Load a prefitted estimator" /> - </when> - </expand> - </conditional> - <section name="options" title="Other Options" expanded="True"> - <param argument="threshold" type="text" value="" optional="true" label="threshold" help="The threshold value to use for feature selection. e.g. 'mean', 'median', '1.25*mean'." /> - <param argument="norm_order" type="integer" value="1" label="norm_order" help="Order of the norm used to filter the vectors of coefficients below threshold in the case where the coef_ attribute of the estimator is of dimension 2. " /> - </section> - </when> - <when value="GenericUnivariateSelect"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="mode" type="select" label="Feature selection mode"> - <option value="percentile">percentile</option> - <option value="k_best">k_best</option> - <option value="fpr">fpr</option> - <option value="fdr">fdr</option> - <option value="fwe">fwe</option> - </param> - <param argument="param" type="float" value="" optional="true" label="Parameter of the corresponding mode" help="float or int depending on the feature selection mode" /> - </section> - </when> - <when value="SelectPercentile"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="percentile" type="integer" value="10" optional="True" label="Percent of features to keep" /> - </section> - </when> - <when value="SelectKBest"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="k" type="integer" value="10" optional="True" label="Number of top features to select" help="No 'all' option is supported." /> - </section> - </when> - <when value="SelectFpr"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest p-value for features to be kept."/> - </section> - </when> - <when value="SelectFdr"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/> - </section> - </when> - <when value="SelectFwe"> - <expand macro="feature_selection_score_function" /> - <section name="options" title="Other Options" expanded="True"> - <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/> - </section> - </when> - <when value="RFE"> - <expand macro="feature_selection_estimator" /> - <conditional name="extra_estimator"> - <expand macro="feature_selection_extra_estimator" /> - <expand macro="feature_selection_estimator_choices" /> - </conditional> - <section name="options" title="Other Options" expanded="True"> - <param argument="n_features_to_select" type="integer" value="" optional="true" label="n_features_to_select" help="The number of features to select. If None, half of the features are selected." /> - <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " /> - <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." /> - </section> - </when> - <when value="RFECV"> - <expand macro="feature_selection_estimator" /> - <conditional name="extra_estimator"> - <expand macro="feature_selection_extra_estimator" /> - <expand macro="feature_selection_estimator_choices" /> - </conditional> - <section name="options" title="Other Options" expanded="True"> - <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " /> - <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" /> - <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/> - <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." /> - <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/> - </section> - </when> - <when value="VarianceThreshold"> - <section name="options" title="Options" expanded="True"> - <param argument="threshold" type="float" value="" optional="True" label="Threshold" help="Features with a training-set variance lower than this threshold will be removed."/> - </section> - </when> - <!--when value="chi2"> - </when> - <when value="f_classif"> - </when> - <when value="f_regression"> - </when> - <when value="mutual_info_classif"> - </when> - <when value="mutual_info_regression"> - </when--> - </conditional> + <expand macro="feature_selection_all" /> <expand macro="feature_selection_methods" /> <expand macro="sl_mixed_input"/> </inputs> <outputs> - <data format="txt" name="outfile"/> + <data format="tabular" name="outfile"/> </outputs> <tests> <test>