comparison feature_selection.xml @ 0:092199a095dd draft

planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 7a31960686122d7e53054fef4996525f04ebd254
author bgruening
date Thu, 12 Apr 2018 08:23:30 -0400
parents
children 2eb90e73f0d5
comparison
equal deleted inserted replaced
-1:000000000000 0:092199a095dd
1 <tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@">
2 <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description>
3 <macros>
4 <import>main_macros.xml</import>
5 </macros>
6 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/>
8 <version_command>echo "@VERSION@"</version_command>
9 <command>
10 <![CDATA[
11 python "$feature_selection_script" '$inputs'
12 ]]>
13 </command> <!-- The command runs the generated feature_selection_script configfile and hands it, as its only argument, the path of the "inputs" configfile (read by the script with json.load). -->
14 <configfiles>
15 <inputs name="inputs" />
16 <configfile name="feature_selection_script">
17 <![CDATA[
18 import sys  # Script outline: load parameters and data, build and fit the selector, write the result table.
19 import json
20 import pandas
21 import pickle
22 import numpy as np
23 import sklearn.feature_selection
24 from sklearn import svm, linear_model, ensemble
25 from scipy.io import mmread  # the sparse input branch below calls mmread, which nothing else in this script imports
26 @COLUMNS_FUNCTION@
27
28 input_json_path = sys.argv[1]  # path of the JSON parameter dump given on the command line
29 params = json.load(open(input_json_path, "r"))
30
31 input_type = params["input_options"]["selected_input"]
32 if input_type=="tabular":  # tabular input: read the chosen columns; any other input type is read with mmread
33 header = 'infer' if params["input_options"]["header1"] else None
34 X = read_columns(
35 "$input_options.infile1",
36 "$input_options.col1",
37 sep='\t',
38 header=header,
39 parse_dates=True
40 )
41 else:
42 X = mmread(open("$input_options.infile1", 'r'))
43
44 header = 'infer' if params["input_options"]["header2"] else None
45 y = read_columns(
46 "$input_options.infile2",
47 "$input_options.col2",
48 sep='\t',
49 header=header,
50 parse_dates=True
51 )
52 y=y.ravel()  # flatten the target values to one dimension before fitting
53
54 selector = params["feature_selection_algorithms"]["selected_algorithm"]
55 selector = getattr(sklearn.feature_selection, selector)  # resolve the selected name to the selector class
56 options = params["feature_selection_algorithms"]["options"]  # forwarded to the selector as keyword arguments
57
58 #if $feature_selection_algorithms.selected_algorithm == 'SelectFromModel':
59 if not options['threshold'] or options['threshold'] == 'None':  # empty text or the literal word None both mean no threshold
60 options['threshold'] = None
61 #if $feature_selection_algorithms.extra_estimator.has_estimator == 'no_load':
62 fitted_estimator = pickle.load(open("$feature_selection_algorithms.extra_estimator.fitted_estimator", 'rb'))  # pickle data must be read in binary mode. SECURITY: unpickling can execute arbitrary code, load trusted datasets only.
63 new_selector = selector(fitted_estimator, prefit=True, **options)  # prefitted estimator, so no fit call here
64 #else:
65 estimator=params["feature_selection_algorithms"]["estimator"]
66 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no':
67 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"]
68 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))  # SECURITY: evaluates user-supplied text as Python code after restoring the escaped quote characters
69 new_selector = selector(estimator, **options)
70 new_selector.fit(X, y)
71 #end if
72
73 #elif $feature_selection_algorithms.selected_algorithm in ['RFE', 'RFECV']:
74 if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):  # empty text or the literal word None both mean no scoring value
75 options['scoring'] = None
76 estimator=params["feature_selection_algorithms"]["estimator"]
77 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no':
78 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"]
79 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))  # SECURITY: evaluates user-supplied text as Python code after restoring the escaped quote characters
80 new_selector = selector(estimator, **options)
81 new_selector.fit(X, y)
82
83 #elif $feature_selection_algorithms.selected_algorithm == "VarianceThreshold":
84 new_selector = selector(**options)
85 new_selector.fit(X, y)
86
87 #else:
88 score_func = params["feature_selection_algorithms"]["score_func"]
89 score_func = getattr(sklearn.feature_selection, score_func)  # resolve the score function name the same way as the selector
90 new_selector = selector(score_func, **options)
91 new_selector.fit(X, y)
92 #end if
93
94 #if $select_methods.selected_method == "fit_transform":
95 res = new_selector.transform(X)  # the selector was already fitted (or prefitted) above, so only transform is applied
96
97 #else:
98 res = new_selector.get_support(params["select_methods"]["indices"])
99 #end if
100
101 res = pandas.DataFrame(res)
102 res.to_csv(path_or_buf="$outfile", sep='\t', index=False)  # tab-separated output without the index column
103
104
105 ]]>
106 </configfile>
107 </configfiles>
108 <inputs>
109 <conditional name="feature_selection_algorithms">
110 <param name="selected_algorithm" type="select" label="Select a feature selection algorithm"> <!-- Single-choice select: exactly one option (SelectFromModel) is marked as the default. Each value is looked up by name in sklearn.feature_selection by the script. -->
111 <option value="SelectFromModel" selected="true">SelectFromModel - Meta-transformer for selecting features based on importance weights</option>
112 <option value="GenericUnivariateSelect">GenericUnivariateSelect - Univariate feature selector with configurable strategy</option>
113 <option value="SelectPercentile">SelectPercentile - Select features according to a percentile of the highest scores</option>
114 <option value="SelectKBest">SelectKBest - Select features according to the k highest scores</option>
115 <option value="SelectFpr">SelectFpr - Filter: Select the p-values below alpha based on a FPR test</option>
116 <option value="SelectFdr">SelectFdr - Filter: Select the p-values for an estimated false discovery rate</option>
117 <option value="SelectFwe">SelectFwe - Filter: Select the p-values corresponding to Family-wise error rate</option>
118 <option value="RFE">RFE - Feature ranking with recursive feature elimination</option>
119 <option value="RFECV">RFECV - Feature ranking with recursive feature elimination and cross-validated selection of the best number of features</option>
120 <option value="VarianceThreshold">VarianceThreshold - Feature selector that removes all low-variance features</option>
121 <!--option value="chi2">Compute chi-squared stats between each non-negative feature and class</option-->
122 <!--option value="f_classif">Compute the ANOVA F-value for the provided sample</option-->
123 <!--option value="f_regression">Univariate linear regression tests</option-->
124 <!--option value="mutual_info_classif">Estimate mutual information for a discrete target variable</option-->
125 <!--option value="mutual_info_regression">Estimate mutual information for a continuous target variable</option-->
126 </param>
127 <when value="SelectFromModel"> <!-- Estimator-based selection. The extra "no_load" choice lets the user supply a prefitted estimator dataset, which the script unpickles and passes to the selector with prefit=True; the "options" section is forwarded to the selector as keyword arguments. -->
128 <expand macro="feature_selection_estimator" />
129 <conditional name="extra_estimator">
130 <expand macro="feature_selection_extra_estimator" >
131 <option value="no_load">No, I will load a prefitted estimator</option>
132 </expand>
133 <expand macro="feature_selection_estimator_choices" >
134 <when value="no_load">
135 <param name="fitted_estimator" type="data" format='zip' label="Load a prefitted estimator" />
136 </when>
137 </expand>
138 </conditional>
139 <section name="options" title="Other Options" expanded="True">
140 <param argument="threshold" type="text" value="" optional="true" label="threshold" help="The threshold value to use for feature selection. e.g. 'mean', 'median', '1.25*mean'." />
141 <param argument="norm_order" type="integer" value="1" label="norm_order" help="Order of the norm used to filter the vectors of coefficients below threshold in the case where the coef_ attribute of the estimator is of dimension 2. " />
142 </section>
143 </when>
144 <when value="GenericUnivariateSelect"> <!-- Univariate selection: the script builds selector(score_func, **options), so "mode" and "param" reach the selector as keyword arguments. The score function choice presumably comes from the expanded macro (defined outside this file). -->
145 <expand macro="feature_selection_score_function" />
146 <section name="options" title="Other Options" expanded="True">
147 <param argument="mode" type="select" label="Feature selection mode">
148 <option value="percentile">percentile</option>
149 <option value="k_best">k_best</option>
150 <option value="fpr">fpr</option>
151 <option value="fdr">fdr</option>
152 <option value="fwe">fwe</option>
153 </param>
154 <param argument="param" type="float" value="" optional="true" label="Parameter of the corresponding mode" help="float or int depending on the feature selection mode" />
155 </section>
156 </when>
157 <when value="SelectPercentile"> <!-- Univariate selection: "percentile" is forwarded to the selector as a keyword argument together with the chosen score function. -->
158 <expand macro="feature_selection_score_function" />
159 <section name="options" title="Other Options" expanded="True">
160 <param argument="percentile" type="integer" value="10" optional="True" label="Percent of features to keep" />
161 </section>
162 </when>
163 <when value="SelectKBest"> <!-- Univariate selection: "k" is forwarded to the selector as a keyword argument together with the chosen score function; the field is an integer, so only numeric values can be entered. -->
164 <expand macro="feature_selection_score_function" />
165 <section name="options" title="Other Options" expanded="True">
166 <param argument="k" type="integer" value="10" optional="True" label="Number of top features to select" help="No 'all' option is supported." />
167 </section>
168 </when>
169 <when value="SelectFpr"> <!-- Univariate selection: "alpha" is forwarded to the selector as a keyword argument together with the chosen score function. -->
170 <expand macro="feature_selection_score_function" />
171 <section name="options" title="Other Options" expanded="True">
172 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest p-value for features to be kept."/>
173 </section>
174 </when>
175 <when value="SelectFdr"> <!-- Univariate selection: "alpha" is forwarded to the selector as a keyword argument together with the chosen score function. -->
176 <expand macro="feature_selection_score_function" />
177 <section name="options" title="Other Options" expanded="True">
178 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/>
179 </section>
180 </when>
181 <when value="SelectFwe"> <!-- Univariate selection: "alpha" is forwarded to the selector as a keyword argument together with the chosen score function. -->
182 <expand macro="feature_selection_score_function" />
183 <section name="options" title="Other Options" expanded="True">
184 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/>
185 </section>
186 </when>
187 <when value="RFE"> <!-- Estimator-based selection: the script always fits a new selector here (there is no prefitted-estimator choice in this branch) and forwards the "options" section as keyword arguments. -->
188 <expand macro="feature_selection_estimator" />
189 <conditional name="extra_estimator">
190 <expand macro="feature_selection_extra_estimator" />
191 <expand macro="feature_selection_estimator_choices" />
192 </conditional>
193 <section name="options" title="Other Options" expanded="True">
194 <param argument="n_features_to_select" type="integer" value="" optional="true" label="n_features_to_select" help="The number of features to select. If None, half of the features are selected." />
195 <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " />
196 <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." />
197 </section>
198 </when>
199 <when value="RFECV"> <!-- Estimator-based selection handled by the same script branch as RFE; an empty "scoring" text (or the literal word None) is converted to None by the script before the options are forwarded to the selector. -->
200 <expand macro="feature_selection_estimator" />
201 <conditional name="extra_estimator">
202 <expand macro="feature_selection_extra_estimator" />
203 <expand macro="feature_selection_estimator_choices" />
204 </conditional>
205 <section name="options" title="Other Options" expanded="True">
206 <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " />
207 <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" />
208 <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/>
209 <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." />
210 <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/>
211 </section>
212 </when>
213 <when value="VarianceThreshold"> <!-- Needs neither an estimator nor a score function: the script builds the selector from the "options" section alone. -->
214 <section name="options" title="Options" expanded="True">
215 <param argument="threshold" type="float" value="" optional="True" label="Threshold" help="Features with a training-set variance lower than this threshold will be removed."/>
216 </section>
217 </when>
218 <!--when value="chi2">
219 </when>
220 <when value="f_classif">
221 </when>
222 <when value="f_regression">
223 </when>
224 <when value="mutual_info_classif">
225 </when>
226 <when value="mutual_info_regression">
227 </when-->
228 </conditional>
229 <expand macro="feature_selection_methods" />
230 <expand macro="sl_mixed_input"/>
231 </inputs>
232 <outputs> <!-- The script writes the result with pandas to_csv using a tab separator, so the dataset is declared as tabular. -->
233 <data format="tabular" name="outfile"/>
234 </outputs>
235 <tests>
236 <test> <!-- SelectFromModel with a newly built RandomForestRegressor (has_estimator = no) on the 17-column regression dataset. -->
237 <param name="selected_algorithm" value="SelectFromModel"/>
238 <param name="has_estimator" value="no"/>
239 <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/>
240 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
241 <param name="header1" value="True"/>
242 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
243 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
244 <param name="col2" value="1"/>
245 <param name="header2" value="True"/>
246 <output name="outfile" file="feature_selection_result01"/>
247 </test>
248 <test> <!-- GenericUnivariateSelect with param = 20; mode and score function are not set here, so the tool defaults apply. -->
249 <param name="selected_algorithm" value="GenericUnivariateSelect"/>
250 <param name="param" value="20"/>
251 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
252 <param name="header1" value="True"/>
253 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
254 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
255 <param name="col2" value="1"/>
256 <param name="header2" value="True"/>
257 <output name="outfile" file="feature_selection_result02"/>
258 </test>
259 <test> <!-- SelectPercentile with every option left at its default value. -->
260 <param name="selected_algorithm" value="SelectPercentile"/>
261 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
262 <param name="header1" value="True"/>
263 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
264 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
265 <param name="col2" value="1"/>
266 <param name="header2" value="True"/>
267 <output name="outfile" file="feature_selection_result03"/>
268 </test>
269 <test> <!-- SelectKBest with every option left at its default value. -->
270 <param name="selected_algorithm" value="SelectKBest"/>
271 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
272 <param name="header1" value="True"/>
273 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
274 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
275 <param name="col2" value="1"/>
276 <param name="header2" value="True"/>
277 <output name="outfile" file="feature_selection_result04"/>
278 </test>
279 <test> <!-- SelectFpr with alpha = 0.05. -->
280 <param name="selected_algorithm" value="SelectFpr"/>
281 <param name="alpha" value="0.05"/>
282 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
283 <param name="header1" value="True"/>
284 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
285 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
286 <param name="col2" value="1"/>
287 <param name="header2" value="True"/>
288 <output name="outfile" file="feature_selection_result05"/>
289 </test>
290 <test> <!-- SelectFdr with alpha = 0.05. -->
291 <param name="selected_algorithm" value="SelectFdr"/>
292 <param name="alpha" value="0.05"/>
293 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
294 <param name="header1" value="True"/>
295 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
296 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
297 <param name="col2" value="1"/>
298 <param name="header2" value="True"/>
299 <output name="outfile" file="feature_selection_result06"/>
300 </test>
301 <test> <!-- SelectFwe with alpha = 0.05. -->
302 <param name="selected_algorithm" value="SelectFwe"/>
303 <param name="alpha" value="0.05"/>
304 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
305 <param name="header1" value="True"/>
306 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
307 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
308 <param name="col2" value="1"/>
309 <param name="header2" value="True"/>
310 <output name="outfile" file="feature_selection_result07"/>
311 </test>
312 <test> <!-- RFE with a newly built RandomForestRegressor (has_estimator = no); the other options keep their defaults. -->
313 <param name="selected_algorithm" value="RFE"/>
314 <param name="has_estimator" value="no"/>
315 <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/>
316 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
317 <param name="header1" value="True"/>
318 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
319 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
320 <param name="col2" value="1"/>
321 <param name="header2" value="True"/>
322 <output name="outfile" file="feature_selection_result08"/>
323 </test>
324 <test> <!-- RFECV with a newly built RandomForestRegressor (has_estimator = no); the other options keep their defaults. -->
325 <param name="selected_algorithm" value="RFECV"/>
326 <param name="has_estimator" value="no"/>
327 <param name="new_estimator" value="ensemble.RandomForestRegressor(n_estimators = 1000, random_state = 42)"/>
328 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
329 <param name="header1" value="True"/>
330 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
331 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
332 <param name="col2" value="1"/>
333 <param name="header2" value="True"/>
334 <output name="outfile" file="feature_selection_result09"/>
335 </test>
336 <test> <!-- VarianceThreshold with threshold = 0.1. -->
337 <param name="selected_algorithm" value="VarianceThreshold"/>
338 <param name="threshold" value="0.1"/>
339 <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
340 <param name="header1" value="True"/>
341 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
342 <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
343 <param name="col2" value="1"/>
344 <param name="header2" value="True"/>
345 <output name="outfile" file="feature_selection_result10"/>
346 </test>
347 </tests>
348 <help>
349 <![CDATA[
350 **What it does**
351 This tool selects the most informative features of a dataset. It offers model-based selection (SelectFromModel), univariate filter methods (GenericUnivariateSelect, SelectPercentile, SelectKBest, SelectFpr, SelectFdr, SelectFwe), recursive feature elimination (RFE, RFECV) and removal of low-variance features (VarianceThreshold). This tool is based on
352 the sklearn.feature_selection package.
353 For information about the feature selection algorithms and their parameter settings please refer to `Scikit-learn feature selection`_.
354
355 .. _`Scikit-learn feature selection`: http://scikit-learn.org/stable/modules/feature_selection.html
356 ]]>
357 </help>
358 <expand macro="sklearn_citation"/>
359 </tool>