comparison feature_selection.xml @ 2:2eb90e73f0d5 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 79fe42239dcf077b13f85cbcd6c6e30d7e1e4832
author bgruening
date Tue, 22 May 2018 19:31:59 -0400
parents 092199a095dd
children 3a1acc39b39b
comparison
equal deleted inserted replaced
1:58322d3c7bd3 2:2eb90e73f0d5
1 <tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@"> 1 <tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@.1">
2 <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description> 2 <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description>
3 <macros> 3 <macros>
4 <import>main_macros.xml</import> 4 <import>main_macros.xml</import>
5 </macros> 5 </macros>
6 <expand macro="python_requirements"/> 6 <expand macro="python_requirements"/>
26 @COLUMNS_FUNCTION@ 26 @COLUMNS_FUNCTION@
27 27
28 input_json_path = sys.argv[1] 28 input_json_path = sys.argv[1]
29 params = json.load(open(input_json_path, "r")) 29 params = json.load(open(input_json_path, "r"))
30 30
31 ## Read features
32 features_has_header = params["input_options"]["header1"]
31 input_type = params["input_options"]["selected_input"] 33 input_type = params["input_options"]["selected_input"]
32 if input_type=="tabular": 34 if input_type=="tabular":
35 header = 'infer' if features_has_header else None
33 header = 'infer' if params["input_options"]["header1"] else None 36 header = 'infer' if params["input_options"]["header1"] else None
34 X = read_columns( 37 X, input_df = read_columns(
35 "$input_options.infile1", 38 "$input_options.infile1",
36 "$input_options.col1", 39 "$input_options.col1",
40 return_df = True,
37 sep='\t', 41 sep='\t',
38 header=header, 42 header=header,
39 parse_dates=True 43 parse_dates=True
40 ) 44 )
41 else: 45 else:
42 X = mmread(open("$input_options.infile1", 'r')) 46 X = mmread(open("$input_options.infile1", 'r'))
43 47
48 ## Read labels
44 header = 'infer' if params["input_options"]["header2"] else None 49 header = 'infer' if params["input_options"]["header2"] else None
45 y = read_columns( 50 y = read_columns(
46 "$input_options.infile2", 51 "$input_options.infile2",
47 "$input_options.col2", 52 "$input_options.col2",
48 sep='\t', 53 sep='\t',
49 header=header, 54 header=header,
50 parse_dates=True 55 parse_dates=True
51 ) 56 )
52 y=y.ravel() 57 y=y.ravel()
53 58
59 ## Create feature selector
54 selector = params["feature_selection_algorithms"]["selected_algorithm"] 60 selector = params["feature_selection_algorithms"]["selected_algorithm"]
55 selector = getattr(sklearn.feature_selection, selector) 61 selector = getattr(sklearn.feature_selection, selector)
56 options = params["feature_selection_algorithms"]["options"] 62 options = params["feature_selection_algorithms"]["options"]
57 63
58 #if $feature_selection_algorithms.selected_algorithm == 'SelectFromModel': 64 if params['feature_selection_algorithms']['selected_algorithm'] == 'SelectFromModel':
59 if not options['threshold'] or options['threshold'] == 'None': 65 if not options['threshold'] or options['threshold'] == 'None':
60 options['threshold'] = None 66 options['threshold'] = None
61 #if $feature_selection_algorithms.extra_estimator.has_estimator == 'no_load': 67 if 'extra_estimator' in params['feature_selection_algorithms'] and params['feature_selection_algorithms']['extra_estimator']['has_estimator'] == 'no_load':
62 fitted_estimator = pickle.load(open("$feature_selection_algorithms.extra_estimator.fitted_estimator", 'r')) 68 fitted_estimator = pickle.load(open("params['feature_selection_algorithms']['extra_estimator']['fitted_estimator']", 'r'))
63 new_selector = selector(fitted_estimator, prefit=True, **options) 69 new_selector = selector(fitted_estimator, prefit=True, **options)
64 #else: 70 else:
65 estimator=params["feature_selection_algorithms"]["estimator"] 71 estimator=params["feature_selection_algorithms"]["estimator"]
66 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': 72 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no':
67 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] 73 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"]
68 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) 74 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
69 new_selector = selector(estimator, **options) 75 new_selector = selector(estimator, **options)
70 new_selector.fit(X, y) 76 new_selector.fit(X, y)
71 #end if 77
72 78 elif params['feature_selection_algorithms']['selected_algorithm'] in ['RFE', 'RFECV']:
73 #elif $feature_selection_algorithms.selected_algorithm in ['RFE', 'RFECV']: 79 if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'):
74 if 'scoring' in options and (not options['scoring'] or options['scoring'] == 'None'): 80 options['scoring'] = None
75 options['scoring'] = None 81 estimator=params["feature_selection_algorithms"]["estimator"]
76 estimator=params["feature_selection_algorithms"]["estimator"] 82 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no':
77 if params["feature_selection_algorithms"]["extra_estimator"]["has_estimator"]=='no': 83 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"]
78 estimator=params["feature_selection_algorithms"]["extra_estimator"]["new_estimator"] 84 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'"))
79 estimator=eval(estimator.replace('__dq__', '"').replace("__sq__","'")) 85 new_selector = selector(estimator, **options)
80 new_selector = selector(estimator, **options) 86 new_selector.fit(X, y)
81 new_selector.fit(X, y) 87
82 88 elif params['feature_selection_algorithms']['selected_algorithm'] == "VarianceThreshold":
83 #elif $feature_selection_algorithms.selected_algorithm == "VarianceThreshold": 89 new_selector = selector(**options)
84 new_selector = selector(**options) 90 new_selector.fit(X, y)
85 new_selector.fit(X, y) 91
86 92 else:
87 #else: 93 score_func = params["feature_selection_algorithms"]["score_func"]
88 score_func = params["feature_selection_algorithms"]["score_func"] 94 score_func = getattr(sklearn.feature_selection, score_func)
89 score_func = getattr(sklearn.feature_selection, score_func) 95 new_selector = selector(score_func, **options)
90 new_selector = selector(score_func, **options) 96 new_selector.fit(X, y)
91 new_selector.fit(X, y) 97
92 #end if 98 ## Transform to select features
93 99 selected_names = None
94 #if $select_methods.selected_method == "fit_transform": 100 if "$select_methods.selected_method" == "fit_transform":
95 res = new_selector.transform(X) 101 res = new_selector.transform(X)
96 102 if features_has_header:
97 #else: 103 selected_names = input_df.columns[new_selector.get_support(indices=True)]
98 res = new_selector.get_support(params["select_methods"]["indices"]) 104 else:
99 #end if 105 res = new_selector.get_support(params["select_methods"]["indices"])
100 106
101 res = pandas.DataFrame(res) 107 res = pandas.DataFrame(res, columns = selected_names)
102 res.to_csv(path_or_buf="$outfile", sep='\t', index=False) 108 res.to_csv(path_or_buf="$outfile", sep='\t', index=False)
103 109
104 110
105 ]]> 111 ]]>
106 </configfile> 112 </configfile>
107 </configfiles> 113 </configfiles>
108 <inputs> 114 <inputs>
109 <conditional name="feature_selection_algorithms"> 115 <expand macro="feature_selection_all" />
110 <param name="selected_algorithm" type="select" label="Select a feature selection algorithm">
111 <option value="SelectFromModel" selected="true">SelectFromModel - Meta-transformer for selecting features based on importance weights</option>
112 <option value="GenericUnivariateSelect" selected="true">GenericUnivariateSelect - Univariate feature selector with configurable strategy</option>
113 <option value="SelectPercentile">SelectPercentile - Select features according to a percentile of the highest scores</option>
114 <option value="SelectKBest">SelectKBest - Select features according to the k highest scores</option>
115 <option value="SelectFpr">SelectFpr - Filter: Select the p-values below alpha based on a FPR test</option>
116 <option value="SelectFdr">SelectFdr - Filter: Select the p-values for an estimated false discovery rate</option>
117 <option value="SelectFwe">SelectFwe - Filter: Select the p-values corresponding to Family-wise error rate</option>
118 <option value="RFE">RFE - Feature ranking with recursive feature elimination</option>
119 <option value="RFECV">RFECV - Feature ranking with recursive feature elimination and cross-validated selection of the best number of features</option>
120 <option value="VarianceThreshold">VarianceThreshold - Feature selector that removes all low-variance features</option>
121 <!--option value="chi2">Compute chi-squared stats between each non-negative feature and class</option-->
122 <!--option value="f_classif">Compute the ANOVA F-value for the provided sample</option-->
123 <!--option value="f_regression">Univariate linear regression tests</option-->
124 <!--option value="mutual_info_classif">Estimate mutual information for a discrete target variable</option-->
125 <!--option value="mutual_info_regression">Estimate mutual information for a continuous target variable</option-->
126 </param>
127 <when value="SelectFromModel">
128 <expand macro="feature_selection_estimator" />
129 <conditional name="extra_estimator">
130 <expand macro="feature_selection_extra_estimator" >
131 <option value="no_load">No, I will load a prefitted estimator</option>
132 </expand>
133 <expand macro="feature_selection_estimator_choices" >
134 <when value="no_load">
135 <param name="fitted_estimator" type="data" format='zip' label="Load a prefitted estimator" />
136 </when>
137 </expand>
138 </conditional>
139 <section name="options" title="Other Options" expanded="True">
140 <param argument="threshold" type="text" value="" optional="true" label="threshold" help="The threshold value to use for feature selection. e.g. 'mean', 'median', '1.25*mean'." />
141 <param argument="norm_order" type="integer" value="1" label="norm_order" help="Order of the norm used to filter the vectors of coefficients below threshold in the case where the coef_ attribute of the estimator is of dimension 2. " />
142 </section>
143 </when>
144 <when value="GenericUnivariateSelect">
145 <expand macro="feature_selection_score_function" />
146 <section name="options" title="Other Options" expanded="True">
147 <param argument="mode" type="select" label="Feature selection mode">
148 <option value="percentile">percentile</option>
149 <option value="k_best">k_best</option>
150 <option value="fpr">fpr</option>
151 <option value="fdr">fdr</option>
152 <option value="fwe">fwe</option>
153 </param>
154 <param argument="param" type="float" value="" optional="true" label="Parameter of the corresponding mode" help="float or int depending on the feature selection mode" />
155 </section>
156 </when>
157 <when value="SelectPercentile">
158 <expand macro="feature_selection_score_function" />
159 <section name="options" title="Other Options" expanded="True">
160 <param argument="percentile" type="integer" value="10" optional="True" label="Percent of features to keep" />
161 </section>
162 </when>
163 <when value="SelectKBest">
164 <expand macro="feature_selection_score_function" />
165 <section name="options" title="Other Options" expanded="True">
166 <param argument="k" type="integer" value="10" optional="True" label="Number of top features to select" help="No 'all' option is supported." />
167 </section>
168 </when>
169 <when value="SelectFpr">
170 <expand macro="feature_selection_score_function" />
171 <section name="options" title="Other Options" expanded="True">
172 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest p-value for features to be kept."/>
173 </section>
174 </when>
175 <when value="SelectFdr">
176 <expand macro="feature_selection_score_function" />
177 <section name="options" title="Other Options" expanded="True">
178 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/>
179 </section>
180 </when>
181 <when value="SelectFwe">
182 <expand macro="feature_selection_score_function" />
183 <section name="options" title="Other Options" expanded="True">
184 <param argument="alpha" type="float" value="" optional="True" label="Alpha" help="The highest uncorrected p-value for features to keep."/>
185 </section>
186 </when>
187 <when value="RFE">
188 <expand macro="feature_selection_estimator" />
189 <conditional name="extra_estimator">
190 <expand macro="feature_selection_extra_estimator" />
191 <expand macro="feature_selection_estimator_choices" />
192 </conditional>
193 <section name="options" title="Other Options" expanded="True">
194 <param argument="n_features_to_select" type="integer" value="" optional="true" label="n_features_to_select" help="The number of features to select. If None, half of the features are selected." />
195 <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " />
196 <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." />
197 </section>
198 </when>
199 <when value="RFECV">
200 <expand macro="feature_selection_estimator" />
201 <conditional name="extra_estimator">
202 <expand macro="feature_selection_extra_estimator" />
203 <expand macro="feature_selection_estimator_choices" />
204 </conditional>
205 <section name="options" title="Other Options" expanded="True">
206 <param argument="step" type="float" value="1" label="step" optional="true" help="Default = 1. " />
207 <param argument="cv" type="integer" value="" optional="true" label="cv" help="Determines the cross-validation splitting strategy" />
208 <param argument="scoring" type="text" value="" optional="true" label="scoring" help="A string (see model evaluation documentation) or a scorer callable object / function with signature scorer(estimator, X, y)."/>
209 <param argument="verbose" type="integer" value="0" label="verbose" help="Controls verbosity of output." />
210 <param argument="n_jobs" type="integer" value="1" label="n_jobs" help="Number of cores to run in parallel while fitting across folds. Defaults to 1 core."/>
211 </section>
212 </when>
213 <when value="VarianceThreshold">
214 <section name="options" title="Options" expanded="True">
215 <param argument="threshold" type="float" value="" optional="True" label="Threshold" help="Features with a training-set variance lower than this threshold will be removed."/>
216 </section>
217 </when>
218 <!--when value="chi2">
219 </when>
220 <when value="f_classif">
221 </when>
222 <when value="f_regression">
223 </when>
224 <when value="mutual_info_classif">
225 </when>
226 <when value="mutual_info_regression">
227 </when-->
228 </conditional>
229 <expand macro="feature_selection_methods" /> 116 <expand macro="feature_selection_methods" />
230 <expand macro="sl_mixed_input"/> 117 <expand macro="sl_mixed_input"/>
231 </inputs> 118 </inputs>
232 <outputs> 119 <outputs>
233 <data format="txt" name="outfile"/> 120 <data format="tabular" name="outfile"/>
234 </outputs> 121 </outputs>
235 <tests> 122 <tests>
236 <test> 123 <test>
237 <param name="selected_algorithm" value="SelectFromModel"/> 124 <param name="selected_algorithm" value="SelectFromModel"/>
238 <param name="has_estimator" value="no"/> 125 <param name="has_estimator" value="no"/>