Mercurial > repos > bgruening > sklearn_feature_selection
comparison feature_selection.xml @ 18:ec25331946b8 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit c0a3a186966888e5787335a7628bf0a4382637e7
author: bgruening
date: Tue, 14 May 2019 18:17:57 -0400
parents: 2bbbac61e48d
children: 0b88494bdcac
comparison
equal
deleted
inserted
replaced
comparison of revision 17:2bbbac61e48d with revision 18:ec25331946b8
2 <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description> | 2 <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description> |
3 <macros> | 3 <macros> |
4 <import>main_macros.xml</import> | 4 <import>main_macros.xml</import> |
5 </macros> | 5 </macros> |
6 <expand macro="python_requirements"/> | 6 <expand macro="python_requirements"/> |
7 <!--TODO: Add imblearn package support--> | |
7 <expand macro="macro_stdio"/> | 8 <expand macro="macro_stdio"/> |
8 <version_command>echo "@VERSION@"</version_command> | 9 <version_command>echo "@VERSION@"</version_command> |
9 <command> | 10 <command> |
10 <![CDATA[ | 11 <![CDATA[ |
11 python "$feature_selection_script" '$inputs' | 12 python "$feature_selection_script" '$inputs' |
15 <inputs name="inputs" /> | 16 <inputs name="inputs" /> |
16 <configfile name="feature_selection_script"> | 17 <configfile name="feature_selection_script"> |
17 <![CDATA[ | 18 <![CDATA[ |
18 import json | 19 import json |
19 import sklearn.feature_selection | 20 import sklearn.feature_selection |
20 | 21 import skrebate |
21 with open('$__tool_directory__/sk_whitelist.json', 'r') as f: | 22 import pandas |
22 sk_whitelist = json.load(f) | 23 import sys |
23 exec(open('$__tool_directory__/utils.py').read(), globals()) | 24 import warnings |
25 import xgboost | |
26 from sklearn import ( | |
27 cluster, compose, decomposition, ensemble, feature_extraction, | |
28 feature_selection, gaussian_process, kernel_approximation, metrics, | |
29 model_selection, naive_bayes, neighbors, pipeline, preprocessing, | |
30 svm, linear_model, tree, discriminant_analysis) | |
31 from imblearn.pipeline import Pipeline as imbPipeline | |
32 from sklearn.pipeline import Pipeline | |
33 | |
34 sys.path.insert(0, '$__tool_directory__') | |
35 from utils import SafeEval, feature_selector, read_columns | |
24 | 36 |
25 warnings.simplefilter('ignore') | 37 warnings.simplefilter('ignore') |
26 | 38 |
27 safe_eval = SafeEval() | 39 safe_eval = SafeEval() |
28 | 40 |
29 input_json_path = sys.argv[1] | 41 input_json_path = sys.argv[1] |
30 with open(input_json_path, 'r') as param_handler: | 42 with open(input_json_path, 'r') as param_handler: |
31 params = json.load(param_handler) | 43 params = json.load(param_handler) |
32 | 44 |
33 #handle cheetah | 45 ## handle cheetah |
34 #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ | 46 #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ |
35 and $fs_algorithm_selector.model_inputter.input_mode == 'prefitted': | 47 and $fs_algorithm_selector.model_inputter.input_mode == 'prefitted': |
36 params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\ | 48 params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\ |
37 '$fs_algorithm_selector.model_inputter.fitted_estimator' | 49 '$fs_algorithm_selector.model_inputter.fitted_estimator' |
38 #end if | 50 #end if |
39 | 51 |
40 #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ | 52 #if $fs_algorithm_selector.selected_algorithm == 'SelectFromModel'\ |
41 and $fs_algorithm_selector.model_inputter.input_mode == 'new'\ | 53 and $fs_algorithm_selector.model_inputter.input_mode == 'new'\ |
42 and $fs_algorithm_selector.model_inputter.estimator_selector.selected_module == 'customer_estimator': | 54 and $fs_algorithm_selector.model_inputter.estimator_selector.selected_module == 'custom_estimator': |
43 params['fs_algorithm_selector']['model_inputter']['estimator_selector']['c_estimator'] =\ | 55 params['fs_algorithm_selector']['model_inputter']['estimator_selector']['c_estimator'] =\ |
44 '$fs_algorithm_selector.model_inputter.estimator_selector.c_estimator' | 56 '$fs_algorithm_selector.model_inputter.estimator_selector.c_estimator' |
45 #end if | 57 #end if |
46 | 58 |
47 #if $fs_algorithm_selector.selected_algorithm in ['RFE', 'RFECV']\ | 59 #if $fs_algorithm_selector.selected_algorithm in ['RFE', 'RFECV', 'DyRFECV']\ |
48 and $fs_algorithm_selector.estimator_selector.selected_module == 'customer_estimator': | 60 and $fs_algorithm_selector.estimator_selector.selected_module == 'custom_estimator': |
49 params['fs_algorithm_selector']['estimator_selector']['c_estimator'] =\ | 61 params['fs_algorithm_selector']['estimator_selector']['c_estimator'] =\ |
50 '$fs_algorithm_selector.estimator_selector.c_estimator' | 62 '$fs_algorithm_selector.estimator_selector.c_estimator' |
51 #end if | 63 #end if |
52 | 64 |
53 # Read features | 65 #if $fs_algorithm_selector.selected_algorithm in ['RFECV', 'DyRFECV']\ |
66 and $fs_algorithm_selector.options.cv_selector.selected_cv\ | |
67 in ['GroupKFold', 'GroupShuffleSplit', 'LeaveOneGroupOut', 'LeavePGroupsOut']: | |
68 params['fs_algorithm_selector']['options']['cv_selector']['groups_selector']['infile_g'] =\ | |
69 '$fs_algorithm_selector.options.cv_selector.groups_selector.infile_g' | |
70 #end if | |
71 | |
72 ## Read features | |
54 features_has_header = params['input_options']['header1'] | 73 features_has_header = params['input_options']['header1'] |
55 input_type = params['input_options']['selected_input'] | 74 input_type = params['input_options']['selected_input'] |
56 if input_type == 'tabular': | 75 if input_type == 'tabular': |
57 header = 'infer' if features_has_header else None | 76 header = 'infer' if features_has_header else None |
58 column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option'] | 77 column_option = params['input_options']['column_selector_options_1']['selected_column_selector_option'] |
65 c = c, | 84 c = c, |
66 c_option = column_option, | 85 c_option = column_option, |
67 return_df = True, | 86 return_df = True, |
68 sep='\t', | 87 sep='\t', |
69 header=header, | 88 header=header, |
70 parse_dates=True | 89 parse_dates=True) |
71 ) | 90 X = X.astype(float) |
72 else: | 91 else: |
73 X = mmread('$input_options.infile1') | 92 X = mmread('$input_options.infile1') |
74 | 93 |
75 # Read labels | 94 ## Read labels |
76 header = 'infer' if params['input_options']['header2'] else None | 95 header = 'infer' if params['input_options']['header2'] else None |
77 column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2'] | 96 column_option = params['input_options']['column_selector_options_2']['selected_column_selector_option2'] |
78 if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: | 97 if column_option in ['by_index_number', 'all_but_by_index_number', 'by_header_name', 'all_but_by_header_name']: |
79 c = params['input_options']['column_selector_options_2']['col2'] | 98 c = params['input_options']['column_selector_options_2']['col2'] |
80 else: | 99 else: |
83 '$input_options.infile2', | 102 '$input_options.infile2', |
84 c = c, | 103 c = c, |
85 c_option = column_option, | 104 c_option = column_option, |
86 sep='\t', | 105 sep='\t', |
87 header=header, | 106 header=header, |
88 parse_dates=True | 107 parse_dates=True) |
89 ) | 108 y = y.ravel() |
90 y=y.ravel() | 109 |
91 | 110 ## Create feature selector |
92 # Create feature selector | 111 new_selector = feature_selector(params['fs_algorithm_selector'], X=X, y=y) |
93 new_selector = feature_selector(params['fs_algorithm_selector']) | |
94 if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\ | 112 if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\ |
95 or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted' : | 113 or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted' : |
96 new_selector.fit(X, y) | 114 new_selector.fit(X, y) |
97 | 115 |
98 ## Transform to select features | 116 ## Transform to select features |
264 <param name="infile2" value="regression_train.tabular" ftype="tabular"/> | 282 <param name="infile2" value="regression_train.tabular" ftype="tabular"/> |
265 <param name="col2" value="1"/> | 283 <param name="col2" value="1"/> |
266 <param name="header2" value="false"/> | 284 <param name="header2" value="false"/> |
267 <output name="outfile" file="feature_selection_result12"/> | 285 <output name="outfile" file="feature_selection_result12"/> |
268 </test> | 286 </test> |
287 <test> | |
288 <param name="selected_algorithm" value="RFECV"/> | |
289 <param name="input_mode" value="new"/> | |
290 <param name="selected_module" value="ensemble"/> | |
291 <param name="selected_estimator" value="RandomForestRegressor"/> | |
292 <param name="text_params" value="n_estimators=10, random_state=10"/> | |
293 <section name="groups_selector"> | |
294 <param name="infile_groups" value="regression_y.tabular" ftype="tabular"/> | |
295 <param name="header_g" value="true"/> | |
296 <param name="selected_column_selector_option_g" value="by_index_number"/> | |
297 <param name="col_g" value="1"/> | |
298 </section> | |
299 <param name="selected_cv" value="GroupShuffleSplit"/> | |
300 <param name="random_state" value="0"/> | |
301 <param name="infile1" value="regression_X.tabular" ftype="tabular"/> | |
302 <param name="header1" value="true"/> | |
303 <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/> | |
304 <param name="infile2" value="regression_y.tabular" ftype="tabular"/> | |
305 <param name="col2" value="1"/> | |
306 <param name="header2" value="true"/> | |
307 <output name="outfile" file="feature_selection_result13"/> | |
308 </test> | |
269 </tests> | 309 </tests> |
270 <help> | 310 <help> |
271 <![CDATA[ | 311 <![CDATA[ |
272 **What it does** | 312 **What it does** |
273 This tool provides several loss, score, and utility functions to measure classification performance. Some metrics might require probability estimates of the positive class, confidence values, or binary decisions values. This tool is based on | 313 This tool provides several loss, score, and utility functions to measure classification performance. Some metrics might require probability estimates of the positive class, confidence values, or binary decisions values. This tool is based on |