Mercurial > repos > bgruening > sklearn_feature_selection
view feature_selection.xml @ 15:026667802750 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 2a058459e6daf0486871f93845f00fdb4a4eaca1
author | bgruening |
---|---|
date | Sat, 29 Sep 2018 07:30:44 -0400 |
parents | dc411a215138 |
children | 2bbbac61e48d |
line wrap: on
line source
<tool id="sklearn_feature_selection" name="Feature Selection" version="@VERSION@.1">
    <!--
        Galaxy wrapper around scikit-learn feature selection.
        The embedded script reads the tool parameters from the JSON file that
        Galaxy generates, builds a selector through the shared helpers loaded
        from utils.py, fits it unless a prefitted SelectFromModel estimator
        was supplied, and writes the result as a tab-separated file.
    -->
    <description>module, including univariate filter selection methods and recursive feature elimination algorithm</description>
    <macros>
        <import>main_macros.xml</import>
    </macros>
    <expand macro="python_requirements"/>
    <expand macro="macro_stdio"/>
    <version_command>echo "@VERSION@"</version_command>
    <command>
        <![CDATA[
        python "$feature_selection_script" '$inputs'
        ]]>
    </command>
    <configfiles>
        <!-- JSON dump of all tool parameters; passed to the script as argv[1]. -->
        <inputs name="inputs" />
        <configfile name="feature_selection_script">
            <![CDATA[
import sys
import os
import json
import pandas
import sklearn.feature_selection

with open("$__tool_directory__/sk_whitelist.json", "r") as f:
    sk_whitelist = json.load(f)

# Load the shared helpers into this namespace. The names SafeEval,
# read_columns, mmread and feature_selector used below are not defined in this
# script, so they are presumably provided by utils.py (confirm against that
# file). The handle is closed explicitly instead of being left to the garbage
# collector.
with open("$__tool_directory__/utils.py", "r") as utils_handler:
    exec(utils_handler.read(), globals())

safe_eval = SafeEval()

input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

# Handle cheetah: for a prefitted SelectFromModel estimator, overwrite the
# JSON value with the value substituted by the template engine.
#if $fs_algorithm_selector.selected_algorithm == "SelectFromModel"\
        and $fs_algorithm_selector.model_inputter.input_mode == "prefitted":
params['fs_algorithm_selector']['model_inputter']['fitted_estimator'] =\
        "$fs_algorithm_selector.model_inputter.fitted_estimator"
#end if

# Read features
# Only the tabular branch produces a DataFrame, so bind a placeholder first;
# the column-name lookup further down must never see an undefined name.
input_df = None
# The sparse branch may not carry a header flag at all (not verifiable from
# this file), so fall back to "no header" instead of failing on a missing key.
features_has_header = params["input_options"].get("header1", False)
input_type = params["input_options"]["selected_input"]
if input_type == "tabular":
    header = 'infer' if features_has_header else None
    column_option = params["input_options"]["column_selector_options_1"]["selected_column_selector_option"]
    if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
        c = params["input_options"]["column_selector_options_1"]["col1"]
    else:
        c = None
    X, input_df = read_columns(
            "$input_options.infile1",
            c = c,
            c_option = column_option,
            return_df = True,
            sep='\t',
            header=header,
            parse_dates=True
    )
else:
    X = mmread("$input_options.infile1")

# Read labels
header = 'infer' if params["input_options"]["header2"] else None
column_option = params["input_options"]["column_selector_options_2"]["selected_column_selector_option2"]
if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]:
    c = params["input_options"]["column_selector_options_2"]["col2"]
else:
    c = None
y = read_columns(
        "$input_options.infile2",
        c = c,
        c_option = column_option,
        sep='\t',
        header=header,
        parse_dates=True
)
y = y.ravel()

# Create feature selector
new_selector = feature_selector(params['fs_algorithm_selector'])
# Fitting is skipped only for SelectFromModel in prefitted mode.
if params['fs_algorithm_selector']['selected_algorithm'] != 'SelectFromModel'\
        or params['fs_algorithm_selector']['model_inputter']['input_mode'] != 'prefitted' :
    new_selector.fit(X, y)

## Transform to select features
selected_names = None
if "$output_method_selector.selected_method" == "fit_transform":
    res = new_selector.transform(X)
    # Column names can only be carried over when a DataFrame with a header
    # row was actually read.
    if features_has_header and input_df is not None:
        selected_names = input_df.columns[new_selector.get_support(indices=True)]
else:
    res = new_selector.get_support(params["output_method_selector"]["indices"])

res = pandas.DataFrame(res, columns = selected_names)
res.to_csv(path_or_buf="$outfile", sep='\t', index=False)
            ]]>
        </configfile>
    </configfiles>
    <inputs>
        <expand macro="feature_selection_all">
            <expand macro="fs_selectfrommodel_prefitted"/>
        </expand>
        <!-- The macro name below is kept exactly as written, spelling included:
             it has to match the definition in the imported macros file. -->
        <expand macro="feature_selection_output_mothods" />
        <expand macro="sl_mixed_input"/>
    </inputs>
    <outputs>
        <data format="tabular" name="outfile"/>
    </outputs>
    <tests>
        <test>
            <param name="selected_algorithm" value="SelectFromModel"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result01"/>
        </test>
        <test>
            <param name="selected_algorithm" value="GenericUnivariateSelect"/>
            <param name="param" value="20"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result02"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectPercentile"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result03"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectKBest"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result04"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFpr"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result05"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFdr"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result06"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFwe"/>
            <param name="alpha" value="0.05"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result07"/>
        </test>
        <test>
            <param name="selected_algorithm" value="RFE"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result08"/>
        </test>
        <test>
            <param name="selected_algorithm" value="RFECV"/>
            <param name="input_mode" value="new"/>
            <param name="selected_module" value="ensemble"/>
            <param name="selected_estimator" value="RandomForestRegressor"/>
            <param name="text_params" value="n_estimators=10, random_state=10"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="6"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result09"/>
        </test>
        <test>
            <param name="selected_algorithm" value="VarianceThreshold"/>
            <param name="threshold" value="0.1"/>
            <param name="infile1" value="regression_X.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="col1" value="1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17"/>
            <param name="infile2" value="regression_y.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="True"/>
            <output name="outfile" file="feature_selection_result10"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectKBest"/>
            <param name="k" value="3"/>
            <param name="infile1" value="test3.tabular" ftype="tabular"/>
            <param name="header1" value="True"/>
            <param name="selected_column_selector_option" value="all_but_by_header_name"/>
            <param name="col1" value="target"/>
            <param name="infile2" value="test3.tabular" ftype="tabular"/>
            <param name="header2" value="True"/>
            <param name="selected_column_selector_option2" value="by_header_name"/>
            <param name="col2" value="target"/>
            <output name="outfile" file="feature_selection_result11"/>
        </test>
        <test>
            <param name="selected_algorithm" value="SelectFromModel"/>
            <param name="input_mode" value="prefitted"/>
            <param name="fitted_estimator" value="rfr_model01" ftype="zip"/>
            <param name="infile1" value="regression_train.tabular" ftype="tabular"/>
            <param name="header1" value="false"/>
            <param name="col1" value="1,2,3,4,5"/>
            <param name="infile2" value="regression_train.tabular" ftype="tabular"/>
            <param name="col2" value="1"/>
            <param name="header2" value="false"/>
            <output name="outfile" file="feature_selection_result12"/>
        </test>
    </tests>
    <help>
        <![CDATA[
**What it does**
This tool provides feature selection methods from the scikit-learn ``sklearn.feature_selection`` module, including univariate filter selection methods and the recursive feature elimination algorithm.

A selector is fitted on the input features and labels; for SelectFromModel a prefitted estimator can be supplied instead, in which case no fitting is performed. Depending on the chosen output method, the result is either the input data reduced to the selected features or the selection mask / indices of the selected features.

For information about the feature selection methods and their parameter settings please refer to `Scikit-learn feature selection`_.

.. _`Scikit-learn feature selection`: http://scikit-learn.org/stable/modules/feature_selection.html
        ]]>
    </help>
    <expand macro="sklearn_citation">
        <expand macro="skrebate_citation"/>
        <expand macro="xgboost_citation"/>
    </expand>
</tool>