<tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
<!-- Galaxy wrapper around random_forest.py. The @...@ tokens and the
     "requirements" / "citations" macros are expected to be supplied by macros.xml. -->
<description>algorithm to select features.</description>
<macros>
    <import>macros.xml</import>
</macros>
<expand macro="requirements" />
<!-- Single invocation of random_forest.py. Dataset paths and free-text column
     names are single-quoted so that spaces or shell metacharacters in a value
     cannot split or alter the command line. -->
<command detect_errors="exit_code"><![CDATA[
    random_forest.py
        --input '$input'
        --design '$design'
        --ID '$uniqID'
        --group '$group'
        --snum $number_of_estimators
        --num $number_of_factors
        --out '$outfile1'
        --out2 '$outfile2'
        --figure '$figure'
]]></command>
<!-- Tool form. The parameter names below are the ones referenced by the
     command template and by the functional test. -->
<inputs>
    <param name="input" type="data" format="tabular" label="Wide Dataset"
           help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
    <param name="design" type="data" format="tabular" label="Design File"
           help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
    <!-- Both column names start out empty, so a blank value is rejected in the
         form instead of being passed to the script as an empty argument. -->
    <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID"
           help="Name of the column in your wide dataset that has unique identifiers.">
        <validator type="empty_field" message="Enter the name of the unique feature ID column."/>
    </param>
    <param name="group" type="text" size="30" value="" label="Group/Treatment"
           help="Name of the column in your design file that contains group classifications.">
        <validator type="empty_field" message="Enter the name of the group column."/>
    </param>
    <!-- A forest needs at least one tree and a plot at least one factor;
         min="1" blocks zero and negative values in the form. -->
    <param name="number_of_estimators" type="integer" size="30" min="1" value="1000"
           label="Number of trees in the forest"
           help="Recommend a minimum of 1000 trees."/>
    <param name="number_of_factors" type="integer" size="30" min="1" value="20"
           label="Number of factors to plot"
           help="Plots the (Default = 20) most important factors."/>
</inputs>
<!-- Result datasets; each name matches a $variable used in the command template.
     NOTE(review): the two tables are declared as csv, while the help text and
     the test fixtures (*.tsv) describe them as TSV. Confirm what
     random_forest.py writes before changing either side. -->
<outputs>
    <data name="outfile1"
          format="csv"
          label="${tool.name} on ${on_string}: Transformed Data"/>
    <data name="outfile2"
          format="csv"
          label="${tool.name} on ${on_string}: Importance Factors"/>
    <data name="figure"
          format="pdf"
          label="${tool.name} on ${on_string}: Variable Importance Plot"/>
</outputs>
<!-- One functional test on the ST000006 fixtures. Only the two datasets and the
     two column names are set; the tree count and factor count are left at the
     values declared in the form. Every output is compared by file size only
     (sim_size with delta 10000), not by content. -->
<tests>
    <test>
        <param name="input" value="ST000006_data.tsv"/>
        <param name="design" value="ST000006_design.tsv"/>
        <param name="uniqID" value="Retention_Index"/>
        <param name="group" value="White_wine_type_and_source"/>
        <output name="outfile1"
                file="ST000006_random_forest_out.tsv"
                compare="sim_size"
                delta="10000"/>
        <output name="outfile2"
                file="ST000006_random_forest_out2.tsv"
                compare="sim_size"
                delta="10000"/>
        <output name="figure"
                file="ST000006_random_forest_figure.pdf"
                compare="sim_size"
                delta="10000"/>
    </test>
</tests>
<help><![CDATA[

@TIP_AND_WARNING@

**Tool Description**

The tool identifies features that are different between treatment groups based on the random forest algorithm.
Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
More details about the algorithm can be found in the article:

Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.

**NOTE: Applying machine learning algorithms (including random forest) to datasets with a small number of samples can give unreliable results and should be done with caution.**

--------------------------------------------------------------------------------------------------------------

**Input**

- Two input datasets are required.

@WIDE@

**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
Extra columns will automatically be ignored.


@METADATA@

@UNIQID@

@GROUP@

**Number of Trees in the Forest**

- A minimum of 1000 trees is recommended (Default = 1000).

**Number of factors to plot**

- Number of most important factors to show in the plot (Default = 20).

--------------------------------------------------------------------------------

**Output**

This tool will always output three different files:

(1) a TSV file with features ranked according to their relative importance

(2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance

(3) a PDF file with a variable importance plot. The plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).

**NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.

A plot of the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.

**To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.

]]></help>
<!-- Citation entries are pulled in from the shared macro of the same name. -->
<expand macro="citations"/>
</tool>