annotate secimtools/random_forest.xml @ 0:b54326490b4d draft

Upload 21.3.4.2 release
author malex
date Mon, 08 Mar 2021 20:55:03 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
1 <tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
2 <description>algorithm to select features.</description>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
3 <macros>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
4 <import>macros.xml</import>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
5 </macros>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
6 <expand macro="requirements" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
7 <command detect_errors="exit_code"><![CDATA[
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
8 random_forest.py
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
9 --input $input
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
10 --design $design
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
11 --ID $uniqID
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
12 --group $group
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
13 --snum $number_of_estimators
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
14 --num $number_of_factors
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
15 --out $outfile1
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
16 --out2 $outfile2
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
17 --figure $figure
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
18 ]]></command>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
19 <inputs>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
20 <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
21 <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
22 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
23 <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
24 <param name="number_of_estimators" type="integer" size="30" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
25 <param name="number_of_factors" type="integer" size="30" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
26 </inputs>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
27 <outputs>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
28 <data format="csv" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
29 <data format="csv" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
30 <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
31 </outputs>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
32 <tests>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
33 <test>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
34 <param name="input" value="ST000006_data.tsv"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
35 <param name="design" value="ST000006_design.tsv"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
36 <param name="uniqID" value="Retention_Index" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
37 <param name="group" value="White_wine_type_and_source" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
38 <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
39 <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
40 <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" />
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
41 </test>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
42 </tests>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
43 <help><![CDATA[
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
44
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
45 @TIP_AND_WARNING@
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
46
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
47 **Tool Description**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
48
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
49 The tool identifies features that are different between treatment groups based on the random forest algorithm.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
50 Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
51 More details about the algorithm can be found in the article:
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
52
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
53 Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
54
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
55 **NOTE: The use of machine learning algorithms (including random forest) on datasets with a small number of samples is ambiguous and should be executed with caution.**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
56
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
57 --------------------------------------------------------------------------------------------------------------
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
58
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
59 **Input**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
60
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
61 - Two input datasets are required.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
62
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
63 @WIDE@
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
64
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
65 **NOTE:** The sample IDs must match the sample IDs in the Design File (below).
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
66 Extra columns will automatically be ignored.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
67
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
68
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
69 @METADATA@
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
70
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
71 @UNIQID@
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
72
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
73 @GROUP@
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
74
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
75 **Number of Trees in the Forest**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
76
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
77 - Run a minimum of 1000 trees.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
78
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
79 **Number of factors to plot**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
80
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
81 - Plots the 20 most important factors.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
82
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
83 --------------------------------------------------------------------------------
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
84
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
85 **Output**
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
86
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
87 This tool will always output three different files:
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
88
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
89 (1) a TSV file with features ranked according to their relative importance
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
90
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
91 (2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
92
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
93 (3) and a PDF file with a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
94
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
95 **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
96
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
97 A plot of the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
98
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
99 **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
100
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
101 ]]></help>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
102 <expand macro="citations"/>
b54326490b4d Upload 21.3.4.2 release
malex
parents:
diff changeset
103 </tool>