comparison random_forest.xml @ 1:2e7d47c0b027 draft

"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author malex
date Mon, 08 Mar 2021 22:04:06 +0000
parents
children caba07f41453
comparison
equal deleted inserted replaced
0:b54326490b4d 1:2e7d47c0b027
1 <tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
2 <description>algorithm to select features.</description>
3 <macros>
4 <import>macros.xml</import>
5 </macros>
6 <expand macro="requirements" />
7 <command detect_errors="exit_code"><![CDATA[
8 random_forest.py
9 --input '$input'
10 --design '$design'
11 --ID '$uniqID'
12 --group '$group'
13 --snum $number_of_estimators
14 --num $number_of_factors
15 --out '$outfile1'
16 --out2 '$outfile2'
17 --figure '$figure'
18 ]]></command> <!-- Dataset paths and free-text column names are single-quoted so a value containing spaces is passed as one shell argument; the two integer parameters need no quoting. -->
19 <inputs> <!-- Two tabular datasets, two mandatory column names (rejected when left empty), and two integer settings restricted to positive values. -->
20 <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
21 <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
22 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."><validator type="empty_field" message="Enter the name of the unique feature ID column."/></param>
23 <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."><validator type="empty_field" message="Enter the name of the group/treatment column."/></param>
24 <param name="number_of_estimators" type="integer" size="30" min="1" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
25 <param name="number_of_factors" type="integer" size="30" min="1" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
26 </inputs>
27 <outputs> <!-- Both tables are typed as tabular to match the TSV outputs described in the help text, so they can feed tools that take tabular input. NOTE(review): confirm random_forest.py writes tab-separated files. -->
28 <data format="tabular" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
29 <data format="tabular" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
30 <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
31 </outputs>
32 <tests>
33 <test> <!-- Single run on the ST000006 fixtures; number_of_estimators and number_of_factors are not set here, so their declared defaults (1000 and 20) apply. -->
34 <param name="input" value="ST000006_data.tsv"/>
35 <param name="design" value="ST000006_design.tsv"/>
36 <param name="uniqID" value="Retention_Index" />
37 <param name="group" value="White_wine_type_and_source" />
38 <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" /> <!-- sim_size: all three outputs are checked by file size within the given delta, not by content; presumably because results vary between runs (TODO confirm). -->
39 <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" />
40 <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" />
41 </test>
42 </tests>
43 <help><![CDATA[
44
45 @TIP_AND_WARNING@
46
47 **Tool Description**
48
49 The tool identifies features that are different between treatment groups based on the random forest algorithm.
50 Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
51 More details about the algorithm can be found in the original article:
52
53 Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
54
55 **NOTE: Machine learning algorithms (including random forest) can give unreliable results on datasets with a small number of samples and should be used with caution in that setting.**
56
57 --------------------------------------------------------------------------------------------------------------
58
59 **Input**
60
61 - Two input datasets are required.
62
63 @WIDE@
64
65 **NOTE:** The sample IDs must match the sample IDs in the Design File (below).
66 Extra columns will automatically be ignored.
67
68
69 @METADATA@
70
71 @UNIQID@
72
73 @GROUP@
74
75 **Number of Trees in the Forest**
76
77 - Run a minimum of 1000 trees.
78
79 **Number of factors to plot**
80
81 - Number of most important factors to show in the variable importance plot (Default = 20).
82
83 --------------------------------------------------------------------------------
84
85 **Output**
86
87 This tool will always output three different files:
88
89 (1) a TSV file with features ranked according to their relative importance
90
91 (2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance
92
93 (3) and a PDF file containing a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).
94
95 **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.
96
97 A plot of the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.
98
99 **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.
100
101 ]]></help>
102 <expand macro="citations"/>
103 </tool>