Mercurial > repos > malex > secimtools
annotate random_forest.xml @ 1:2e7d47c0b027 draft
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author | malex |
---|---|
date | Mon, 08 Mar 2021 22:04:06 +0000 |
parents | |
children | caba07f41453 |
rev | line source |
---|---|
1
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
1 <tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@"> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
2 <description>algorithm to select features.</description> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
3 <macros> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
4 <import>macros.xml</import> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
5 </macros> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
6 <expand macro="requirements" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
7 <command detect_errors="exit_code"><![CDATA[ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
8 random_forest.py |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
9 --input '$input' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
10 --design '$design' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
11 --ID '$uniqID' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
12 --group '$group' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
13 --snum $number_of_estimators |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
14 --num $number_of_factors |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
15 --out '$outfile1' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
16 --out2 '$outfile2' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
17 --figure '$figure' |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
18 ]]></command> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
19 <inputs> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
20 <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
21 <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
22 <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
23 <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
24 <param name="number_of_estimators" type="integer" size="30" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
25 <param name="number_of_factors" type="integer" size="30" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
26 </inputs> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
27 <outputs> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
28 <data format="tabular" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
29 <data format="tabular" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
30 <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
31 </outputs> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
32 <tests> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
33 <test> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
34 <param name="input" value="ST000006_data.tsv"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
35 <param name="design" value="ST000006_design.tsv"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
36 <param name="uniqID" value="Retention_Index" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
37 <param name="group" value="White_wine_type_and_source" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
38 <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
39 <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
40 <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" /> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
41 </test> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
42 </tests> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
43 <help><![CDATA[ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
44 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
45 @TIP_AND_WARNING@ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
46 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
47 **Tool Description** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
48 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
49 The tool identifies features that are different between treatment groups based on the random forest algorithm. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
50 Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
51 More details about the algorithm can be found in the following paper: |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
52 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
53 Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
54 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
55 **NOTE: Results from machine learning algorithms (including random forest) on datasets with a small number of samples can be ambiguous and should be interpreted with caution.** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
56 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
57 -------------------------------------------------------------------------------------------------------------- |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
58 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
59 **Input** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
60 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
61 - Two input datasets are required. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
62 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
63 @WIDE@ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
64 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
65 **NOTE:** The sample IDs must match the sample IDs in the Design File (below). |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
66 Extra columns will automatically be ignored. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
67 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
68 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
69 @METADATA@ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
70 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
71 @UNIQID@ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
72 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
73 @GROUP@ |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
74 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
75 **Number of Trees in the Forest** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
76 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
77 - Run a minimum of 1000 trees. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
78 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
79 **Number of factors to plot** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
80 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
81 - Number of most important factors to include in the plot (Default = 20). |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
82 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
83 -------------------------------------------------------------------------------- |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
84 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
85 **Output** |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
86 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
87 This tool will always output three different files: |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
88 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
89 (1) a TSV file with features ranked according to their relative importance |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
90 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
91 (2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
92 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
93 (3) and a PDF file containing a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue). |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
94 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
95 **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
96 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
97 A plot of the two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
98 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
99 **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels. |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
100 |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
101 ]]></help> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
102 <expand macro="citations"/> |
2e7d47c0b027
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
malex
parents:
diff
changeset
|
103 </tool> |