diff random_forest.xml @ 1:2e7d47c0b027 draft

"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author malex
date Mon, 08 Mar 2021 22:04:06 +0000
parents
children caba07f41453
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_forest.xml	Mon Mar 08 22:04:06 2021 +0000
@@ -0,0 +1,103 @@
+<tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
+    <description>algorithm to select features.</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <command detect_errors="exit_code"><![CDATA[
+random_forest.py
+--input $input
+--design $design
+--ID $uniqID
+--group $group
+--snum $number_of_estimators
+--num $number_of_factors
+--out $outfile1
+--out2 $outfile2
+--figure $figure
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
+        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
+        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
+        <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
+        <param name="number_of_estimators" type="integer" size="30" value="1000" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
+        <param name="number_of_factors" type="integer" size="30" value="20" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
+    </inputs>
+    <outputs>
+        <data format="csv" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
+        <data format="csv" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
+        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input"  value="ST000006_data.tsv"/>
+            <param name="design" value="ST000006_design.tsv"/>
+            <param name="uniqID" value="Retention_Index" />
+            <param name="group"  value="White_wine_type_and_source" />
+            <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" />
+            <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" />
+            <output name="figure"   file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+@TIP_AND_WARNING@
+
+**Tool Description**
+
+The tool identifies features that are different between treatment groups based on the random forest algorithm.
+Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
+More details about the algorithm can be found in the book:
+
+Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
+
+**NOTE: The use of machine learning algorithms (including random forest) on datasets with a small number of samples is ambiguous and should be executed with caution.**
+
+--------------------------------------------------------------------------------------------------------------
+
+**Input**
+
+    - Two input datasets are required.
+
+@WIDE@
+
+**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
+Extra columns will automatically be ignored.
+
+
+@METADATA@
+
+@UNIQID@
+
+@GROUP@
+
+**Number of Trees in the Forest**
+
+    - Run a minimum of 1000 trees.
+
+**Number of factors to plot**
+
+    - Plots the 20 most important factors.
+
+--------------------------------------------------------------------------------
+
+**Output**
+
+This tool will always output three different files:
+
+(1) a TSV file with features ranked according to their relative importance
+
+(2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance
+
+(3) and a PDF file a variable importance plot for the first 50 components. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).
+
+    **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 2D tools.
+
+A plot of two (or three) most important features is recommended since they are probably the most meaningful, but other features can be also considered for plotting.
+
+    **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>