Mercurial > repos > malex > secimtools
diff random_forest.xml @ 1:2e7d47c0b027 draft
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author | malex |
---|---|
date | Mon, 08 Mar 2021 22:04:06 +0000 |
parents | |
children | caba07f41453 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/random_forest.xml Mon Mar 08 22:04:06 2021 +0000
@@ -0,0 +1,107 @@
+<tool id="secimtools_random_forest" name="Random Forest (RF)" version="@WRAPPER_VERSION@">
+    <!-- Galaxy tool wrapper that runs random_forest.py on a wide dataset plus a design file and collects two tables and a PDF figure. -->
+    <description>algorithm to select features.</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements" />
+    <!-- Every dataset path and free-text value is single-quoted so that a column name containing spaces reaches the script as one argument. -->
+    <command detect_errors="exit_code"><![CDATA[
+random_forest.py
+--input '$input'
+--design '$design'
+--ID '$uniqID'
+--group '$group'
+--snum $number_of_estimators
+--num $number_of_factors
+--out '$outfile1'
+--out2 '$outfile2'
+--figure '$figure'
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="tabular" label="Wide Dataset" help="Input your tab-separated wide format dataset. If file is not tab separated see TIP below."/>
+        <param name="design" type="data" format="tabular" label="Design File" help="Input your design file (tab-separated). Note you need a 'sampleID' column. If not tab separated see TIP below."/>
+        <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your wide dataset that has unique identifiers."/>
+        <param name="group" type="text" size="30" value="" label="Group/Treatment" help="Name of the column in your design file that contains group classifications."/>
+        <param name="number_of_estimators" type="integer" size="30" value="1000" min="1" label="Number of trees in the forest" help="Recommend a minimum of 1000 trees."/>
+        <param name="number_of_factors" type="integer" size="30" value="20" min="1" label="Number of factors to plot" help="Plots the (Default = 20) most important factors."/>
+    </inputs>
+    <!-- NOTE(review): both tables are declared as csv, while the help text and the test fixtures describe them as TSV; confirm which delimiter random_forest.py writes before changing the format. -->
+    <outputs>
+        <data format="csv" name="outfile1" label="${tool.name} on ${on_string}: Transformed Data"/>
+        <data format="csv" name="outfile2" label="${tool.name} on ${on_string}: Importance Factors"/>
+        <data format="pdf" name="figure" label="${tool.name} on ${on_string}: Variable Importance Plot"/>
+    </outputs>
+    <tests>
+        <!-- Tree and factor counts are left at their defaults; outputs are compared by file size only (delta of 10000). -->
+        <test>
+            <param name="input" value="ST000006_data.tsv"/>
+            <param name="design" value="ST000006_design.tsv"/>
+            <param name="uniqID" value="Retention_Index" />
+            <param name="group" value="White_wine_type_and_source" />
+            <output name="outfile1" file="ST000006_random_forest_out.tsv" compare="sim_size" delta="10000" />
+            <output name="outfile2" file="ST000006_random_forest_out2.tsv" compare="sim_size" delta="10000" />
+            <output name="figure" file="ST000006_random_forest_figure.pdf" compare="sim_size" delta="10000" />
+        </test>
+    </tests>
+    <help><![CDATA[
+
+@TIP_AND_WARNING@
+
+**Tool Description**
+
+The tool identifies features that are different between treatment groups based on the random forest algorithm.
+Based on Classification and Regression Trees (CART), random forests are an ensemble learning method for classification, regression and variable importance evaluation.
+More details about the algorithm can be found in the article:
+
+Breiman, L. (2001). Random forests. Machine learning, 45(1), 5-32.
+
+**NOTE: The use of machine learning algorithms (including random forest) on datasets with a small number of samples is ambiguous and should be executed with caution.**
+
+--------------------------------------------------------------------------------------------------------------
+
+**Input**
+
+    - Two input datasets are required.
+
+@WIDE@
+
+**NOTE:** The sample IDs must match the sample IDs in the Design File (below).
+Extra columns will automatically be ignored.
+
+
+@METADATA@
+
+@UNIQID@
+
+@GROUP@
+
+**Number of Trees in the Forest**
+
+    - Run a minimum of 1000 trees.
+
+**Number of factors to plot**
+
+    - Plots the X (Default = 20) most important factors.
+
+--------------------------------------------------------------------------------
+
+**Output**
+
+This tool will always output three different files:
+
+(1) a TSV file with features ranked according to their relative importance
+
+(2) a TSV file where ranked features from the wide format dataset are saved in columns in the order that corresponds to their relative importance
+
+(3) and a PDF file with a variable importance plot. The variable importance plot displays the X (Default = 20) most important features based on the random forest algorithm. The color of each feature changes from the most important (dark blue) to the least important (light blue).
+
+    **NOTE:** The user can take the resulting TSV file and plot any two (or three) features using the Scatter Plot 2D or Scatter Plot 3D tools.
+
+A plot of two (or three) most important features is recommended since they are probably the most meaningful, but other features can also be considered for plotting.
+
+    **To plot the 2 most important features**: use the SECIM Tools 'Scatter Plot 2D' tool on the transformed dataset to plot the features against each other and evaluate separation levels.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>