Mercurial > repos > malex > secimtools
diff svm_classifier.xml @ 1:2e7d47c0b027 draft
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author | malex |
---|---|
date | Mon, 08 Mar 2021 22:04:06 +0000 |
parents | |
children | caba07f41453 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/svm_classifier.xml Mon Mar 08 22:04:06 2021 +0000 @@ -0,0 +1,198 @@ +<tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@"> + <description>- Predict sample groups.</description> + <macros> + <import>macros.xml</import> + </macros> + <expand macro="requirements" /> <!-- The command below single-quotes every substituted value so that an empty or unusual text value still reaches svm_classifier.py as exactly one argument. --> + <command><![CDATA[ +svm_classifier.py +--train_wide '$train_wide' +--train_design '$train_design' +--test_wide '$test_wide' +--test_design '$test_design' +--group '$group' +--ID '$uniqID' +--kernel '$kernel' +--degree '$degree' +--C '$C' +--cross_validation '$cross_validation' +--C_lower_bound '$C_lower_bound' +--C_upper_bound '$C_upper_bound' +--a '$a' +--b '$b' +--outClassification '$outClassification' +--outClassificationAccuracy '$outClassificationAccuracy' +--outPrediction '$outPrediction' +--outPredictionAccuracy '$outPredictionAccuracy' + ]]></command> + <inputs> <!-- Four tabular datasets (training and target, each as wide data plus design file) followed by the SVM settings. --> + <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/> + <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/> + <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/> + <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? 
See TIP below."/> + <param name="group" size="30" type="text" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contain group classifications."/> + <param name="uniqID" type="text" size="30" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contain unique identifiers."/> + <param name="kernel" type="select" size="30" display="radio" value="rbf" label="Select a SVM Kernel Function"> <!-- The selected flag on the rbf option mirrors value="rbf" so the default is stated the way select options declare it. --> + <option value="rbf" selected="true">Radial Basis function (Gaussian)</option> + <option value="linear">Linear</option> + <option value="poly">Polynomial</option> + <option value="sigmoid">Sigmoid</option> + </param> + <param name="degree" size="30" type="text" value="3" label="Polynomial Degree" help='Only used for the polynomial kernel.'/> + <param name="cross_validation" type="select" size="30" display="radio" value="double" label="Select Cross-Validation"> <!-- NOTE(review): select defaults come from the option flagged selected; without the flag the first option (none) would presumably be preselected instead of the intended double. --> + <option value="none">None</option> + <option value="single">Single</option> + <option value="double" selected="true">Double</option> + </param> + <param name="C" size="30" type="text" value="1" label="Regularization Parameter C" help='See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1.'/> + <param name="C_lower_bound" size="30" type="text" value="0.1" label="Regularization Parameter C (Lower Bound)" help='Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0) Default = 0.1. '/> + <param name="C_upper_bound" size="30" type="text" value="10" label="Regularization Parameter C (Upper Bound)" help='Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10. '/> + <param name="a" size="30" type="text" value="0.0" label="Coefficient A" help='Used in the kernel functions above. 
Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0.'/> + <param name="b" size="30" type="text" value="0.0" label="Coefficient B" help='Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0.'/> + </inputs> + <outputs> <!-- The first two outputs describe the Training data; the last two describe the Target data. --> + <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/> + <data name="outClassificationAccuracy" format='tabular' label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/> + <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction of the Target Data Set"/> + <data name="outPredictionAccuracy" format='tabular' label="${tool.name} on ${on_string}: Prediction Accuracy of the Target Data Set"/> + </outputs> + <tests> <!-- Single test: linear kernel, no cross-validation, with the same ST000006 files supplied as both training and target input. --> + <test> + <param name="train_wide" value="ST000006_data.tsv"/> + <param name="train_design" value="ST000006_design.tsv"/> + <param name="test_wide" value="ST000006_data.tsv"/> + <param name="test_design" value="ST000006_design.tsv"/> + <param name="group" value="White_wine_type_and_source" /> + <param name="uniqID" value="Retention_Index" /> + <param name="kernel" value="linear"/> + <param name="degree" value="3"/> + <param name="cross_validation" value="none"/> + <param name="C" value="1"/> + <param name="C_lower_bound" value="0.1"/> + <param name="C_upper_bound" value="2"/> + <param name="a" value="1"/> + <param name="b" value="1"/> + <output name="outClassification" file="ST000006_svm_classifier_train_classification.tsv" /> + <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv" /> + <output name="outPrediction" file="ST000006_svm_classifier_target_classification.tsv" /> + <output name="outPredictionAccuracy" file="ST000006_svm_classifier_target_classification_accuracy.tsv" /> + </test> + 
</tests> + <help><![CDATA[ + +**TIP:** +If your data is not TAB delimited, use *Text Manipulation->Convert*. + +**WARNINGS:** + - (1) This script automatically removes spaces and special characters from strings. + - (2) If a feature name starts with a number it will prepend an '_'. + +**Tool Description** + +**NOTE: A minimum of 100 samples is required by the tool for single or double cross validation** + +Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset. +The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model. +SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets. + +SVM uses different kernel functions to carry out different types of classification such as radial basis (Gaussian), linear, polynomial, and sigmoid. +The classification model can be trained with and without cross-validation (single or double). + +For single and double cross-validation: the training dataset is split differently when the model fit is performed. + +In single cross-validation: the same data are used to both fit and evaluate the model. + +In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces. + +Without cross-validation, the user specifies Regularization Parameter C directly; under cross-validation, the user specifies the Lower and Upper bounds of Regularization Parameter C. +For more information about Regularization Parameter C, see references below: + +Cortes, C. and Vapnik, V. 1995. Support-vector networks. Machine Learning. 20(3) 273-297. + +Steinwart, I. and Christmann, A. 2008. Support vector machines. Springer Science and Business Media. 
+ + +To use the SVM tool, users need the following information: + +(i) a Training Dataset with known categories in the training design file and +(ii) a Target Dataset with predicted categories in the target design file. +(iii) the name of the Group/Treatment classification column should be the same for both design files. +(iv) the Unique Feature IDs should be the same in both the wide datasets. +(v) the number of Unique Feature IDs should be the same in both the wide datasets. + +------------------------------------------------------------------------------ + +**Input** + + - Four input datasets are required. + +@WIDE@ + +**NOTE:** The sample IDs must match the sample IDs in the Design File +(below). Extra columns will automatically be ignored. + +@METADATA@ + +@GROUP@ + +@UNIQID@ + +**SVM Kernel Function** + + - Kernel functions available for the SVM algorithm. + +**Polynomial Degree** + + - Only used for the polynomial kernel. + +**Cross-Validation Choice** + + - Cross-validation options available for the user. 'None' corresponds to no cross-validation: the user specifies regularization parameter C manually. + + +**Regularization Parameter C** + + - Penalizes potential overfitting, must be positive. + + +**Regularization Parameter C (Lower Bound)** + + - Lower bound for regularization parameter C. Value must be greater than 0. Used only if cross-validation is selected. + + +**Regularization Parameter C (Upper Bound)** + + - Upper bound for regularization parameter C. Value must be greater than the Lower Bound. + + +**Coefficient A** + + - Used in the kernel functions above. Must be greater than zero. Default = 0, however, + this translates to a = 1/n_features, where n_features is the number of features. + +**Coefficient B** + + - Independent term in the kernel function. It is only significant in + polynomial and sigmoid kernels. 
+ +------------------------------------------------------------------------------ + +**Output** + +This tool will output two files for the Training dataset and two for the Target dataset: + +Training: + +(1) a TSV file containing the observed and predicted grouping classifications for each sample and +(2) a TSV file containing the accuracy (percentage) of the classification. + +Target: + +(3) a TSV file containing suspected and predicted grouping classifications for each sample and +(4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file. + +**NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings. + + ]]></help> + <expand macro="citations"/> +</tool>