Mercurial > repos > malex > secimtools
comparison svm_classifier.xml @ 1:2e7d47c0b027 draft
"planemo upload for repository https://malex@toolshed.g2.bx.psu.edu/repos/malex/secimtools"
author | malex |
---|---|
date | Mon, 08 Mar 2021 22:04:06 +0000 |
parents | |
children | caba07f41453 |
comparison
equal
deleted
inserted
replaced
0:b54326490b4d | 1:2e7d47c0b027 |
---|---|
1 <tool id="secimtools_svm_classifier" name="Support Vector Machine (SVM) Classifier" version="@WRAPPER_VERSION@"> | |
2 <description>- Predict sample groups.</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 </macros> | |
6 <expand macro="requirements" /> | |
7 <command><![CDATA[ | |
8 svm_classifier.py | |
9 --train_wide '$train_wide' | |
10 --train_design '$train_design' | |
11 --test_wide '$test_wide' | |
12 --test_design '$test_design' | |
13 --group '$group' | |
14 --ID '$uniqID' | |
15 --kernel '$kernel' | |
16 --degree '$degree' | |
17 --C '$C' | |
18 --cross_validation '$cross_validation' | |
19 --C_lower_bound '$C_lower_bound' | |
20 --C_upper_bound '$C_upper_bound' | |
21 --a '$a' | |
22 --b '$b' | |
23 --outClassification '$outClassification' | |
24 --outClassificationAccuracy '$outClassificationAccuracy' | |
25 --outPrediction '$outPrediction' | |
26 --outPredictionAccuracy '$outPredictionAccuracy' | |
27 ]]></command> <!-- Every substitution is single-quoted so an empty or whitespace-containing value still reaches svm_classifier.py as exactly one argument. --> | |
28 <inputs> <!-- Four tabular datasets, two column names, then the SVM settings. Select defaults are declared with selected="true" on the option. --> | |
29 <param name="train_wide" type="data" format="tabular" label="Training wide dataset" help="Dataset missing? See TIP below."/> | |
30 <param name="train_design" type="data" format="tabular" label="Training design file" help="Dataset missing? See TIP below."/> | |
31 <param name="test_wide" type="data" format="tabular" label="Target wide dataset" help="Dataset missing? See TIP below."/> | |
32 <param name="test_design" type="data" format="tabular" label="Target design file" help="Dataset missing? See TIP below."/> | |
33 <param name="group" type="text" value="" label="Group/Treatment" help="Name of the column in your Training and Target design files that contain group classifications."/> | |
34 <param name="uniqID" type="text" value="" label="Unique Feature ID" help="Name of the column in your Training and Target wide datasets that contain unique identifiers."/> | |
35 <param name="kernel" type="select" display="radio" label="Select a SVM Kernel Function"> | |
36 <option value="rbf" selected="true">Radial Basis function (Gaussian)</option> | |
37 <option value="linear">Linear</option> | |
38 <option value="poly">Polynomial</option> | |
39 <option value="sigmoid">Sigmoid</option> | |
40 </param> | |
41 <param name="degree" type="text" value="3" label="Polynomial Degree" help='Only used for the polynomial kernel.'/> | |
42 <param name="cross_validation" type="select" display="radio" label="Select Cross-Validation"> | |
43 <option value="none">None</option> | |
44 <option value="single">Single</option> | |
45 <option value="double" selected="true">Double</option> | |
46 </param> | |
47 <param name="C" type="text" value="1" label="Regularization Parameter C" help='See references in tool description for setting this parameter. Value must be positive (C > 0). Used only if cross-validation is not selected. Default = 1.'/> | |
48 <param name="C_lower_bound" type="text" value="0.1" label="Regularization Parameter C (Lower Bound)" help='Defines the lower bound for regularization parameter C when cross-validation is used. Must have a positive value (C > 0) Default = 0.1. '/> | |
49 <param name="C_upper_bound" type="text" value="10" label="Regularization Parameter C (Upper Bound)" help='Defines the upper bound for regularization parameter C when cross-validation is used. Must have a positive value that is larger than the lower bound. Default = 10. '/> | |
50 <param name="a" type="text" value="0.0" label="Coefficient A" help='Used in the kernel functions above. Must be greater than zero. However, the default = 0 and translates to a = 1/n_features, where n_features is the number of features. Default = 0.'/> | |
51 <param name="b" type="text" value="0.0" label="Coefficient B" help='Independent term in kernel function. It is only significant in polynomial and sigmoid kernels. Default = 0.'/> | |
52 </inputs> | |
53 <outputs> <!-- Two outputs describe the training data (classification, accuracy) and two describe the target data (prediction, accuracy). --> | |
54 <data name="outClassification" format="tabular" label="${tool.name} on ${on_string}: Classification of the Training Data Set"/> | |
55 <data name="outClassificationAccuracy" format="tabular" label="${tool.name} on ${on_string}: Classification Accuracy of the Training Data Set"/> | |
56 <data name="outPrediction" format="tabular" label="${tool.name} on ${on_string}: Prediction of the Target Data Set"/> | |
57 <data name="outPredictionAccuracy" format="tabular" label="${tool.name} on ${on_string}: Prediction Accuracy of the Target Data Set"/> | |
58 </outputs> | |
59 <tests> | |
60 <test> <!-- Single regression case: linear kernel, no cross-validation, with the same ST000006 files supplied as both training and target data. --> | |
61 <param name="train_wide" value="ST000006_data.tsv"/> | |
62 <param name="train_design" value="ST000006_design.tsv"/> | |
63 <param name="test_wide" value="ST000006_data.tsv"/> | |
64 <param name="test_design" value="ST000006_design.tsv"/> | |
65 <param name="group" value="White_wine_type_and_source" /> | |
66 <param name="uniqID" value="Retention_Index" /> | |
67 <param name="kernel" value="linear"/> | |
68 <param name="degree" value="3"/> | |
69 <param name="cross_validation" value="none"/> | |
70 <param name="C" value="1"/> | |
71 <param name="C_lower_bound" value="0.1"/> | |
72 <param name="C_upper_bound" value="2"/> | |
73 <param name="a" value="1"/> | |
74 <param name="b" value="1"/> | |
75 <output name="outClassification" file="ST000006_svm_classifier_train_classification.tsv" /> | |
76 <output name="outClassificationAccuracy" file="ST000006_svm_classifier_train_classification_accuracy.tsv" /> | |
77 <output name="outPrediction" file="ST000006_svm_classifier_target_classification.tsv" /> | |
78 <output name="outPredictionAccuracy" file="ST000006_svm_classifier_target_classification_accuracy.tsv" /> | |
79 </test> | |
80 </tests> | |
81 <help><![CDATA[ | |
82 | |
83 **TIP:** | |
84 If your data is not TAB delimited, use *Text Manipulation->Convert*. | |
85 | |
86 **WARNINGS:** | |
87 - (1) This script automatically removes spaces and special characters from strings. | |
88 - (2) If a feature name starts with a number it will prepend an '_'. | |
89 | |
90 **Tool Description** | |
91 | |
92 **NOTE: A minimum of 100 samples is required by the tool for single or double cross validation** | |
93 | |
94 Given a set of supervised samples in a Training Dataset, the SVM training algorithm builds a model based on these samples that can be used for predicting the categories of new, unclassified samples in a Target Dataset. | |
95 The Target Dataset is not used for model training or evaluation, only for prediction based on the finalized model. | |
96 SVM classification is performed on the target data and accuracy is estimated for both Target and Training Datasets. | |
97 | |
98 SVM uses different kernel functions to carry out different types of classification such as radial basis (Gaussian), linear, polynomial, and sigmoid. | |
99 The classification model can be trained with and without cross-validation (single or double). | |
100 | |
101 For single and double cross-validation: the training dataset is split differently when the model fit is performed. | |
102 | |
103 In single cross-validation: the same data are used to both fit and evaluate the model. | |
104 | |
105 In double cross-validation: the training dataset is split into pieces and the model fit is performed on one of the pieces and evaluated on the other pieces. | |
106 | |
107 Under cross-validation, the user specifies Regularization Parameter C and the Upper and Lower bounds of Regularization Parameter C. | |
108 For more information about Regularization Parameter C, see references below: | |
109 | |
110 Cortes, C. and Vapnik, V. 1995. Support-vector networks. Machine Learning. 20(3) 273-297. | |
111 | |
112 Steinwart, I and Christmann, A. 2008. Support vector machines. Springer Science and Business Media. | |
113 | |
114 | |
115 To use the SVM tool, users need the following information: | |
116 | |
117 (i) a Training Dataset with known categories in the training design file and | |
118 (ii) a Target Dataset with predicted categories in the target design file. | |
119 (iii) the name of the Group/Treatment classification column should be the same for both design files. | |
120 (iv) the Unique Feature IDs should be the same in both the wide datasets. | |
121 (v) the number of Unique Feature IDs should be the same in both the wide datasets. | |
122 | |
123 ------------------------------------------------------------------------------ | |
124 | |
125 **Input** | |
126 | |
127 - Four input datasets are required. | |
128 | |
129 @WIDE@ | |
130 | |
131 **NOTE:** The sample IDs must match the sample IDs in the Design File | |
132 (below). Extra columns will automatically be ignored. | |
133 | |
134 @METADATA@ | |
135 | |
136 @GROUP@ | |
137 | |
138 @UNIQID@ | |
139 | |
140 **SVM Kernel Function** | |
141 | |
142 - Kernel functions available for the SVM algorithm. | |
143 | |
144 **Polynomial Degree** | |
145 | |
146 - Only used for the polynomial kernel. | |
147 | |
148 **Cross-Validation Choice** | |
149 | |
150 - Cross-validation options available for the user. 'None' corresponds to no cross-validation: the user specifies regularization parameter C manually. | |
151 | |
152 | |
153 **Regularization Parameter C** | |
154 | |
155 - Penalizes potential overfitting, must be positive. | |
156 | |
157 | |
158 **Regularization Parameter C (Lower Bound)** | |
159 | |
160 - Lower bound for regularization parameter C. Value must be greater than 0. Used only if cross-validation is selected. | |
161 | |
162 | |
163 **Regularization Parameter C (Upper Bound)** | |
164 | |
165 - Upper bound for regularization parameter C. Value must be greater than the Lower Bound. | |
166 | |
167 | |
168 **Coefficient A** | |
169 | |
170 - Used in the kernel functions above. Must be greater than zero. Default = 0, however, | |
171 this translates to a = 1/n_features, where n_features is the number of features. | |
172 | |
173 **Coefficient B** | |
174 | |
175 - Independent term in the kernel function. It is only significant in | |
176 polynomial and sigmoid kernels. | |
177 | |
178 ------------------------------------------------------------------------------ | |
179 | |
180 **Output** | |
181 | |
182 This tool will output two files for the Training dataset and two for the Target dataset: | |
183 | |
184 Training: | |
185 | |
186 (1) a TSV file containing the observed and predicted grouping classifications for each sample and | |
187 (2) a TSV file containing the accuracy (percentage) of the classification. | |
188 | |
189 Target: | |
190 | |
191 (3) a TSV file containing suspected and predicted grouping classifications for each sample and | |
192 (4) a TSV file containing the accuracy (percentage) of the prediction in comparison to the suspected grouping specified in the design file. | |
193 | |
194 **NOTE:** Some knowledge about the SVM classifier algorithm and different kernel types is recommended for users who plan to use the tool frequently with different settings. | |
195 | |
196 ]]></help> | |
197 <expand macro="citations"/> | |
198 </tool> |