comparison w4mkmeans.xml @ 2:c415b7dc6f37 draft default tip

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 3e916537da6bb37e6f3927d7a11e98e0ab6ef5ec
author eschen42
date Mon, 05 Mar 2018 12:40:17 -0500
parents 02cafb660b72
children
comparison
equal deleted inserted replaced
1:02cafb660b72 2:c415b7dc6f37
1 <tool id="w4mkmeans" name="w4mKmeans" version="0.98.3"> 1 <tool id="w4mkmeans" name="Kmeans for W4m" version="0.98.4">
2 <description>Calculate K-means for W4M dataMatrix features or samples</description> 2 <description>Calculate K-means for W4m dataMatrix features or samples</description>
3 3
4 <requirements> 4 <requirements>
5 <requirement type="package" version="3.3.2">r-base</requirement> 5 <requirement type="package" version="3.4.1">r-base</requirement>
6 <requirement type="package" version="1.1_4">r-batch</requirement> 6 <requirement type="package" version="1.1_4">r-batch</requirement>
7 <requirement type="package" version="1.8.0">libssh2</requirement>
8 <requirement type="package" version="1.13.2">krb5</requirement>
7 </requirements> 9 </requirements>
8 10
9 <stdio> 11 <stdio>
10 <exit_code range="1:" level="fatal" /> 12 <exit_code range="1:" level="fatal" />
11 </stdio> 13 </stdio>
25 sample_metadata_path '$sampleMetadata_in' 27 sample_metadata_path '$sampleMetadata_in'
26 scores_out '$scores_out' 28 scores_out '$scores_out'
27 slots "\${GALAXY_SLOTS:-1}" 29 slots "\${GALAXY_SLOTS:-1}"
28 variableMetadata_out '$variableMetadata_out' 30 variableMetadata_out '$variableMetadata_out'
29 variable_metadata_path '$variableMetadata_in' 31 variable_metadata_path '$variableMetadata_in'
30 ; echo exit code $?
31 ]]></command> 32 ]]></command>
32 33
33 <inputs> 34 <inputs>
34 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> 35 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="Feature (variable) x sample; decimal point: '.'; missing: NA; mode: numerical; separator: tab" />
35 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> 36 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="Sample x metadata columns; separator: tab" />
36 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> 37 <param name="variableMetadata_in" label="Variable (feature) metadata file" type="data" format="tabular" help="Feature (variable) x metadata columns; separator: tab" />
37 <param name="categoricalPrefix" label="prefix for cluster names " type="text" value="k" help="[categorical_prefix] Some tools require non-numeric values to discern categorical data; e.g., enter 'k' here to prepend 'k' to cluster numbers in the output; default 'k'." /> 38 <param name="categoricalPrefix" label="Prefix for cluster names " type="text" value="c" help="String prepended to cluster numbers in output; default 'c'; leave blank for no prefix." />
38 <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." /> 39 <param name="ksamples" label="K value(s) for samples" type="text" value = "0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
39 <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." /> 40 <param name="kfeatures" label="K value(s) for features" type="text" value = "0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />
40 <param name="iter_max" label="Max number of iterations" type="text" value = "10" help="[iter_max] The maximum number of iterations allowed; default 10." /> 41 <param name="iter_max" label="Maximum number of iterations" type="text" value = "20" help="[iter_max] The maximum number of iterations allowed; default 20." />
41 <param name="nstart" label="Number of random sets" type="text" value = "1" help="[nstart] How many random sets should be chosen; default 1." /> 42 <param name="nstart" label="Number of random sets of clusters" type="text" value = "20" help="[nstart] How many random sets of clusters should be chosen initially; default 20." />
42 <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see stats::kmeans reference for further info."> 43 <param name="algorithm" label="Algorithm for clustering" type="select" value = "Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; &lt;br /&gt;alternatives 'Lloyd', 'MacQueen'; 'Forgy' (synonym for 'Lloyd'); see references.">
43 <option value="Forgy">Forgy</option> 44 <option value="Forgy">Forgy</option>
44 <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option> 45 <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
45 <option value="Lloyd">Lloyd</option> 46 <option value="Lloyd">Lloyd</option>
46 <option value="MacQueen">MacQueen</option> 47 <option value="MacQueen">MacQueen</option>
47 </param> 48 </param>
48 </inputs> 49 </inputs>
49 50
50 <outputs> 51 <outputs>
51 <data name="sampleMetadata_out" label="${tool.name}_${sampleMetadata_in.name}" format="tabular" ></data> 52 <data name="sampleMetadata_out" label="${sampleMetadata_in.name}.kmeans-smpl" format="tabular" ></data>
52 <data name="variableMetadata_out" label="${tool.name}_${variableMetadata_in.name}" format="tabular" ></data> 53 <data name="variableMetadata_out" label="${variableMetadata_in.name}.kmeans-vrbl" format="tabular" ></data>
53 <data name="scores_out" label="${tool.name}_${dataMatrix_in.name}.kmeans" format="tabular" ></data> 54 <data name="scores_out" label="${dataMatrix_in.name}.kmeans-score" format="tabular" ></data>
54 </outputs> 55 </outputs>
55 56
56 <tests> 57 <tests>
57 <test> 58 <test>
58 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/> 59 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
59 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/> 60 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
60 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/> 61 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
61 <param name="ksamples" value="3,4"/> 62 <param name="ksamples" value="3,4"/>
62 <param name="kfeatures" value="5,6,7"/> 63 <param name="kfeatures" value="5,6,7"/>
63 <param name="iter_max" value="10"/> 64 <param name="iter_max" value="20"/>
64 <param name="nstart" value="1"/> 65 <param name="nstart" value="20"/>
65 <param name="algorithm" value="Hartigan-Wong"/> 66 <param name="algorithm" value="Hartigan-Wong"/>
66 <output name="scores_out"> 67 <output name="scores_out">
67 <assert_contents> 68 <assert_contents>
68 <has_text text="proportion" /> 69 <has_text text="proportion" />
69 <has_text text="0.87482" /> 70 <has_text text="0.87482" />
70 <has_text text="0.89248" /> 71 <has_text text="0.91765" />
71 <has_text text="0.95355" /> 72 <has_text text="0.95362" />
72 <has_text text="0.95673" /> 73 <has_text text="0.95719" />
73 <has_text text="0.95963" /> 74 <has_text text="0.97966" />
74 </assert_contents> 75 </assert_contents>
75 </output> 76 </output>
76 </test> 77 </test>
77 </tests> 78 </tests>
78 79
79 <help> 80 <help>
80 <![CDATA[ 81 <![CDATA[
81 82
83 ===========================
84 K-means for W4m data matrix
85 ===========================
86
82 **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) 87 **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)
83 88
84 --------------------------------------------------------------------------- 89 **Source Code** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
85 90
86 91 **R code used** - The R code invoked by this wrapper is the R 'stats::kmeans' function, documented at https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html
87 **Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper 92
88 93 **Tool updates** - See the **NEWS** section at the bottom of this page
89 **R code used** - The R code invoked by this wrapper is the R 'stats::kmeans' package
90
91 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
92
93
94 **Tool updates**
95
96 See the **NEWS** section at the bottom of this page
97
98 ---------------------------------------------------
99
100 ===========================
101 K-means for W4M data matrix
102 ===========================
103 94
104 ----------- 95 -----------
105 Description 96 Description
106 ----------- 97 -----------
107 98
108 Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input. 99 This tool calculates K-means clusters for samples, features, or both using W4m dataMatrix (i.e., XCMS-preprocessed data files) as input and writes the results to new columns in sampleMetadata, variableMetadata, or both.
109 100
110 *Please note that XCMS refers to features as 'variables'. This documentation does not use either term consistently.* 101 - If several, comma-separated K's are supplied, then one column is added for each K.
111 102 - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
103 - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
104 - Clustering is mutually exclusive, **not** hierarchical.
105
106 - Hierarchical clustering is available through the W4m Heat Map tool, https://github.com/workflow4metabolomics/heatmap
112 107
113 ----------------- 108 -----------------
114 Workflow Position 109 Workflow Position
115 ----------------- 110 -----------------
116 111
117 - Tool category: Statistical Analysis 112 - Tool category: Statistical Analysis
118 - Upstream tool category: Preprocessing 113 - Upstream tool category: Preprocessing
119 - Downstream tool categories: Statistical Analysis 114 - Downstream tool categories: Statistical Analysis
120
121
122 ----------
123 Motivation
124 ----------
125
126 This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively.
127
128 - If several, comma-separated K's are supplied, then one column is added for each K.
129 - This clustering is **not** hierarchical; each member of a cluster is not a member of any other cluster.
130 - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
131 - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
132
133 115
134 ----------- 116 -----------
135 Input files 117 Input files
136 ----------- 118 -----------
137 119
150 Parameters 132 Parameters
151 ---------- 133 ----------
152 134
153 **Data matrix** - input-file dataset 135 **Data matrix** - input-file dataset
154 136
155 - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the sample and feature metadata, respectively (see below) 137 - W4m variable (i.e. feature) x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the sample and feature metadata, respectively (see below)
156 138
157 **Sample metadata** - input-file dataset 139 **Sample metadata** - input-file dataset
158 140
159 - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values 141 - W4m sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
160 142
161 **Feature metadata** - input-file dataset 143 **Feature metadata** - input-file dataset
162 144
163 - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values 145 - W4m variable (i.e. feature) x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
164 146
165 **kfeatures** - K or K's for features (default = 0) 147 **Prefix for cluster names** - character(s) to add as prefix to category number (default = 'c')
166 148
167 - integer or comma-separated integers ; zero (the default) or less will result in no calculation. 149 - Some tools treat only non-numeric data as categorical; this prefix ensures that cluster data will be treated as categorical; an empty string is permitted here if desired (provided that succeeding tools requiring categorical data accept integers).
168 150
169 **ksamples** - K or K-range for samples (default = 0) 151 **K-values for samples** - K or K-range for samples (default = 0)
170 152
171 - integer or comma-separated integers ; zero (the default) or less will result in no calculation. 153 - Integer or comma-separated positive integers ; zero (or less) will result in no calculation.
172 154
173 **iter_max** - maximum_iterations (default = 10) 155 **K-values for features** - K or K's for features (default = 0)
174 156
175 - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). 157 - Integer or comma-separated positive integers ; zero (or less) will result in no calculation.
176 158
177 **nstart** - how many random sets should be chosen (default = 1) 159 **Maximum number of iterations** - (default = 20)
178 160
179 - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). 161 - Maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
180 162
181 **categorical_prefix** - character(s) to add as prefix to category number (default = 'k') 163 **Number of random sets** - how many random sets should be chosen to start (default = 20)
182 164
183 - some tools treat only non-numeric data as categorical; this prefix ('k' by default) ensures that clusters data will be treated as categorical; an empty string is permitted here if desired (and succeeding tools accept integers as categorical data). 165 - Number of random sets of clusters to be chosen to start calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html for further info).
166
167 **Algorithm** - Algorithm for clustering (default = Hartigan-Wong)
168
169 - K-means clustering algorithm: 'Hartigan-Wong', 'Lloyd', or 'MacQueen'; 'Forgy' is a synonym for 'Lloyd' (see references for further info).
184 170
185 ------------ 171 ------------
186 Output files 172 Output files
187 ------------ 173 ------------
188 174
189 **XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K 175 **Sample Metadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
190 176
191 - **k#** - cluster number for clustering samples with K = # 177 - **k#** - cluster number for clustering samples with K = #
192 178
193 **XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K 179 **Variable Metadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
194 180
195 - **k#** - cluster number for clustering features with K = # 181 - **k#** - cluster number for clustering features with K = #
196 182
197 **scores** - (tabular separated values) file with one line for each K. 183 **scores** - (tabular separated values) file with one line for each K.
198 184
199 - **clusterOn** - what was clustered - either 'sample' or 'feature' 185 - **clusterOn** - what was clustered - either 'sample' or 'feature'
200 - **k** - the chosen K for clustering 186 - **k** - the chosen K for clustering
201 - **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares 187 - **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
202 - **betweenSS** - *between-treatments* sum of squares 188 - **betweenSS** - *between-treatments* sum of squares
203 - **proportion** - betweenSS / totalSS 189 - **proportion** - betweenSS / totalSS
204 190
205 --------------- 191 ---------------
206 Working example 192 Working example
207 --------------- 193 ---------------
208 194
195 .. class:: infomark
196
209 **Input files** 197 **Input files**
210 198
211 +-------------------+-------------------------------------------------------------------------------------------------------------------+ 199 +-------------------------------------------------------------------------------------------------------------------+
212 | Input File | Download from URL | 200 | URL |
213 +===================+===================================================================================================================+ 201 +===================================================================================================================+
214 | Data matrix | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv | 202 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv |
215 +-------------------+-------------------------------------------------------------------------------------------------------------------+ 203 +-------------------------------------------------------------------------------------------------------------------+
216 | Sample metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv | 204 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv |
217 +-------------------+-------------------------------------------------------------------------------------------------------------------+ 205 +-------------------------------------------------------------------------------------------------------------------+
218 | Feature metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv | 206 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
219 +-------------------+-------------------------------------------------------------------------------------------------------------------+ 207 +-------------------------------------------------------------------------------------------------------------------+
208
209 .. class:: infomark
220 210
221 **Other input parameters** 211 **Other input parameters**
222 212
223 +-----------------+---------------+ 213 +-----------------+---------------+
224 | Input Parameter | Value | 214 | Input Parameter | Value |
225 +=================+===============+ 215 +=================+===============+
216 | prefix | c |
217 +-----------------+---------------+
226 | ksamples | 3,4 | 218 | ksamples | 3,4 |
227 +-----------------+---------------+ 219 +-----------------+---------------+
228 | kfeatures | 5,6,7 | 220 | kfeatures | 5,6,7 |
229 +-----------------+---------------+ 221 +-----------------+---------------+
230 | iter_max | 10 | 222 | iter_max | 20 |
231 +-----------------+---------------+ 223 +-----------------+---------------+
232 | nstart | 1 | 224 | nstart | 20 |
233 +-----------------+---------------+ 225 +-----------------+---------------+
234 | algorithm | Hartigan-Wong | 226 | algorithm | Hartigan-Wong |
235 +-----------------+---------------+ 227 +-----------------+---------------+
228
229 .. class:: infomark
230
231 **Expected output files**
232
233 +-------------------------------------------------------------------------------------------------------------------+
234 | URL |
235 +===================================================================================================================+
236 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-score.tsv |
237 +-------------------------------------------------------------------------------------------------------------------+
238 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-vrbl.tsv |
239 +-------------------------------------------------------------------------------------------------------------------+
240 | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/output_kmeans-smpl.tsv |
241 +-------------------------------------------------------------------------------------------------------------------+
236 242
237 ---- 243 ----
238 NEWS 244 NEWS
239 ---- 245 ----
240 246
247 - February 2018, Version 0.98.4 - Renamed output datasets to append '``.kmeans-smpl``', '``.kmeans-vrbl``', or '``.kmeans-score``'; refactored multi-threading.
241 - August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical. 248 - August 2017, Version 0.98.3 - Add (optional) prefix to category numbers for downstream tools that treat only non-numeric data as categorical.
242 - August 2017, Version 0.98.1 - First release 249 - August 2017, Version 0.98.1 - First release
243 250
244 --------- 251 ---------
245 Citations 252 Citations
275 title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification}, 282 title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
276 volume = 21, 283 volume = 21,
277 year = 1965 284 year = 1965
278 } 285 }
279 ]]></citation> 286 ]]></citation>
280 <!-- W4M 3.0 - Guitton et al. 2017--> 287 <!-- W4m 3.0 - Guitton et al. 2017-->
281 <citation type="doi">10.1016/j.biocel.2017.07.002</citation> 288 <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
282 <!-- W4M 2.5 - Giacomini et al. 2014 --> 289 <!-- W4m 2.5 - Giacomini et al. 2014 -->
283 <citation type="doi">10.1093/bioinformatics/btu813</citation> 290 <citation type="doi">10.1093/bioinformatics/btu813</citation>
284 <!-- Hartigan and Wong algorithm --> 291 <!-- Hartigan and Wong algorithm -->
285 <citation type="bibtex"><![CDATA[ 292 <citation type="doi">10.2307/2346830</citation>
286 @article{Hartigan79,
287 added-at = {2007-02-27T16:22:09.000+0100},
288 author = {Hartigan, J. and Wong, M.},
289 biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81},
290 description = {WSD},
291 interhash = {10d6d33920d9af578a4d0a556dc1477d},
292 intrahash = {3d8bfc440c5725783876929c022f67ce},
293 journal = {Applied Statistics},
294 keywords = {imported},
295 pages = {100-108},
296 timestamp = {2007-02-27T16:22:11.000+0100},
297 title = {Algorithm AS136: A k-means clustering algorithm},
298 volume = 28,
299 year = 1979
300 }
301 ]]></citation>
302 <!-- Lloyd algorithm --> 293 <!-- Lloyd algorithm -->
303 <citation type="doi">10.1109/TIT.1982.1056489</citation> 294 <citation type="doi">10.1109/TIT.1982.1056489</citation>
304 <!-- MacQueen algorithm --> 295 <!-- MacQueen algorithm -->
305 <citation type="bibtex"><![CDATA[ 296 <citation type="bibtex"><![CDATA[
306 @inproceedings{MacQueen1967, 297 @inproceedings{MacQueen1967,