comparison w4mkmeans.xml @ 0:6ccbe18131a6 draft

planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author eschen42
date Tue, 08 Aug 2017 15:30:38 -0400
parents
children 02cafb660b72
comparison
equal deleted inserted replaced
-1:000000000000 0:6ccbe18131a6
1 <tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1">
2 <description>Calculate K-means for dataMatrix features or samples</description>
3 <!-- Packages the tool depends on: r-base 3.3.2 and r-batch 1.1_4. -->
4 <requirements>
5 <requirement type="package" version="3.3.2">r-base</requirement>
6 <requirement type="package" version="1.1_4">r-batch</requirement>
7 </requirements>
8 <!-- Any exit code of 1 or greater is reported as a fatal error. -->
9 <stdio>
10 <exit_code range="1:" level="fatal" />
11 </stdio>
12
13 <!-- Runs w4mkmeans_wrapper.R with name/value argument pairs. Paths are single-quoted so directories containing spaces work. The wrapper's exit status is captured, echoed, and then re-raised via a subshell so that the echo does not mask a failure. -->
14 <command detect_errors="aggressive"><![CDATA[
15 Rscript '$__tool_directory__/w4mkmeans_wrapper.R'
16 tool_directory '$__tool_directory__'
17 data_matrix_path '$dataMatrix_in'
18 variable_metadata_path '$variableMetadata_in'
19 sample_metadata_path '$sampleMetadata_in'
20 ksamples '$ksamples'
21 kfeatures '$kfeatures'
22 iter_max '$iter_max'
23 nstart '$nstart'
24 algorithm '$algorithm'
25 scores_out '$scores_out'
26 sampleMetadata_out '$sampleMetadata_out'
27 variableMetadata_out '$variableMetadata_out'
28 slots "\${GALAXY_SLOTS:-1}"
29 ; rc=\$? ; echo exit code \$rc ; (exit \$rc)
30 ]]></command>
31 <!-- Tool inputs: three tabular datasets plus the K-means settings. ksamples and kfeatures stay free text because they accept comma-separated lists; iter_max and nstart are integers (minimum 1) so that non-numeric values are rejected before the R wrapper runs. -->
32 <inputs>
33 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" />
34 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" />
35 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" />
36 <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." />
37 <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." />
38 <param name="iter_max" label="Max number of iterations" type="integer" min="1" value="10" help="[iter_max] The maximum number of iterations allowed; default 10." />
39 <param name="nstart" label="Number of random sets" type="integer" min="1" value="1" help="[nstart] How many random sets should be chosen; default 1." />
40 <param name="algorithm" label="Algorithm for clustering" type="select" value="Hartigan-Wong" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info.">
41 <option value="Forgy">Forgy</option>
42 <option value="Hartigan-Wong" selected="True">Hartigan-Wong</option>
43 <option value="Lloyd">Lloyd</option>
44 <option value="MacQueen">MacQueen</option>
45 </param>
46 </inputs>
47 <!-- Result datasets, all tabular; each label combines the tool name with the name of the corresponding input dataset. -->
48 <outputs>
49 <data name="sampleMetadata_out" format="tabular" label="${tool.name}_${sampleMetadata_in.name}" />
50 <data name="variableMetadata_out" format="tabular" label="${tool.name}_${variableMetadata_in.name}" />
51 <data name="scores_out" format="tabular" label="${tool.name}_${dataMatrix_in.name}.kmeans" />
52 </outputs>
53 <!-- Single functional test: runs with ksamples 3,4 and kfeatures 5,6,7 and checks that the scores output contains the 'proportion' text and five expected values (presumably one proportion per requested K; confirm against the wrapper). -->
54 <tests>
55 <test>
56 <param name="dataMatrix_in" value="input_dataMatrix.tsv"/>
57 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv"/>
58 <param name="variableMetadata_in" value="input_variableMetadata.tsv"/>
59 <param name="ksamples" value="3,4"/>
60 <param name="kfeatures" value="5,6,7"/>
61 <param name="iter_max" value="10"/>
62 <param name="nstart" value="1"/>
63 <param name="algorithm" value="Hartigan-Wong"/>
64 <output name="scores_out">
65 <assert_contents>
66 <has_text text="proportion" />
67 <has_text text="0.87482" />
68 <has_text text="0.89248" />
69 <has_text text="0.95355" />
70 <has_text text="0.95673" />
71 <has_text text="0.95963" />
72 </assert_contents>
73 </output>
74 </test>
75 </tests>
76
77 <help>
78 <![CDATA[
79
80 **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu)
81
82 ---------------------------------------------------------------------------
83
84
85 **Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper
86
87 **R code used** - The R code invoked by this wrapper is the 'kmeans' function from the R 'stats' package (stats::kmeans)
88
89 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------
90
91
92 **Tool updates**
93
94 See the **NEWS** section at the bottom of this page
95
96 ---------------------------------------------------
97
98 ===========================
99 K-means for W4M data matrix
100 ===========================
101
102 -----------
103 Description
104 -----------
105
106 Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input.
107
108 *Please note that XCMS refers to features as 'variables'. This documentation does not use either term consistently.*
109
110
111 -----------------
112 Workflow Position
113 -----------------
114
115 - Tool category: Statistical Analysis
116 - Upstream tool category: Preprocessing
117 - Downstream tool categories: Statistical Analysis
118
119
120 ----------
121 Motivation
122 ----------
123
124 This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively.
125
126 - If several, comma-separated K's are supplied, then one column is added for each K.
127 - This clustering is **not** hierarchical; each member of a cluster is not a member of any other cluster.
128 - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster.
129 - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster.
130
131
132 -----------
133 Input files
134 -----------
135
136 +--------------------------------------------+------------+
137 | File | Format |
138 +============================================+============+
139 | Data matrix | tabular |
140 +--------------------------------------------+------------+
141 | Sample metadata | tabular |
142 +--------------------------------------------+------------+
143 | Variable (i.e., feature) metadata | tabular |
144 +--------------------------------------------+------------+
145
146
147 ----------
148 Parameters
149 ----------
150
151 **Data matrix** - input-file dataset
152
153 - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below)
154
155 **Sample metadata** - input-file dataset
156
157 - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values
158
159 **Feature metadata** - input-file dataset
160
161 - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values
162
163 **kfeatures** - K or K's for features (default = 0)
164
165 - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
166
167 **ksamples** - K or K's for samples (default = 0)
168
169 - integer or comma-separated integers ; zero (the default) or less will result in no calculation.
170
171 **iter_max** - maximum_iterations (default = 10)
172
173 - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
174
175 **nstart** - how many random sets should be chosen (default = 1)
176
177 - number of random initial sets of cluster centers to try per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html).
178
179 ------------
180 Output files
181 ------------
182
183 **XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K
184
185 - **k#** - cluster number for clustering samples with K = #
186
187 **XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K
188
189 - **k#** - cluster number for clustering features with K = #
190
191 **scores** - (tabular separated values) file with one line for each K.
192
193 - **clusterOn** - what was clustered - either 'sample' or 'feature'
194 - **k** - the chosen K for clustering
195 - **totalSS** - total (*between-treatments* plus total of *within-treatments*) sum of squares
196 - **betweenSS** - *between-treatments* sum of squares
197 - **proportion** - betweenSS / totalSS
198
199 ---------------
200 Working example
201 ---------------
202
203 **Input files**
204
205 +-------------------+-------------------------------------------------------------------------------------------------------------------+
206 | Input File | Download from URL |
207 +===================+===================================================================================================================+
208 | Data matrix | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv |
209 +-------------------+-------------------------------------------------------------------------------------------------------------------+
210 | Sample metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv |
211 +-------------------+-------------------------------------------------------------------------------------------------------------------+
212 | Feature metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv |
213 +-------------------+-------------------------------------------------------------------------------------------------------------------+
214
215 **Other input parameters**
216
217 +-----------------+---------------+
218 | Input Parameter | Value |
219 +=================+===============+
220 | ksamples | 3,4 |
221 +-----------------+---------------+
222 | kfeatures | 5,6,7 |
223 +-----------------+---------------+
224 | iter_max | 10 |
225 +-----------------+---------------+
226 | nstart | 1 |
227 +-----------------+---------------+
228 | algorithm | Hartigan-Wong |
229 +-----------------+---------------+
230
231 ----
232 NEWS
233 ----
234
235 August 2017, Version 0.98.1 - First release
236
237 ---------
238 Citations
239 ---------
240
241 ]]>
242 </help>
243 <citations>
244 <!-- R stats::kmeans documentation --> <citation type="bibtex"><![CDATA[
245 @incollection{RCoreTeam2017,
246 title = {stats::kmeans - K-Means Clustering},
247 booktitle = {R: A Language and Environment for Statistical Computing},
248 author = {{R Core Team}},
249 publisher = {R Foundation for Statistical Computing},
250 address = {Vienna, Austria},
251 year = {2017},
252 url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html},
253 }
254 ]]></citation>
255 <!-- Forgy algorithm -->
256 <citation type="bibtex"><![CDATA[
257 @article{forgy65,
258 added-at = {2006-03-23T12:22:43.000+0100},
259 author = {Forgy, E.},
260 biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho},
261 interhash = {c86383cba8cfe00d5e6ef200016aca3f},
262 intrahash = {1e31409932ce91df646c4731350e1207},
263 journal = {Biometrics},
264 keywords = {clustering kmeans},
265 number = 3,
266 pages = {768-769},
267 timestamp = {2006-03-23T12:22:43.000+0100},
268 title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification},
269 volume = 21,
270 year = 1965
271 }
272 ]]></citation>
273 <!-- W4M 3.0 - Guitton et al. 2017-->
274 <citation type="doi">10.1016/j.biocel.2017.07.002</citation>
275 <!-- W4M 2.5 - Giacomini et al. 2014 -->
276 <citation type="doi">10.1093/bioinformatics/btu813</citation>
277 <!-- Hartigan and Wong algorithm -->
278 <citation type="bibtex"><![CDATA[
279 @article{Hartigan79,
280 added-at = {2007-02-27T16:22:09.000+0100},
281 author = {Hartigan, J. and Wong, M.},
282 biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81},
283 description = {WSD},
284 interhash = {10d6d33920d9af578a4d0a556dc1477d},
285 intrahash = {3d8bfc440c5725783876929c022f67ce},
286 journal = {Applied Statistics},
287 keywords = {imported},
288 pages = {100-108},
289 timestamp = {2007-02-27T16:22:11.000+0100},
290 title = {Algorithm AS136: A k-means clustering algorithm},
291 volume = 28,
292 year = 1979
293 }
294 ]]></citation>
295 <!-- Lloyd algorithm -->
296 <citation type="doi">10.1109/TIT.1982.1056489</citation>
297 <!-- MacQueen algorithm -->
298 <citation type="bibtex"><![CDATA[
299 @inproceedings{MacQueen1967,
300 added-at = {2011-01-11T13:35:01.000+0100},
301 author = {MacQueen, J. B.},
302 biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc},
303 booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability},
304 editor = {Cam, L. M. Le and Neyman, J.},
305 interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28},
306 intrahash = {5dcdb8cd9fba78e0e791af619d61d66d},
307 keywords = {kmeans clustering},
308 pages = {281-297},
309 publisher = {University of California Press},
310 timestamp = {2011-01-11T13:35:01.000+0100},
311 title = {Some Methods for Classification and Analysis of MultiVariate Observations},
312 volume = 1,
313 year = 1967
314 }
315 ]]></citation>
316 </citations>
317 <!--
318 vim:et:sw=2:ts=2:
319 --> </tool>