Mercurial > repos > eschen42 > w4mkmeans
comparison w4mkmeans.xml @ 0:6ccbe18131a6 draft
planemo upload for repository https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper/tree/master commit 299e5c7fdb0d6eb0773f3660009f6d63c2082a8d
author | eschen42 |
---|---|
date | Tue, 08 Aug 2017 15:30:38 -0400 |
parents | |
children | 02cafb660b72 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:6ccbe18131a6 |
---|---|
1 <tool id="w4mkmeans" name="Kmeans_for_W4M" version="0.98.1"> | |
2 <description>Calculate K-means for dataMatrix features or samples</description> | |
3 | |
4 <requirements> <!-- package dependencies, each pinned to an exact version --> | |
5 <requirement version="3.3.2" type="package">r-base</requirement> | |
6 <requirement version="1.1_4" type="package">r-batch</requirement> | |
7 </requirements> | |
8 | |
9 <stdio> <!-- any exit status of 1 or greater is treated as a fatal error --> | |
10 <exit_code level="fatal" range="1:" /> | |
11 </stdio> | |
12 | |
13 <!-- Runs w4mkmeans_wrapper.R with alternating name/value arguments. Paths are quoted so directories containing spaces work. The Rscript exit status is saved, echoed, and then restored with a subshell so that the echo does not mask a failure. --> | |
14 <command detect_errors="aggressive"><![CDATA[ | |
15 Rscript '$__tool_directory__/w4mkmeans_wrapper.R' | |
16 tool_directory '$__tool_directory__' | |
17 data_matrix_path '$dataMatrix_in' | |
18 variable_metadata_path '$variableMetadata_in' | |
19 sample_metadata_path '$sampleMetadata_in' | |
20 ksamples '$ksamples' | |
21 kfeatures '$kfeatures' | |
22 iter_max '$iter_max' | |
23 nstart '$nstart' | |
24 algorithm '$algorithm' | |
25 scores_out '$scores_out' | |
26 sampleMetadata_out '$sampleMetadata_out' | |
27 variableMetadata_out '$variableMetadata_out' | |
28 slots "\${GALAXY_SLOTS:-1}" | |
29 ; rc=$? ; echo exit code \$rc ; (exit \$rc) | |
30 ]]></command> | |
31 | |
32 <inputs> <!-- three tabular input datasets followed by the k-means settings; iter_max and nstart are integers (minimum 1) so non-numeric input is rejected by the form --> | |
33 <param name="dataMatrix_in" label="Data matrix file" type="data" format="tabular" help="variable x sample, decimal: '.', missing: NA, mode: numerical, separator: tab" /> | |
34 <param name="sampleMetadata_in" label="Sample metadata file" type="data" format="tabular" help="sample x metadata columns, separator: tab" /> | |
35 <param name="variableMetadata_in" label="Variable metadata file" type="data" format="tabular" help="variable x metadata columns, separator: tab" /> | |
36 <param name="ksamples" label="K value(s) for samples" type="text" value="0" help="[ksamples] Single K or comma-separated Ks for samples, or 0 for none." /> | |
37 <param name="kfeatures" label="K value(s) for features" type="text" value="0" help="[kfeatures] Single K or comma-separated Ks for features (variables), or 0 for none." /> | |
38 <param name="iter_max" label="Max number of iterations" type="integer" value="10" min="1" help="[iter_max] The maximum number of iterations allowed; default 10." /> | |
39 <param name="nstart" label="Number of random sets" type="integer" value="1" min="1" help="[nstart] How many random sets should be chosen; default 1." /> | |
40 <param name="algorithm" label="Algorithm for clustering" type="select" help="[algorithm] K-means clustering algorithm, default 'Hartigan-Wong'; alternatives 'Lloyd', 'MacQueen'; 'Forgy' is a synonym for 'Lloyd', see references for further info."> | |
41 <option value="Forgy">Forgy</option> | |
42 <option value="Hartigan-Wong" selected="true">Hartigan-Wong</option> | |
43 <option value="Lloyd">Lloyd</option> | |
44 <option value="MacQueen">MacQueen</option> | |
45 </param> | |
46 </inputs> | |
47 | |
48 <outputs> <!-- three tabular results, each labelled with the tool name and the name of the corresponding input dataset --> | |
49 <data name="sampleMetadata_out" format="tabular" label="${tool.name}_${sampleMetadata_in.name}" /> | |
50 <data name="variableMetadata_out" format="tabular" label="${tool.name}_${variableMetadata_in.name}" /> | |
51 <data name="scores_out" format="tabular" label="${tool.name}_${dataMatrix_in.name}.kmeans" /> | |
52 </outputs> | |
53 | |
54 <tests> <!-- single test: clusters samples with K=3,4 and features with K=5,6,7, then checks the scores output for the expected text --> | |
55 <test> | |
56 <param name="dataMatrix_in" value="input_dataMatrix.tsv" /> | |
57 <param name="sampleMetadata_in" value="input_sampleMetadata.tsv" /> | |
58 <param name="variableMetadata_in" value="input_variableMetadata.tsv" /> | |
59 <param name="ksamples" value="3,4" /> | |
60 <param name="kfeatures" value="5,6,7" /> | |
61 <param name="iter_max" value="10" /> | |
62 <param name="nstart" value="1" /> | |
63 <param name="algorithm" value="Hartigan-Wong" /> | |
64 <output name="scores_out"> | |
65 <assert_contents> | |
66 <has_text text="proportion"/> | |
67 <has_text text="0.87482"/> | |
68 <has_text text="0.89248"/> | |
69 <has_text text="0.95355"/> | |
70 <has_text text="0.95673"/> | |
71 <has_text text="0.95963"/> | |
72 </assert_contents> | |
73 </output> | |
74 </test> | |
75 </tests> | |
76 | |
77 <help> | |
78 <![CDATA[ | |
79 | |
80 **Author** - Arthur Eschenlauer (University of Minnesota, esch0041@umn.edu) | |
81 | |
82 --------------------------------------------------------------------------- | |
83 | |
84 | |
85 **Source** - The source code for the w4mkmeans tool is available (from the Hegeman lab github repository) at https://github.com/HegemanLab/w4mkmeans_galaxy_wrapper | |
86 | |
87 **R code used** - The R code invoked by this wrapper is the 'kmeans' function from the R 'stats' package | |
88 | |
89 ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | |
90 | |
91 | |
92 **Tool updates** | |
93 | |
94 See the **NEWS** section at the bottom of this page | |
95 | |
96 --------------------------------------------------- | |
97 | |
98 =========================== | |
99 K-means for W4M data matrix | |
100 =========================== | |
101 | |
102 ----------- | |
103 Description | |
104 ----------- | |
105 | |
106 Calculate K-means for sample-clusters (or feature-clusters, or both) using W4M dataMatrix (i.e., XCMS-preprocessed data files) as input. | |
107 | |
108 *Please note that XCMS refers to features as 'variables'. This documentation does not use either term consistently.* | |
109 | |
110 | |
111 ----------------- | |
112 Workflow Position | |
113 ----------------- | |
114 | |
115 - Tool category: Statistical Analysis | |
116 - Upstream tool category: Preprocessing | |
117 - Downstream tool categories: Statistical Analysis | |
118 | |
119 | |
120 ---------- | |
121 Motivation | |
122 ---------- | |
123 | |
124 This tool clusters samples, features (variables), or both from the W4M dataMatrix and writes the results to new columns in sampleMetadata, variableMetadata, or both, respectively. | |
125 | |
126 - If several, comma-separated K's are supplied, then one column is added for each K. | |
127 - This clustering is **not** hierarchical; each member of a cluster is not a member of any other cluster. | |
128 - For feature-clustering, each feature is assigned to a cluster such that the feature's response for all samples is closer to the mean of all features for that cluster than to the mean for any other cluster. | |
129 - For sample-clustering, each sample is assigned to a cluster such that the sample's response for all features is closer to the mean of all samples for that cluster than to the mean for any other cluster. | |
130 | |
131 | |
132 ----------- | |
133 Input files | |
134 ----------- | |
135 | |
136 +--------------------------------------------+------------+ | |
137 | File | Format | | |
138 +============================================+============+ | |
139 | Data matrix | tabular | | |
140 +--------------------------------------------+------------+ | |
141 | Sample metadata | tabular | | |
142 +--------------------------------------------+------------+ | |
143 | Variable (i.e., feature) metadata | tabular | | |
144 +--------------------------------------------+------------+ | |
145 | |
146 | |
147 ---------- | |
148 Parameters | |
149 ---------- | |
150 | |
151 **Data matrix** - input-file dataset | |
152 | |
153 - XCMS variable x sample 'dataMatrix' (tabular separated values) file of the numeric data matrix, with . as decimal, and NA for missing values; the table must not contain metadata apart from row and column names; the row and column names must be identical to the rownames of the feature and sample metadata, respectively (see below) | |
154 | |
155 **Sample metadata** - input-file dataset | |
156 | |
157 - XCMS sample x metadata 'sampleMetadata' (tabular separated values) file of the numeric and/or character sample metadata, with . as decimal and NA for missing values | |
158 | |
159 **Feature metadata** - input-file dataset | |
160 | |
161 - XCMS variable x metadata 'variableMetadata' (tabular separated values) file of the numeric and/or character feature metadata, with . as decimal and NA for missing values | |
162 | |
163 **kfeatures** - K or K's for features (default = 0) | |
164 | |
165 - integer or comma-separated integers ; zero (the default) or less will result in no calculation. | |
166 | |
167 **ksamples** - K or K's for samples (default = 0) | |
168 | |
169 - integer or comma-separated integers ; zero (the default) or less will result in no calculation. | |
170 | |
171 **iter_max** - maximum_iterations (default = 10) | |
172 | |
173 - maximum number of iterations per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). | |
174 | |
175 **nstart** - how many random sets should be chosen (default = 1) | |
176 | |
177 - number of random sets of initial cluster centers to try per calculation (see https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html). | |
178 | |
179 ------------ | |
180 Output files | |
181 ------------ | |
182 | |
183 **XCMS sampleMetadata** - (tabular separated values) file identical to the Sample metadata file given as an input argument, excepting one column added for each K | |
184 | |
185 - **k#** - cluster number for clustering samples with K = # | |
186 | |
187 **XCMS variableMetadata** - (tabular separated values) file identical to the Feature metadata file given as an input argument, excepting one column added for each K | |
188 | |
189 - **k#** - cluster number for clustering features with K = # | |
190 | |
191 **scores** - (tabular separated values) file with one line for each K. | |
192 | |
193 - **clusterOn** - what was clustered - either 'sample' or 'feature' | |
194 - **k** - the chosen K for clustering | |
195 - **totalSS** - total (*between-cluster* plus total of *within-cluster*) sum of squares | |
196 - **betweenSS** - *between-cluster* sum of squares | |
197 - **proportion** - betweenSS / totalSS | |
198 | |
199 --------------- | |
200 Working example | |
201 --------------- | |
202 | |
203 **Input files** | |
204 | |
205 +-------------------+-------------------------------------------------------------------------------------------------------------------+ | |
206 | Input File | Download from URL | | |
207 +===================+===================================================================================================================+ | |
208 | Data matrix | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_dataMatrix.tsv | | |
209 +-------------------+-------------------------------------------------------------------------------------------------------------------+ | |
210 | Sample metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_sampleMetadata.tsv | | |
211 +-------------------+-------------------------------------------------------------------------------------------------------------------+ | |
212 | Feature metadata | https://raw.githubusercontent.com/HegemanLab/w4mkmeans_galaxy_wrapper/master/test-data/input_variableMetadata.tsv | | |
213 +-------------------+-------------------------------------------------------------------------------------------------------------------+ | |
214 | |
215 **Other input parameters** | |
216 | |
217 +-----------------+---------------+ | |
218 | Input Parameter | Value | | |
219 +=================+===============+ | |
220 | ksamples | 3,4 | | |
221 +-----------------+---------------+ | |
222 | kfeatures | 5,6,7 | | |
223 +-----------------+---------------+ | |
224 | iter_max | 10 | | |
225 +-----------------+---------------+ | |
226 | nstart | 1 | | |
227 +-----------------+---------------+ | |
228 | algorithm | Hartigan-Wong | | |
229 +-----------------+---------------+ | |
230 | |
231 ---- | |
232 NEWS | |
233 ---- | |
234 | |
235 August 2017, Version 0.98.1 - First release | |
236 | |
237 --------- | |
238 Citations | |
239 --------- | |
240 | |
241 ]]> | |
242 </help> | |
243 <citations> <!-- first entry: R stats::kmeans documentation; each later entry is labelled by the comment that precedes it --> | |
244 <citation type="bibtex"><![CDATA[ | |
245 @incollection{RCoreTeam2017, | |
246 title = {stats::kmeans - K-Means Clustering}, | |
247 booktitle = {R: A Language and Environment for Statistical Computing}, | |
248 author = {{R Core Team}}, | |
249 publisher = {R Foundation for Statistical Computing}, | |
250 address = {Vienna, Austria}, | |
251 year = {2017}, | |
252 url = {https://stat.ethz.ch/R-manual/R-devel/library/stats/html/kmeans.html}, | |
253 } | |
254 ]]></citation> | |
255 <!-- Forgy algorithm --> | |
256 <citation type="bibtex"><![CDATA[ | |
257 @article{forgy65, | |
258 added-at = {2006-03-23T12:22:43.000+0100}, | |
259 author = {Forgy, E.}, | |
260 biburl = {https://www.bibsonomy.org/bibtex/21e31409932ce91df646c4731350e1207/hotho}, | |
261 interhash = {c86383cba8cfe00d5e6ef200016aca3f}, | |
262 intrahash = {1e31409932ce91df646c4731350e1207}, | |
263 journal = {Biometrics}, | |
264 keywords = {clustering kmeans}, | |
265 number = 3, | |
266 pages = {768-769}, | |
267 timestamp = {2006-03-23T12:22:43.000+0100}, | |
268 title = {Cluster Analysis of Multivariate Data: Efficiency versus Interpretability of Classification}, | |
269 volume = 21, | |
270 year = 1965 | |
271 } | |
272 ]]></citation> | |
273 <!-- W4M 3.0 - Guitton et al. 2017--> | |
274 <citation type="doi">10.1016/j.biocel.2017.07.002</citation> | |
275 <!-- W4M 2.5 - Giacomini et al. 2014 --> | |
276 <citation type="doi">10.1093/bioinformatics/btu813</citation> | |
277 <!-- Hartigan and Wong algorithm --> | |
278 <citation type="bibtex"><![CDATA[ | |
279 @article{Hartigan79, | |
280 added-at = {2007-02-27T16:22:09.000+0100}, | |
281 author = {Hartigan, J. and Wong, M.}, | |
282 biburl = {https://www.bibsonomy.org/bibtex/23d8bfc440c5725783876929c022f67ce/pierpaolo.pk81}, | |
283 description = {WSD}, | |
284 interhash = {10d6d33920d9af578a4d0a556dc1477d}, | |
285 intrahash = {3d8bfc440c5725783876929c022f67ce}, | |
286 journal = {Applied Statistics}, | |
287 keywords = {imported}, | |
288 pages = {100-108}, | |
289 timestamp = {2007-02-27T16:22:11.000+0100}, | |
290 title = {Algorithm AS136: A k-means clustering algorithm}, | |
291 volume = 28, | |
292 year = 1979 | |
293 } | |
294 ]]></citation> | |
295 <!-- Lloyd algorithm --> | |
296 <citation type="doi">10.1109/TIT.1982.1056489</citation> | |
297 <!-- MacQueen algorithm --> | |
298 <citation type="bibtex"><![CDATA[ | |
299 @inproceedings{MacQueen1967, | |
300 added-at = {2011-01-11T13:35:01.000+0100}, | |
301 author = {MacQueen, J. B.}, | |
302 biburl = {https://www.bibsonomy.org/bibtex/25dcdb8cd9fba78e0e791af619d61d66d/enitsirhc}, | |
303 booktitle = {Proc. of the fifth Berkeley Symposium on Mathematical Statistics and Probability}, | |
304 editor = {Cam, L. M. Le and Neyman, J.}, | |
305 interhash = {8d7d4dfe7d3a06b8c9c3c2bb7aa91e28}, | |
306 intrahash = {5dcdb8cd9fba78e0e791af619d61d66d}, | |
307 keywords = {kmeans clustering}, | |
308 pages = {281-297}, | |
309 publisher = {University of California Press}, | |
310 timestamp = {2011-01-11T13:35:01.000+0100}, | |
311 title = {Some Methods for Classification and Analysis of MultiVariate Observations}, | |
312 volume = 1, | |
313 year = 1967 | |
314 } | |
315 ]]></citation> | |
316 </citations> | |
317 <!-- | |
318 vim:et:sw=2:ts=2: | |
319 --> </tool> |