comparison numeric_clustering.xml @ 0:a3fd214e7555 draft default tip

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/numeric_clustering commit bafd56379ff227fb81f8cd61d708ebc39814da54
author bgruening
date Fri, 01 Jan 2016 18:37:54 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a3fd214e7555
1 <tool id="numeric_clustering" name="Numeric Clustering" version="@VERSION@">
2 <description></description>
3 <requirements>
4 <requirement type="package" version="2.3.0">anaconda</requirement>
5 </requirements>
6 <stdio>
7 <exit_code level="fatal" range="1:"/>
8 </stdio>
9 <macros> <!-- Shared tokens and parameter snippets reused by the per-algorithm sections below -->
10 <token name="@VERSION@">0.9</token> <!-- substituted into the tool's version attribute and into version_command -->
11 <macro name="n_clusters" token_default_value="8"> <!-- DEFAULT_VALUE token falls back to 8 when the expand does not override it -->
12 <param name="n_clusters" type="integer" optional="true" value="@DEFAULT_VALUE@" label="Number of clusters"
13 help="default value is @DEFAULT_VALUE@ (--n_clusters)"/>
14 </macro>
15 <macro name="n_init"> <!-- empty value: the command template skips blank options, so nothing is passed unless the user fills it in -->
16 <param name="n_init" type="integer" optional="true" value="" label="Number of runs with different centroid seeds"/>
17 </macro>
18 <macro name="max_iter"> <!-- empty by default, same as n_init -->
19 <param name="max_iter" type="integer" optional="true" value="" label="Maximum number of iterations per single run"/>
20 </macro>
21 <macro name="random_state"> <!-- integer seed; the tests set it to 100 -->
22 <param name="random_state" type="integer" optional="true" value="" label="Initialize centers"/>
23 </macro>
24 <macro name="affinity"> <!-- free-text option, forwarded as a string when not blank -->
25 <param name="affinity" type="text" optional="true" value="" label="Affinity"/>
26 </macro>
27 <macro name="tol"> <!-- empty by default, same as n_init -->
28 <param name="tol" type="float" optional="true" value="" label="Relative tolerance"/>
29 </macro>
30 <macro name="init"> <!-- centroid initialization choice shared by KMeans and MiniBatchKMeans -->
31 <param name="init" type="select" label="Select initialization method">
32 <option value="k-means++">k-means++</option>
33 <option value="random">random</option>
34 </param>
35 </macro>
36 </macros>
37 <version_command>echo "@VERSION@"</version_command>
38 <command><![CDATA[
39 cat "$cluster_script" >&2
40 &&
41 #import json
42 #set $params = dict()
43 #for $key, $value in $algorithm_options.items():
44 #if not $key.startswith('__') and $key.strip() != 'selected_algorithm' and str($value).strip():
45 #if str($value).strip() == 'false':
46 #set $value = False
47 #elif str($value).strip() == 'true':
48 #set $value = True
49 #else:
50 #try:
51 #set $val = float($value)
52 #try:
53 #set $value = int($value)
54 #except:
55 #set $value = float($value)
56 #end try
57 #except:
58 #set $value = str($value)
59 #end try
60 #end if
61 $params.update({str($key): $value})
62 #end if
63 #end for
64 #set $json_string = json.dumps( $params )
65 ## The loop above collects every non-blank option of the selected algorithm into a dict ('true'/'false' become booleans, numeric strings become int or float, anything else stays a string); the dict is passed to the script as one JSON argument.
66 python "$cluster_script" '$json_string'
67
68 ]]>
69 </command>
70 <configfiles>
71 <configfile name="cluster_script">
72 <![CDATA[
73 import sys
74 import json
75 import numpy as np
76 import sklearn.cluster
77 import pandas
78 ## Read the tab-separated input (first row is used as the header) and instantiate the chosen sklearn.cluster class with its defaults.
79 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False )
80 my_class = getattr(sklearn.cluster, "$algorithm_options.selected_algorithm")
81 cluster_object = my_class()
82 ## The JSON object built by the command template arrives as argv[1] and overrides the estimator defaults.
83 params = json.loads( sys.argv[1] )
84 cluster_object.set_params(**params)
85 #if $end_column and $start_column:
86 ## Columns are 1-based and inclusive; a reversed range (end before start) falls back to using every column.
87 if $end_column >= $start_column:
88 data_matrix = data.values[:, $start_column-1:$end_column]
89 else:
90 data_matrix = data.values
91
92 #else:
93 data_matrix = data.values
94 #end if
95 prediction = cluster_object.fit_predict( data_matrix )
96 prediction_df = pandas.DataFrame(prediction)
97 res = pandas.concat([data, prediction_df], axis=1)
98 res.to_csv(path_or_buf = "$outfile", sep="\t", index=False)
99 ]]>
100 </configfile>
101 </configfiles>
102 <inputs>
103 <param name="infile" type="data" format="tabular" label="Data file with numeric values" />
104 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Clustering column from" />
105 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="to" />
106 <conditional name="algorithm_options">
107 <param name="selected_algorithm" type="select" label="Clustering Algorithm">
108 <option value="KMeans">KMeans</option>
109 <option value="DBSCAN">DBSCAN</option>
110 <option value="Birch">Birch</option>
111 <option value="MeanShift">MeanShift</option>
112 <option value="AffinityPropagation">Affinity Propagation</option>
113 <option value="AgglomerativeClustering">Agglomerative Clustering</option>
114 <option value="SpectralClustering">Spectral Clustering</option>
115 <option value="MiniBatchKMeans">Mini Batch KMeans</option>
116 </param>
117 <when value="KMeans"> <!-- options forwarded to sklearn.cluster.KMeans; blank fields are not passed -->
118 <expand macro="n_clusters" default_value="8"/> <!-- the macro token is DEFAULT_VALUE, so the override attribute must be default_value -->
119 <expand macro="init"/>
120 <expand macro="n_init"/>
121 <expand macro="max_iter"/>
122 <expand macro="tol"/>
123 <param name="precompute_distances" type="text" optional="true" value="" label="Precompute distances"/>
124 <expand macro="random_state"/>
125 <param name="copy_x" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Do not modify original data"/> <!-- true/false are turned into real booleans by the command template; the old CLI-style flag string would have been passed through as text -->
126 </when>
127 <when value="DBSCAN"> <!-- options forwarded to sklearn.cluster.DBSCAN -->
128 <param name="eps" type="float" optional="true" value="0.5" label="Maximum neighborhood distance"/>
129 <param name="min_samples" type="integer" optional="true" value="5" label="Core point minimum population"/>
130 <param name="metric" type="text" optional="true" value="euclidean" label="Metric"/>
131 <param name="algorithm" type="select" optional="true" value="auto" label="Pointwise distance algorithm">
132 <option value="auto">auto</option>
133 <option value="ball_tree">ball_tree</option>
134 <option value="kd_tree">kd_tree</option>
135 <option value="brute">brute</option>
136 </param>
137 <param name="leaf_size" type="integer" optional="true" value="30" label="Leaf size"/>
138 </when>
139 <when value="Birch"> <!-- options forwarded to sklearn.cluster.Birch -->
140 <param name="threshold" type="float" optional="true" value="0.5" label="Subcluster radius threshold"/>
141 <param name="branching_factor" type="integer" optional="true" value="50" label="Maximum number of subclusters per branch"/>
142 <expand macro="n_clusters" default_value="3" /> <!-- default to 3; the macro token is DEFAULT_VALUE, so default_value is the attribute that overrides it. NOTE(review): a Birch test that leaves n_clusters unset now runs with 3 clusters, so re-check its expected output -->
143 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Compute labels for each fit"/-->
144 </when>
145 <when value="AffinityPropagation"> <!-- options forwarded to sklearn.cluster.AffinityPropagation -->
146 <param name="damping" type="float" optional="true" value="0.5" label="Damping factor"/>
147 <expand macro="max_iter"/> <!--default to 200 -->
148 <param name="convergence_iter" type="integer" optional="true" value="15" label="Number of iterations at each convergence step"/>
149 <param name="copy" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Make a copy of input data"/> <!-- true/false are converted to booleans by the command template -->
150 <!--param name="preference" type="text" optional="true" value="None" label="Array like shape (n_samples,)"/-->
151 <expand macro="affinity"/> <!--default = euclidean-->
152 </when>
153 <when value="MeanShift"> <!-- options forwarded to sklearn.cluster.MeanShift -->
154 <param name="bandwidth" type="float" optional="true" value="" label="RBF kernel bandwidth"/>
155 <!--param name="seeds" type="list" optional="true" value="None" label=""/-->
156 <param name="bin_seeding" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Discretize initial kernel locations"/> <!-- true/false are converted to booleans by the command template -->
157 <param name="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin"/>
158 <param name="cluster_all" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Cluster all"/>
159 </when>
160 <when value="AgglomerativeClustering"> <!-- options forwarded to sklearn.cluster.AgglomerativeClustering -->
161 <expand macro="n_clusters" default_value="2" /> <!-- default 2; the macro token is DEFAULT_VALUE, so default_value is the attribute that overrides it -->
162 <expand macro="affinity"/> <!--default = euclidean-->
163 <!--param name="memory" type="callable" optional="true" value="Memory(cachedir=None)" label="Caching path"/-->
164 <!--param name="connectivity" type="list array-like or callable" optional="true" value="None" label="Connectivity matrix"/-->
165 <param name="n_components" type="integer" optional="true" value="" label="Number of connected components"/>
166 <!--param name="compute_full_tree" type="text or boolean" optional="true" value="auto" label=""/-->
167 <param name="linkage" type="select" optional="true" value="ward" label="Linkage">
168 <option value="ward">ward</option>
169 <option value="complete">complete</option>
170 <option value="average">average</option>
171 </param>
172 <!--param name="pooling_func" type="callable" optional="np.mean" value="None" label=""/-->
173 </when>
174 <when value="SpectralClustering"> <!-- options forwarded to sklearn.cluster.SpectralClustering -->
175 <expand macro="n_clusters" default_value="8" /> <!-- the macro token is DEFAULT_VALUE, so the override attribute must be default_value -->
176 <param name="eigen_solver" type="select" value="arpack" label="Eigenvalue decomposition strategy">
177 <option value="arpack">arpack</option>
178 <option value="lobpcg">lobpcg</option>
179 <option value="amg">amg</option>
180 </param>
181 <expand macro="random_state"/>
182 <!-- Todo: extend random_state type to int seed, RandomState instance, or None. -->
183 <expand macro="n_init"/> <!-- default to 10-->
184 <param name="gamma" type="float" optional="true" value="1.0" label="Kernel scaling factor"/>
185 <expand macro="affinity"/> <!--default =rbf-->
186 <param name="n_neighbors" type="integer" optional="true" value="10" label="Number of neighbors"/>
187 <!--param name="eigen_tol" type="float" optional="true" value="0.0" label="arpack eigendecomposition stopping threshold"/-->
188 <param name="assign_labels" type="select" optional="true" value="kmeans" label="Assign labels">
189 <option value="kmeans">kmeans</option>
190 <option value="discretize">discretize</option>
191 </param>
192 <param name="degree" type="integer" optional="true" value="3" label="Degree of the polynomial (polynomial kernel only)"/>
193 <param name="coef0" type="integer" optional="true" value="1" label="Zero coefficient (polynomial and sigmoid kernels only)"/>
194 <!--param name="kernel_params" type="dict" optional="true" value="None" label=""/-->
195 </when>
196 <when value="MiniBatchKMeans"> <!-- options forwarded to sklearn.cluster.MiniBatchKMeans -->
197 <expand macro="n_clusters" default_value="8"/> <!-- the macro token is DEFAULT_VALUE, so the override attribute must be default_value -->
198 <expand macro="init"/>
199 <expand macro="n_init"/> <!-- default to 3-->
200 <expand macro="max_iter"/> <!--default to 100-->
201 <expand macro="tol"/> <!--default = 0.0-->
202 <expand macro="random_state"/>
203 <param name="batch_size" type="integer" optional="true" value="100" label="Mini batch size"/>
204 <!--param name="compute_labels" type="boolean" optional="true" truevalue="true" falsevalue="false" label="Compute labels for all data"/-->
205 <param name="max_no_improvement" type="integer" optional="true" value="10" label="Maximum number of improvement attempts"/>
206 <param name="init_size" type="integer" optional="true" value="" label="Number of random init samples"/>
207 <param name="reassignment_ratio" type="float" optional="true" value="0.01" label="Re-assignment ratio"/>
208 </when>
209 </conditional>
210 </inputs>
211 <outputs>
212 <data format_source="infile" name="outfile"/>
213 </outputs>
214 <tests>
215 <test>
216 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
217 <param name="selected_algorithm" value="KMeans"/>
218 <param name="start_column" value="2" />
219 <param name="end_column" value="4" />
220 <param name="n_clusters" value="4" />
221 <param name="init" value="k-means++" />
222 <param name="random_state" value="100"/>
223 <output name="outfile" file="cluster_result01.txt"/>
224 </test>
225 <test>
226 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
227 <param name="selected_algorithm" value="KMeans"/>
228 <param name="start_column" value="2" />
229 <param name="end_column" value="4" />
230 <param name="n_clusters" value="4" />
231 <param name="init" value="random" />
232 <param name="random_state" value="100"/>
233 <output name="outfile" file="cluster_result02.txt"/>
234 </test>
235 <test>
236 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
237 <param name="selected_algorithm" value="DBSCAN"/>
238 <param name="start_column" value="2" />
239 <param name="end_column" value="4" />
240 <param name="algorithm" value="kd_tree"/>
241 <param name="leaf_size" value="10"/>
242 <param name="eps" value="1.0"/>
243 <output name="outfile" file="cluster_result03.txt"/>
244 </test>
245 <test>
246 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
247 <param name="selected_algorithm" value="Birch"/>
248 <param name="start_column" value="2" />
249 <param name="end_column" value="4" />
250 <param name="n_clusters" value="4"/>
251 <param name="threshold" value="0.008"/>
252 <output name="outfile" file="cluster_result04.txt"/>
253 </test>
254 <test>
255 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
256 <param name="selected_algorithm" value="Birch"/>
257 <param name="start_column" value="2" />
258 <param name="end_column" value="4" />
259 <param name="branching_factor" value="20"/>
260 <output name="outfile" file="cluster_result05.txt"/>
261 </test>
262 <test>
263 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
264 <param name="selected_algorithm" value="AffinityPropagation"/>
265 <param name="start_column" value="2" />
266 <param name="end_column" value="4" />
267 <param name="affinity" value="euclidean"/>
268 <param name="copy" value="false"/>
269 <output name="outfile" file="cluster_result06.txt"/>
270 </test>
271 <test>
272 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
273 <param name="selected_algorithm" value="AffinityPropagation"/>
274 <param name="start_column" value="2" />
275 <param name="end_column" value="4" />
276 <param name="damping" value="0.8"/>
277 <output name="outfile" file="cluster_result07.txt"/>
278 </test>
279 <test>
280 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
281 <param name="selected_algorithm" value="MeanShift"/>
282 <param name="start_column" value="2" />
283 <param name="end_column" value="4" />
284 <param name="min_bin_freq" value="3"/>
285 <output name="outfile" file="cluster_result08.txt"/>
286 </test>
287 <test>
288 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
289 <param name="selected_algorithm" value="MeanShift"/>
290 <param name="start_column" value="2" />
291 <param name="end_column" value="4" />
292 <param name="cluster_all" value="False"/>
293 <output name="outfile" file="cluster_result09.txt"/>
294 </test>
295 <test>
296 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
297 <param name="selected_algorithm" value="AgglomerativeClustering"/>
298 <param name="start_column" value="2" />
299 <param name="end_column" value="4" />
300 <param name="affinity" value="euclidean"/>
301 <param name="linkage" value="average"/>
302 <param name="n_clusters" value="4"/>
303 <output name="outfile" file="cluster_result10.txt"/>
304 </test>
305 <test>
306 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
307 <param name="selected_algorithm" value="AgglomerativeClustering"/>
308 <param name="start_column" value="2" />
309 <param name="end_column" value="4" />
310 <param name="linkage" value="complete"/>
311 <param name="n_clusters" value="4"/>
312 <output name="outfile" file="cluster_result11.txt"/>
313 </test>
314 <test>
315 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
316 <param name="selected_algorithm" value="SpectralClustering"/>
317 <param name="start_column" value="2" />
318 <param name="end_column" value="4" />
319 <param name="eigen_solver" value="arpack"/>
320 <param name="n_neighbors" value="12"/>
321 <param name="n_clusters" value="4"/>
322 <param name="assign_labels" value="discretize"/>
323 <param name="random_state" value="100"/>
324 <output name="outfile" file="cluster_result12.txt"/>
325 </test>
326 <test>
327 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
328 <param name="selected_algorithm" value="SpectralClustering"/>
329 <param name="start_column" value="2" />
330 <param name="end_column" value="4" />
331 <param name="assign_labels" value="discretize"/>
332 <param name="random_state" value="100"/>
333 <param name="degree" value="2"/>
334 <output name="outfile" file="cluster_result13.txt"/>
335 </test>
336 <test>
337 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
338 <param name="selected_algorithm" value="MiniBatchKMeans"/>
339 <param name="start_column" value="2" />
340 <param name="end_column" value="4" />
341 <param name="tol" value="0.5"/>
342 <param name="random_state" value="100"/>
343 <output name="outfile" file="cluster_result14.txt"/>
344 </test>
345 <test>
346 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
347 <param name="selected_algorithm" value="MiniBatchKMeans"/>
348 <param name="n_init" value="5"/>
349 <param name="start_column" value="2" />
350 <param name="end_column" value="4" />
351 <param name="batch_size" value="10"/>
352 <param name="n_clusters" value="4"/>
353 <param name="random_state" value="100"/>
354 <param name="reassignment_ratio" value="1.0"/>
355 <output name="outfile" file="cluster_result15.txt"/>
356 </test>
357 <test>
358 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
359 <param name="selected_algorithm" value="KMeans"/>
360 <param name="start_column" value="1" />
361 <param name="end_column" value="1" />
362 <param name="n_clusters" value="4" />
363 <param name="random_state" value="100"/>
364 <output name="outfile" file="cluster_result16.txt"/>
365 </test>
366 </tests>
367 <help><![CDATA[
368 **What it does**
369
370 This clustering tool offers different clustering algorithms which are provided by
371 scikit-learn to find similarities among samples and cluster the samples based on these similarities.
372
373 ]]></help>
374 <citations> <!-- scikit-learn reference shown with the tool; BibTeX fields must be comma-separated -->
375 <citation type="bibtex">
376 @article{scikit-learn,
377 title={Scikit-learn: Machine Learning in {P}ython},
378 author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
379 and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
380 and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
381 Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
382 journal={Journal of Machine Learning Research},
383 volume={12},
384 pages={2825--2830},
385 year={2011},
386 url = {https://github.com/scikit-learn/scikit-learn}
387 }
388 </citation>
389 </citations>
390 </tool>