comparison numeric_clustering.xml @ 0:c7b8fab00c0f draft

planemo upload for repository https://github.com/bgruening/galaxytools/tools/sklearn commit 0e582cf1f3134c777cce3aa57d71b80ed95e6ba9
author bgruening
date Fri, 16 Feb 2018 09:17:59 -0500
parents
children 40f3318b61c2
comparison
equal deleted inserted replaced
-1:000000000000 0:c7b8fab00c0f
1 <tool id="sklearn_numeric_clustering" name="Numeric Clustering" version="@VERSION@">
2 <description></description>
3 <macros>
4 <import>main_macros.xml</import>
5 </macros>
6 <expand macro="python_requirements"/>
7 <expand macro="macro_stdio"/>
8 <version_command>echo "@VERSION@"</version_command>
<!-- Runs the generated clustering script; its only argument is the path of the
     JSON dump of the tool parameters (the "inputs" config file). -->
<command><![CDATA[python "$cluster_script" '$inputs']]></command>
<configfiles>
    <!-- JSON dump of the tool parameters; the script reads its path from argv[1]. -->
    <inputs name="inputs"/>
    <!-- Cheetah-templated Python script. It fits the selected sklearn.cluster
         estimator on the input data, prints a silhouette score to stdout and
         writes the predicted labels to the output dataset. Lines starting with
         "##" are Cheetah comments and are not part of the rendered script. -->
    <configfile name="cluster_script">
<![CDATA[
import sys
import json
import numpy as np
import sklearn.cluster
import pandas
from sklearn import metrics
from scipy.io import mmread

## The tool parameters arrive as a JSON file whose path is the first argument.
input_json_path = sys.argv[1]
with open(input_json_path, "r") as param_handler:
    params = json.load(param_handler)

selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"]

## Look the estimator class up by name, build it with defaults, then apply the
## user supplied options on top.
my_class = getattr(sklearn.cluster, selected_algorithm)
cluster_object = my_class()
options = params["input_types"]["algorithm_options"]["options"]

cluster_object.set_params(**options)

#if $input_types.selected_input_type == "sparse":
## Hand mmread the path so it opens and closes the file itself.
data_matrix = mmread("$infile")
#else:
data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None)

start_column = $input_types.start_column
end_column = $input_types.end_column

## The column range is 1-based and inclusive. It is applied only when both
## bounds are set and ordered; otherwise every column is used.
if end_column and start_column and end_column >= start_column:
    data_matrix = data.values[:, start_column-1:end_column]
else:
    data_matrix = data.values
#end if

prediction = cluster_object.fit_predict( data_matrix )

## Only score the clustering when more than one distinct label was predicted;
## otherwise report -1.
if len(np.unique(prediction)) > 1:
    silhouette_score = metrics.silhouette_score(data_matrix,prediction,metric='euclidean')
else:
    silhouette_score = -1
sys.stdout.write('silhouette score:' + '\t' + str(silhouette_score) + '\n')

prediction_df = pandas.DataFrame(prediction)

## Sparse input yields the labels alone; tabular input gets the labels appended
## as an extra column.
#if $input_types.selected_input_type == "sparse":
res = prediction_df
#else:
res = pandas.concat([data, prediction_df], axis=1)
#end if

res.to_csv(path_or_buf = "$outfile", sep="\t", index=False, header=False)
]]>
    </configfile>
</configfiles>
<inputs>
    <!-- The chosen input format selects which data parameters and which
         clustering algorithm options are offered. -->
    <conditional name="input_types">
        <param name="selected_input_type" type="select" label="Select the format of input data">
            <option value="tabular" selected="true">Tabular Format (tabular, txt)</option>
            <option value="sparse">Sparse Vector Representation (mtx)</option>
        </param>
        <when value="sparse">
            <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini Batch KMeans'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/>
            <expand macro="clustering_algorithms_options"/>
        </when>
        <when value="tabular">
            <param name="infile" type="data" format="tabular" label="Data file with numeric values"/>
            <!-- Optional column range; the script uses it only when both ends are set. -->
            <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" />
            <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" />
            <!--expand macro="clustering_algorithms_options"-->
            <conditional name="algorithm_options">
                <!-- Option values are looked up by the script as attribute names on sklearn.cluster. -->
                <param name="selected_algorithm" type="select" label="Clustering Algorithm">
                    <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option>
                    <option value="AffinityPropagation">Affinity Propagation</option>
                    <option value="SpectralClustering">Spectral Clustering</option>
                    <option value="MiniBatchKMeans">Mini Batch KMeans</option>
                    <option value="MeanShift">MeanShift</option>
                    <option value="KMeans">KMeans</option>
                    <option value="DBSCAN">DBSCAN</option>
                    <option value="Birch">Birch</option>
                </param>
                <when value="KMeans">
                    <expand macro="kmeans_advanced_options"/>
                </when>
                <when value="DBSCAN">
                    <expand macro="dbscan_advanced_options"/>
                </when>
                <when value="Birch">
                    <expand macro="birch_advanced_options"/>
                </when>
                <when value="SpectralClustering">
                    <expand macro="spectral_clustering_advanced_options"/>
                </when>
                <when value="MiniBatchKMeans">
                    <expand macro="minibatch_kmeans_advanced_options"/>
                </when>
                <when value="AffinityPropagation">
                    <section name="options" title="Advanced Options" expanded="False">
                        <param argument="damping" type="float" optional="true" value="0.5" label="Damping factor" help="Damping factor between 0.5 and 1."/>
                        <expand macro="max_iter" default_value="200"/>
                        <param argument="convergence_iter" type="integer" optional="true" value="15" label="Number of iterations at each convergence step" help="Number of iterations with no change in the number of estimated clusters that stops the convergence."/>
                        <param argument="copy" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Copy" help="If False, the affinity matrix is modified inplace by the algorithm, for memory efficiency."/>
                        <!--param argument="preference"/-->
                        <param argument="affinity" type="select" label="Affinity" help="Affinity to use; euclidean uses the negative squared euclidean distance between points.">
                            <option value="euclidean">Euclidean</option>
                            <option value="precomputed">precomputed</option>
                        </param>
                    </section>
                </when>
                <when value="MeanShift">
                    <section name="options" title="Advanced Options" expanded="False">
                        <param argument="bandwidth" type="float" optional="true" value="" label="Kernel bandwidth" help="Bandwidth used in the RBF kernel. If not given, it will be computed using a heuristic based on the median of all pairwise distances."/>
                        <!--param argument="seeds"/-->
                        <param argument="bin_seeding" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Discretize initial kernel locations" help="If true, initial kernel locations are the bins grid whose coarseness corresponds to the bandwidth, speeding up the algorithm."/>
                        <param argument="min_bin_freq" type="integer" optional="true" value="1" label="Minimum number of seeds per bin" help="To speed up the algorithm, accept only those bins with at least min_bin_freq points as seeds."/>
                        <param argument="cluster_all" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="true" label="Cluster all" help="If true, all points (including orphans) are clustered. If false, orphans are given cluster label -1."/>
                    </section>
                </when>
                <when value="AgglomerativeClustering">
                    <section name="options" title="Advanced Options" expanded="False">
                        <expand macro="n_clusters" default_value="2" />
                        <param argument="affinity" type="select" label="Affinity" help="Metric used to compute the linkage. If linkage is ''ward'', only ''euclidean'' is accepted.">
                            <option value="euclidean">Euclidean</option>
                            <option value="manhattan">Manhattan</option>
                            <option value="l1">L1</option>
                            <option value="l2">L2</option>
                            <option value="cosine">cosine</option>
                            <option value="precomputed">precomputed</option>
                        </param>
                        <!--param argument="memory"-->
                        <!--param argument="connectivity"-->
                        <!--param argument="n_components"/-->
                        <!--param argument="compute_full_tree"-->
                        <param argument="linkage" type="select" optional="true" label="Linkage" help="">
                            <option value="ward" selected="true">ward</option>
                            <option value="complete">complete</option>
                            <option value="average">average</option>
                        </param>
                        <!--param argument="pooling_func"-->
                    </section>
                </when>
            </conditional>
        </when>
    </conditional>
</inputs>
<outputs>
    <!-- Tab-separated result file written by the clustering script. -->
    <data name="outfile" format="tabular"/>
</outputs>
166 <tests>
167 <test>
168 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
169 <param name="selected_input_type" value="tabular"/>
170 <param name="selected_algorithm" value="KMeans"/>
171 <param name="start_column" value="2" />
172 <param name="end_column" value="4" />
173 <param name="n_clusters" value="4" />
174 <param name="init" value="k-means++" />
175 <param name="random_state" value="100"/>
176 <output name="outfile" file="cluster_result01.txt"/>
177 </test>
178 <test>
179 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
180 <param name="selected_algorithm" value="KMeans"/>
181 <param name="selected_input_type" value="tabular"/>
182 <param name="start_column" value="2" />
183 <param name="end_column" value="4" />
184 <param name="n_clusters" value="4" />
185 <param name="init" value="random" />
186 <param name="random_state" value="100"/>
187 <output name="outfile" file="cluster_result02.txt"/>
188 </test>
189 <test>
190 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
191 <param name="selected_algorithm" value="DBSCAN"/>
192 <param name="selected_input_type" value="tabular"/>
193 <param name="start_column" value="2" />
194 <param name="end_column" value="4" />
195 <param name="algorithm" value="kd_tree"/>
196 <param name="leaf_size" value="10"/>
197 <param name="eps" value="1.0"/>
198 <output name="outfile" file="cluster_result03.txt"/>
199 </test>
200 <test>
201 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
202 <param name="selected_algorithm" value="Birch"/>
203 <param name="selected_input_type" value="tabular"/>
204 <param name="start_column" value="2" />
205 <param name="end_column" value="4" />
206 <param name="n_clusters" value="4"/>
207 <param name="threshold" value="0.008"/>
208 <output name="outfile" file="cluster_result04.txt"/>
209 </test>
210 <test>
211 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
212 <param name="selected_algorithm" value="Birch"/>
213 <param name="selected_input_type" value="tabular"/>
214 <param name="start_column" value="2" />
215 <param name="end_column" value="4" />
216 <param name="branching_factor" value="20"/>
217 <output name="outfile" file="cluster_result05.txt"/>
218 </test>
219 <test>
220 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
221 <param name="selected_algorithm" value="AffinityPropagation"/>
222 <param name="selected_input_type" value="tabular"/>
223 <param name="start_column" value="2" />
224 <param name="end_column" value="4" />
225 <param name="affinity" value="euclidean"/>
226 <param name="copy" value="false"/>
227 <output name="outfile" file="cluster_result06.txt"/>
228 </test>
229 <test>
230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
231 <param name="selected_algorithm" value="AffinityPropagation"/>
232 <param name="selected_input_type" value="tabular"/>
233 <param name="start_column" value="2" />
234 <param name="end_column" value="4" />
235 <param name="damping" value="0.8"/>
236 <output name="outfile" file="cluster_result07.txt"/>
237 </test>
238 <test>
239 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
240 <param name="selected_algorithm" value="MeanShift"/>
241 <param name="selected_input_type" value="tabular"/>
242 <param name="start_column" value="2" />
243 <param name="end_column" value="4" />
244 <param name="min_bin_freq" value="3"/>
245 <output name="outfile" file="cluster_result08.txt"/>
246 </test>
247 <test>
248 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
249 <param name="selected_algorithm" value="MeanShift"/>
250 <param name="selected_input_type" value="tabular"/>
251 <param name="start_column" value="2" />
252 <param name="end_column" value="4" />
253 <param name="cluster_all" value="False"/>
254 <output name="outfile" file="cluster_result09.txt"/>
255 </test>
256 <test>
257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
258 <param name="selected_algorithm" value="AgglomerativeClustering"/>
259 <param name="selected_input_type" value="tabular"/>
260 <param name="start_column" value="2" />
261 <param name="end_column" value="4" />
262 <param name="affinity" value="euclidean"/>
263 <param name="linkage" value="average"/>
264 <param name="n_clusters" value="4"/>
265 <output name="outfile" file="cluster_result10.txt"/>
266 </test>
267 <test>
268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
269 <param name="selected_algorithm" value="AgglomerativeClustering"/>
270 <param name="selected_input_type" value="tabular"/>
271 <param name="start_column" value="2" />
272 <param name="end_column" value="4" />
273 <param name="linkage" value="complete"/>
274 <param name="n_clusters" value="4"/>
275 <output name="outfile" file="cluster_result11.txt"/>
276 </test>
<!-- SpectralClustering on tabular columns 2-4 with the arpack eigen solver.
     The result is compared by size only (sim_size). -->
<test>
    <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
    <param name="selected_algorithm" value="SpectralClustering"/>
    <param name="selected_input_type" value="tabular"/>
    <param name="start_column" value="2" />
    <param name="end_column" value="4" />
    <param name="eigen_solver" value="arpack"/>
    <param name="n_neighbors" value="12"/>
    <param name="n_clusters" value="4"/>
    <param name="assign_labels" value="discretize"/>
    <param name="random_state" value="100"/>
    <!-- Expected file name carries the .txt extension used by every other test in this file. -->
    <output name="outfile" file="cluster_result12.txt" compare="sim_size" />
</test>
290 <test>
291 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
292 <param name="selected_algorithm" value="SpectralClustering"/>
293 <param name="selected_input_type" value="tabular"/>
294 <param name="start_column" value="2" />
295 <param name="end_column" value="4" />
296 <param name="assign_labels" value="discretize"/>
297 <param name="random_state" value="100"/>
298 <param name="degree" value="2"/>
299 <output name="outfile" file="cluster_result13.txt" compare="sim_size" />
300 </test>
301 <test>
302 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
303 <param name="selected_algorithm" value="MiniBatchKMeans"/>
304 <param name="selected_input_type" value="tabular"/>
305 <param name="start_column" value="2" />
306 <param name="end_column" value="4" />
307 <param name="tol" value="0.5"/>
308 <param name="random_state" value="100"/>
309 <output name="outfile" file="cluster_result14.txt"/>
310 </test>
311 <test>
312 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
313 <param name="selected_algorithm" value="MiniBatchKMeans"/>
314 <param name="selected_input_type" value="tabular"/>
315 <param name="n_init" value="5"/>
316 <param name="start_column" value="2" />
317 <param name="end_column" value="4" />
318 <param name="batch_size" value="10"/>
319 <param name="n_clusters" value="4"/>
320 <param name="random_state" value="100"/>
321 <param name="reassignment_ratio" value="1.0"/>
322 <output name="outfile" file="cluster_result15.txt"/>
323 </test>
324 <test>
325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/>
326 <param name="selected_algorithm" value="KMeans"/>
327 <param name="selected_input_type" value="tabular"/>
328 <param name="start_column" value="1" />
329 <param name="end_column" value="1" />
330 <param name="n_clusters" value="4" />
331 <param name="random_state" value="100"/>
332 <output name="outfile" file="cluster_result16.txt"/>
333 </test>
334 <test>
335 <param name="infile" value="sparse.mtx" ftype="txt"/>
336 <param name="selected_input_type" value="sparse"/>
337 <param name="selected_algorithm" value="KMeans"/>
338 <param name="n_clusters" value="2" />
339 <param name="init" value="k-means++" />
340 <param name="random_state" value="100"/>
341 <output name="outfile" file="cluster_result17.txt"/>
342 </test>
343 <test>
344 <param name="infile" value="sparse.mtx" ftype="txt"/>
345 <param name="selected_algorithm" value="DBSCAN"/>
346 <param name="selected_input_type" value="sparse"/>
347 <param name="algorithm" value="kd_tree"/>
348 <param name="leaf_size" value="10"/>
349 <param name="eps" value="1.0"/>
350 <output name="outfile" file="cluster_result18.txt"/>
351 </test>
352 <test>
353 <param name="infile" value="sparse.mtx" ftype="txt"/>
354 <param name="selected_algorithm" value="Birch"/>
355 <param name="selected_input_type" value="sparse"/>
356 <param name="n_clusters" value="2"/>
357 <param name="threshold" value="0.008"/>
358 <output name="outfile" file="cluster_result19.txt"/>
359 </test>
360 <test>
361 <param name="infile" value="sparse.mtx" ftype="txt"/>
362 <param name="selected_algorithm" value="MiniBatchKMeans"/>
363 <param name="selected_input_type" value="sparse"/>
364 <param name="n_init" value="5"/>
365 <param name="batch_size" value="10"/>
366 <param name="n_clusters" value="2"/>
367 <param name="random_state" value="100"/>
368 <param name="reassignment_ratio" value="1.0"/>
369 <output name="outfile" file="cluster_result20.txt"/>
370 </test>
371 <test>
372 <param name="infile" value="sparse.mtx" ftype="txt"/>
373 <param name="selected_algorithm" value="SpectralClustering"/>
374 <param name="selected_input_type" value="sparse"/>
375 <param name="assign_labels" value="discretize"/>
376 <param name="n_clusters" value="2"/>
377 <param name="random_state" value="100"/>
378 <param name="degree" value="2"/>
379 <output name="outfile" file="cluster_result21.txt"/>
380 </test>
381 </tests>
382 <help><![CDATA[
383 **What it does**
384 This tool offers different clustering algorithms which are provided by
385 scikit-learn to find similarities among samples and cluster the samples based on these similarities.
386 ]]></help>
387 <expand macro="sklearn_citation"/>
388 </tool>