Mercurial > repos > bgruening > sklearn_numeric_clustering
comparison numeric_clustering.xml @ 13:40f3318b61c2 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/sklearn commit 5d71c93a3dd804b1469852240a86021ab9130364
author | bgruening |
---|---|
date | Mon, 09 Jul 2018 14:32:01 -0400 |
parents | c7b8fab00c0f |
children | 6bbf5cb20652 |
comparison legend (row types used in the side-by-side diff below):
- equal
- deleted
- inserted
- replaced
12:3bd31820d63e | 13:40f3318b61c2 |
---|---|
20 import sklearn.cluster | 20 import sklearn.cluster |
21 import pandas | 21 import pandas |
22 from sklearn import metrics | 22 from sklearn import metrics |
23 from scipy.io import mmread | 23 from scipy.io import mmread |
24 | 24 |
25 @COLUMNS_FUNCTION@ | |
26 | |
25 input_json_path = sys.argv[1] | 27 input_json_path = sys.argv[1] |
26 params = json.load(open(input_json_path, "r")) | 28 params = json.load(open(input_json_path, "r")) |
27 | 29 |
28 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] | 30 selected_algorithm = params["input_types"]["algorithm_options"]["selected_algorithm"] |
29 | 31 |
35 | 37 |
36 #if $input_types.selected_input_type == "sparse": | 38 #if $input_types.selected_input_type == "sparse": |
37 data_matrix = mmread(open("$infile", 'r')) | 39 data_matrix = mmread(open("$infile", 'r')) |
38 #else: | 40 #else: |
39 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) | 41 data = pandas.read_csv("$infile", sep='\t', header=0, index_col=None, parse_dates=True, encoding=None, tupleize_cols=False ) |
40 | 42 header = 'infer' if params["input_types"]["header"] else None |
41 start_column = $input_types.start_column | 43 column_option = params["input_types"]["column_selector_options"]["selected_column_selector_option"] |
42 end_column = $input_types.end_column | 44 if column_option in ["by_index_number", "all_but_by_index_number", "by_header_name", "all_but_by_header_name"]: |
43 | 45 c = params["input_types"]["column_selector_options"]["col"] |
44 if end_column and start_column: | |
45 if end_column >= start_column: | |
46 data_matrix = data.values[:, start_column-1:end_column] | |
47 else: | |
48 data_matrix = data.values | |
49 else: | 46 else: |
50 data_matrix = data.values | 47 c = None |
48 data_matrix = read_columns( | |
49 "$infile", | |
50 c = c, | |
51 c_option = column_option, | |
52 sep='\t', | |
53 header=header, | |
54 parse_dates=True, | |
55 encoding=None, | |
56 tupleize_cols=False | |
57 ) | |
51 #end if | 58 #end if |
52 | 59 |
53 prediction = cluster_object.fit_predict( data_matrix ) | 60 prediction = cluster_object.fit_predict( data_matrix ) |
54 | 61 |
55 if len(np.unique(prediction)) > 1: | 62 if len(np.unique(prediction)) > 1: |
80 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/> | 87 <param name="infile" type="data" format="txt" label="Sparse vector (scipy.sparse.csr_matrix) file:" help="The following clustering algorithms support sparse matrix operations: ''Birch'', ''DBSCAN'', ''KMeans'', ''Mini BatchK Means'', and ''Spectral Clustering''. If your data is in tabular format, please use other clustering algorithms."/> |
81 <expand macro="clustering_algorithms_options"/> | 88 <expand macro="clustering_algorithms_options"/> |
82 </when> | 89 </when> |
83 <when value="tabular"> | 90 <when value="tabular"> |
84 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> | 91 <param name="infile" type="data" format="tabular" label="Data file with numeric values"/> |
85 <param name="start_column" type="data_column" data_ref="infile" optional="True" label="Select a subset of data. Start column:" /> | 92 <param name="header" type="boolean" optional="true" truevalue="booltrue" falsevalue="boolfalse" checked="True" label="Does the dataset contain header:" /> |
86 <param name="end_column" type="data_column" data_ref="infile" optional="True" label="End column:" /> | 93 <conditional name="column_selector_options"> |
94 <expand macro="samples_column_selector_options" col_name="col" multiple="true" infile="infile"/> | |
95 </conditional> | |
87 <!--expand macro="clustering_algorithms_options"--> | 96 <!--expand macro="clustering_algorithms_options"--> |
88 <conditional name="algorithm_options"> | 97 <conditional name="algorithm_options"> |
89 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> | 98 <param name="selected_algorithm" type="select" label="Clustering Algorithm"> |
90 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> | 99 <option value="AgglomerativeClustering">Hierarchical Agglomerative Clustering</option> |
91 <option value="AffinityPropagation">Affinity Propagation</option> | 100 <option value="AffinityPropagation">Affinity Propagation</option> |
166 <tests> | 175 <tests> |
167 <test> | 176 <test> |
168 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 177 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
169 <param name="selected_input_type" value="tabular"/> | 178 <param name="selected_input_type" value="tabular"/> |
170 <param name="selected_algorithm" value="KMeans"/> | 179 <param name="selected_algorithm" value="KMeans"/> |
171 <param name="start_column" value="2" /> | 180 <param name="col" value="2,3,4" /> |
172 <param name="end_column" value="4" /> | |
173 <param name="n_clusters" value="4" /> | 181 <param name="n_clusters" value="4" /> |
174 <param name="init" value="k-means++" /> | 182 <param name="init" value="k-means++" /> |
175 <param name="random_state" value="100"/> | 183 <param name="random_state" value="100"/> |
176 <output name="outfile" file="cluster_result01.txt"/> | 184 <output name="outfile" file="cluster_result01.txt"/> |
177 </test> | 185 </test> |
178 <test> | 186 <test> |
179 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 187 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
180 <param name="selected_algorithm" value="KMeans"/> | 188 <param name="selected_algorithm" value="KMeans"/> |
181 <param name="selected_input_type" value="tabular"/> | 189 <param name="selected_input_type" value="tabular"/> |
182 <param name="start_column" value="2" /> | 190 <param name="col" value="2,3,4" /> |
183 <param name="end_column" value="4" /> | |
184 <param name="n_clusters" value="4" /> | 191 <param name="n_clusters" value="4" /> |
185 <param name="init" value="random" /> | 192 <param name="init" value="random" /> |
186 <param name="random_state" value="100"/> | 193 <param name="random_state" value="100"/> |
187 <output name="outfile" file="cluster_result02.txt"/> | 194 <output name="outfile" file="cluster_result02.txt"/> |
188 </test> | 195 </test> |
189 <test> | 196 <test> |
190 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 197 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
191 <param name="selected_algorithm" value="DBSCAN"/> | 198 <param name="selected_algorithm" value="DBSCAN"/> |
192 <param name="selected_input_type" value="tabular"/> | 199 <param name="selected_input_type" value="tabular"/> |
193 <param name="start_column" value="2" /> | 200 <param name="col" value="2,3,4" /> |
194 <param name="end_column" value="4" /> | |
195 <param name="algorithm" value="kd_tree"/> | 201 <param name="algorithm" value="kd_tree"/> |
196 <param name="leaf_size" value="10"/> | 202 <param name="leaf_size" value="10"/> |
197 <param name="eps" value="1.0"/> | 203 <param name="eps" value="1.0"/> |
198 <output name="outfile" file="cluster_result03.txt"/> | 204 <output name="outfile" file="cluster_result03.txt"/> |
199 </test> | 205 </test> |
200 <test> | 206 <test> |
201 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 207 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
202 <param name="selected_algorithm" value="Birch"/> | 208 <param name="selected_algorithm" value="Birch"/> |
203 <param name="selected_input_type" value="tabular"/> | 209 <param name="selected_input_type" value="tabular"/> |
204 <param name="start_column" value="2" /> | 210 <param name="col" value="2,3,4" /> |
205 <param name="end_column" value="4" /> | |
206 <param name="n_clusters" value="4"/> | 211 <param name="n_clusters" value="4"/> |
207 <param name="threshold" value="0.008"/> | 212 <param name="threshold" value="0.008"/> |
208 <output name="outfile" file="cluster_result04.txt"/> | 213 <output name="outfile" file="cluster_result04.txt"/> |
209 </test> | 214 </test> |
210 <test> | 215 <test> |
211 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 216 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
212 <param name="selected_algorithm" value="Birch"/> | 217 <param name="selected_algorithm" value="Birch"/> |
213 <param name="selected_input_type" value="tabular"/> | 218 <param name="selected_input_type" value="tabular"/> |
214 <param name="start_column" value="2" /> | 219 <param name="col" value="2,3,4" /> |
215 <param name="end_column" value="4" /> | |
216 <param name="branching_factor" value="20"/> | 220 <param name="branching_factor" value="20"/> |
217 <output name="outfile" file="cluster_result05.txt"/> | 221 <output name="outfile" file="cluster_result05.txt"/> |
218 </test> | 222 </test> |
219 <test> | 223 <test> |
220 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 224 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
221 <param name="selected_algorithm" value="AffinityPropagation"/> | 225 <param name="selected_algorithm" value="AffinityPropagation"/> |
222 <param name="selected_input_type" value="tabular"/> | 226 <param name="selected_input_type" value="tabular"/> |
223 <param name="start_column" value="2" /> | 227 <param name="col" value="2,3,4" /> |
224 <param name="end_column" value="4" /> | |
225 <param name="affinity" value="euclidean"/> | 228 <param name="affinity" value="euclidean"/> |
226 <param name="copy" value="false"/> | 229 <param name="copy" value="false"/> |
227 <output name="outfile" file="cluster_result06.txt"/> | 230 <output name="outfile" file="cluster_result06.txt"/> |
228 </test> | 231 </test> |
229 <test> | 232 <test> |
230 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 233 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
231 <param name="selected_algorithm" value="AffinityPropagation"/> | 234 <param name="selected_algorithm" value="AffinityPropagation"/> |
232 <param name="selected_input_type" value="tabular"/> | 235 <param name="selected_input_type" value="tabular"/> |
233 <param name="start_column" value="2" /> | 236 <param name="col" value="2,3,4" /> |
234 <param name="end_column" value="4" /> | |
235 <param name="damping" value="0.8"/> | 237 <param name="damping" value="0.8"/> |
236 <output name="outfile" file="cluster_result07.txt"/> | 238 <output name="outfile" file="cluster_result07.txt"/> |
237 </test> | 239 </test> |
238 <test> | 240 <test> |
239 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 241 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
240 <param name="selected_algorithm" value="MeanShift"/> | 242 <param name="selected_algorithm" value="MeanShift"/> |
241 <param name="selected_input_type" value="tabular"/> | 243 <param name="selected_input_type" value="tabular"/> |
242 <param name="start_column" value="2" /> | 244 <param name="col" value="2,3,4" /> |
243 <param name="end_column" value="4" /> | |
244 <param name="min_bin_freq" value="3"/> | 245 <param name="min_bin_freq" value="3"/> |
245 <output name="outfile" file="cluster_result08.txt"/> | 246 <output name="outfile" file="cluster_result08.txt"/> |
246 </test> | 247 </test> |
247 <test> | 248 <test> |
248 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 249 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
249 <param name="selected_algorithm" value="MeanShift"/> | 250 <param name="selected_algorithm" value="MeanShift"/> |
250 <param name="selected_input_type" value="tabular"/> | 251 <param name="selected_input_type" value="tabular"/> |
251 <param name="start_column" value="2" /> | 252 <param name="col" value="2,3,4" /> |
252 <param name="end_column" value="4" /> | |
253 <param name="cluster_all" value="False"/> | 253 <param name="cluster_all" value="False"/> |
254 <output name="outfile" file="cluster_result09.txt"/> | 254 <output name="outfile" file="cluster_result09.txt"/> |
255 </test> | 255 </test> |
256 <test> | 256 <test> |
257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 257 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
258 <param name="selected_algorithm" value="AgglomerativeClustering"/> | 258 <param name="selected_algorithm" value="AgglomerativeClustering"/> |
259 <param name="selected_input_type" value="tabular"/> | 259 <param name="selected_input_type" value="tabular"/> |
260 <param name="start_column" value="2" /> | 260 <param name="col" value="2,3,4" /> |
261 <param name="end_column" value="4" /> | |
262 <param name="affinity" value="euclidean"/> | 261 <param name="affinity" value="euclidean"/> |
263 <param name="linkage" value="average"/> | 262 <param name="linkage" value="average"/> |
264 <param name="n_clusters" value="4"/> | 263 <param name="n_clusters" value="4"/> |
265 <output name="outfile" file="cluster_result10.txt"/> | 264 <output name="outfile" file="cluster_result10.txt"/> |
266 </test> | 265 </test> |
267 <test> | 266 <test> |
268 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 267 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
269 <param name="selected_algorithm" value="AgglomerativeClustering"/> | 268 <param name="selected_algorithm" value="AgglomerativeClustering"/> |
270 <param name="selected_input_type" value="tabular"/> | 269 <param name="selected_input_type" value="tabular"/> |
271 <param name="start_column" value="2" /> | 270 <param name="col" value="2,3,4" /> |
272 <param name="end_column" value="4" /> | |
273 <param name="linkage" value="complete"/> | 271 <param name="linkage" value="complete"/> |
274 <param name="n_clusters" value="4"/> | 272 <param name="n_clusters" value="4"/> |
275 <output name="outfile" file="cluster_result11.txt"/> | 273 <output name="outfile" file="cluster_result11.txt"/> |
276 </test> | 274 </test> |
277 <test> | 275 <test> |
278 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 276 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
279 <param name="selected_algorithm" value="SpectralClustering"/> | 277 <param name="selected_algorithm" value="SpectralClustering"/> |
280 <param name="selected_input_type" value="tabular"/> | 278 <param name="selected_input_type" value="tabular"/> |
281 <param name="start_column" value="2" /> | 279 <param name="col" value="2,3,4" /> |
282 <param name="end_column" value="4" /> | |
283 <param name="eigen_solver" value="arpack"/> | 280 <param name="eigen_solver" value="arpack"/> |
284 <param name="n_neighbors" value="12"/> | 281 <param name="n_neighbors" value="12"/> |
285 <param name="n_clusters" value="4"/> | 282 <param name="n_clusters" value="4"/> |
286 <param name="assign_labels" value="discretize"/> | 283 <param name="assign_labels" value="discretize"/> |
287 <param name="random_state" value="100"/> | 284 <param name="random_state" value="100"/> |
289 </test> | 286 </test> |
290 <test> | 287 <test> |
291 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 288 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
292 <param name="selected_algorithm" value="SpectralClustering"/> | 289 <param name="selected_algorithm" value="SpectralClustering"/> |
293 <param name="selected_input_type" value="tabular"/> | 290 <param name="selected_input_type" value="tabular"/> |
294 <param name="start_column" value="2" /> | 291 <param name="col" value="2,3,4" /> |
295 <param name="end_column" value="4" /> | |
296 <param name="assign_labels" value="discretize"/> | 292 <param name="assign_labels" value="discretize"/> |
297 <param name="random_state" value="100"/> | 293 <param name="random_state" value="100"/> |
298 <param name="degree" value="2"/> | 294 <param name="degree" value="2"/> |
299 <output name="outfile" file="cluster_result13.txt" compare="sim_size" /> | 295 <output name="outfile" file="cluster_result13.txt" compare="sim_size" /> |
300 </test> | 296 </test> |
301 <test> | 297 <test> |
302 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 298 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
303 <param name="selected_algorithm" value="MiniBatchKMeans"/> | 299 <param name="selected_algorithm" value="MiniBatchKMeans"/> |
304 <param name="selected_input_type" value="tabular"/> | 300 <param name="selected_input_type" value="tabular"/> |
305 <param name="start_column" value="2" /> | 301 <param name="col" value="2,3,4" /> |
306 <param name="end_column" value="4" /> | |
307 <param name="tol" value="0.5"/> | 302 <param name="tol" value="0.5"/> |
308 <param name="random_state" value="100"/> | 303 <param name="random_state" value="100"/> |
309 <output name="outfile" file="cluster_result14.txt"/> | 304 <output name="outfile" file="cluster_result14.txt"/> |
310 </test> | 305 </test> |
311 <test> | 306 <test> |
312 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 307 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
313 <param name="selected_algorithm" value="MiniBatchKMeans"/> | 308 <param name="selected_algorithm" value="MiniBatchKMeans"/> |
314 <param name="selected_input_type" value="tabular"/> | 309 <param name="selected_input_type" value="tabular"/> |
315 <param name="n_init" value="5"/> | 310 <param name="n_init" value="5"/> |
316 <param name="start_column" value="2" /> | 311 <param name="col" value="2,3,4" /> |
317 <param name="end_column" value="4" /> | |
318 <param name="batch_size" value="10"/> | 312 <param name="batch_size" value="10"/> |
319 <param name="n_clusters" value="4"/> | 313 <param name="n_clusters" value="4"/> |
320 <param name="random_state" value="100"/> | 314 <param name="random_state" value="100"/> |
321 <param name="reassignment_ratio" value="1.0"/> | 315 <param name="reassignment_ratio" value="1.0"/> |
322 <output name="outfile" file="cluster_result15.txt"/> | 316 <output name="outfile" file="cluster_result15.txt"/> |
323 </test> | 317 </test> |
324 <test> | 318 <test> |
325 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> | 319 <param name="infile" value="numeric_values.tabular" ftype="tabular"/> |
326 <param name="selected_algorithm" value="KMeans"/> | 320 <param name="selected_algorithm" value="KMeans"/> |
327 <param name="selected_input_type" value="tabular"/> | 321 <param name="selected_input_type" value="tabular"/> |
328 <param name="start_column" value="1" /> | 322 <param name="col" value="1" /> |
329 <param name="end_column" value="1" /> | |
330 <param name="n_clusters" value="4" /> | 323 <param name="n_clusters" value="4" /> |
331 <param name="random_state" value="100"/> | 324 <param name="random_state" value="100"/> |
332 <output name="outfile" file="cluster_result16.txt"/> | 325 <output name="outfile" file="cluster_result16.txt"/> |
333 </test> | 326 </test> |
334 <test> | 327 <test> |