annotate cobraxy-9688ad27287b/COBRAxy/marea_cluster.xml @ 90:a48b2e06ebe7 draft

Uploaded
author luca_milaz
date Sun, 13 Oct 2024 11:35:56 +0000
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
90
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
1 <tool id="MaREAcluster" name="Cluster Analysis" version="2.0.0">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
2 <description></description>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
3 <macros>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
4 <import>marea_macros.xml</import>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
5 </macros>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
6 <requirements>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
7 <requirement type="package" version="1.24.4">numpy</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
8 <requirement type="package" version="2.0.3">pandas</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
9 <requirement type="package" version="1.10.1">scipy</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
10 <requirement type="package" version="1.3.2">scikit-learn</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
11 <requirement type="package" version="3.7.3">matplotlib</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
12 <requirement type="package" version="5.2.2">lxml</requirement>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
13 </requirements>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
14 <command detect_errors="exit_code">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
15 <![CDATA[
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
16 python '$__tool_directory__/marea_cluster.py'
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
17 --input '$input'
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
18 --tool_dir '$__tool_directory__'
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
19 --out_log '$log'
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
20 --best_cluster '$best_cluster'
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
21 --cluster_type ${data.clust_type}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
22 #if $data.clust_type == 'kmeans':
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
23 --k_min ${data.k_min}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
24 --k_max ${data.k_max}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
25 --elbow ${data.elbow}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
26 --silhouette ${data.silhouette}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
27 #end if
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
28 #if $data.clust_type == 'dbscan':
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
29 #if $data.dbscan_advanced.advanced == 'true':
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
30 --eps ${data.dbscan_advanced.eps}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
31 --min_samples ${data.dbscan_advanced.min_samples}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
32 #end if
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
33 #end if
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
34 #if $data.clust_type == 'hierarchy':
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
35 --k_min ${data.k_min}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
36 --k_max ${data.k_max}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
37 --silhouette ${data.silhouette}
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
38 #end if
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
39 ]]>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
40 </command>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
41 <inputs>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
42 <param name="input" argument="--input" type="data" format="tabular, csv, tsv" label="Input dataset" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
43
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
44 <conditional name="data">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
45 <param name="clust_type" argument="--cluster_type" type="select" label="Choose clustering type:">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
46 <option value="kmeans" selected="true">KMeans</option>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
47 <option value="dbscan">DBSCAN</option>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
48 <option value="hierarchy">Agglomerative Hierarchical</option>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
49 </param>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
50 <when value="kmeans">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
51 <param name="k_min" argument="--k_min" type="integer" min="2" max="20" value="2" label="Min number of clusters (k) to be tested" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
52 <param name="k_max" argument="--k_max" type="integer" min="2" max="20" value="3" label="Max number of clusters (k) to be tested" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
53 <param name="elbow" argument="--elbow" type="boolean" checked="true" label="Draw the elbow plot from k-min to k-max"/> <!-- boolean defaults are set with "checked" -->
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
54 <param name="silhouette" argument="--silhouette" type="boolean" checked="true" label="Draw the Silhouette plot from k-min to k-max"/> <!-- boolean defaults are set with "checked" -->
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
55 </when>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
56 <when value="dbscan">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
57 <conditional name="dbscan_advanced">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
58 <param name="advanced" type="boolean" checked="false" label="Want to use custom params for DBSCAN? (if not optimal values will be used)" /> <!-- boolean test param: takes no option children; the when branches below match its true/false values -->
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
62 <when value="false"></when>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
63 <when value="true">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
64 <param name="eps" argument="--eps" type="float" value="0.5" label="Epsilon - The maximum distance between two samples for one to be considered as in the neighborhood of the other" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
65 <param name="min_samples" argument="--min_samples" type="integer" value="5" label="Min samples - The number of samples in a neighborhood for a point to be considered as a core point (this includes the point itself)"/>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
66
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
67 </when>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
68 </conditional>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
69 </when>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
70 <when value="hierarchy">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
71 <param name="k_min" argument="--k_min" type="integer" min="2" max="20" value="2" label="Min number of clusters (k) to be tested" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
72 <param name="k_max" argument="--k_max" type="integer" min="3" max="20" value="3" label="Max number of clusters (k) to be tested" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
73 <param name="silhouette" argument="--silhouette" type="boolean" checked="true" label="Draw the Silhouette plot from k-min to k-max"/> <!-- boolean defaults are set with "checked" -->
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
74 </when>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
75 </conditional>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
76 </inputs>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
77
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
78 <outputs>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
79 <data format="txt" name="log" label="${tool.name} - Log" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
80 <data format="tabular" name="best_cluster" label="${tool.name} - best cluster assignment" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
81 <collection name="results" type="list" label="${tool.name} - Plots and results">
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
82 <discover_datasets pattern="__name_and_ext__" directory="clustering"/>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
83 <filter>data['clust_type'] == "kmeans" or data['clust_type'] == "hierarchy"</filter>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
84 </collection>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
85 </outputs>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
86 <help>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
87 <![CDATA[
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
88
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
89 What it does
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
90 -------------
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
91
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
92 The tool performs cluster analysis of any dataset, according to most used algorithms: K-means, agglomerative
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
93 clustering and DBSCAN (Density Based Spatial Clustering of Applications with Noise).
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
94
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
95 Accepted files are:
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
96 - Tabular files in which rows indicate different variables and columns different observations. The first row reports the observations’ labels.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
97
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
98
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
99 Example of input dataset:
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
100 -------------------------
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
101
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
102 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
103 |TCGAA62670|TCGAA62671|TCGAA62672|
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
104 +==========+==========+==========+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
105 | 0.523167 | 0.371355 | 0.925661 |
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
106 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
107 | 0.568765 | 0.765567 | 0.456789 |
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
108 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
109 | 0.876545 | 0.768933 | 0.987654 |
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
110 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
111 | 0.456788 | 0.876543 | 0.876542 |
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
112 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
113 | 0.876543 | 0.786543 | 0.897654 |
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
114 +----------+----------+----------+
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
115
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
116 .
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
117
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
118
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
119 Options:
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
120 --------
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
121
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
122 The following clustering types can be chosen:
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
123 - K-means. This option requires the number of clusters (k) to be set. Different values of k can be tested.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
124 - Agglomerative clustering. Different values of k can be set, to cut the resulting dendrogram.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
125 - DBSCAN. The DBSCAN method chooses the number of clusters based on parameters that define when a region is to be considered dense. Custom parameters may be used, namely the maximum distance between two samples for one to be considered as in the neighborhood of the other and the number of samples in a neighborhood for a point to be considered as a core point.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
126
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
127 The tool generates:
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
128 - a tab-separated file: reporting the affiliation of each observation to a cluster. In case different numbers of clusters have been tested, the best cluster assignment is reported according to maximum average silhouette score. If desired, the elbow plot is generated, as well as silhouette plot for each k.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
129 - a list of items, including: 1) the cluster assignment for each tested number of clusters 2) the dendrogram in case of agglomerative clustering 3) elbow and silhouette plots in case of k-means clustering.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
130 - a log file (.txt).
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
131
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
132
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
133 .. class:: infomark
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
134
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
135 **TIP**: This tool has been conceived to cluster gene expression data, by using the RAS scores computed by `Ras tool`_.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
136
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
137 .. class:: infomark
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
138
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
139 **TIP**: If your data is not TAB delimited, use `Convert delimiters to TAB`_.
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
140
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
141 @REFERENCE@
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
142
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
143 .. _Ras tool: http://bimib.disco.unimib.it:5555/?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fbimib%2Fmarea%2FMaREA+RAS+Generator%2F1.0.6&version=1.0.6&__identifer=auulv6gbp76
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
144 .. _Convert delimiters to TAB: http://bimib.disco.unimib.it:5555/?tool_id=Convert+characters1&version=1.0.0&__identifer=76g7trea4j6
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
145
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
146 ]]>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
147 </help>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
148 <expand macro="citations" />
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
149 </tool>
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
150
a48b2e06ebe7 Uploaded
luca_milaz
parents:
diff changeset
151