Mercurial > repos > iuc > raceid_clustering
comparison raceid_clustering.xml @ 0:4ea021bd7513 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/raceid3 commit f880060c478d42202df5b78a81329f8af56b1138
author | iuc |
---|---|
date | Thu, 22 Nov 2018 04:43:57 -0500 |
parents | |
children | ee0bbc160cb1 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4ea021bd7513 |
---|---|
1 <tool id="raceid_clustering" name="Clustering using RaceID" version="@VERSION_RACEID@.@VERSION_PACKAGE@.1" > | |
2 <description>performs clustering, outlier detection, dimensional reduction</description> | |
3 <macros> | |
4 <import>macros.xml</import> | |
5 <import>macros_cluster.xml</import> | |
6 </macros> | |
7 <expand macro="requirements" /> | |
8 <!-- Reports the tool version by running scripts/cluster.R with the @GET_VERSION@ token (presumably defined in an imported macro file, not shown here) --><version_command><![CDATA[ | |
9 Rscript '$__tool_directory__/scripts/cluster.R' @GET_VERSION@ | |
10 ]]></version_command> | |
11 <!-- Main job: runs scripts/cluster.R with the generated config file ($userconf) as its only argument; failure is detected from the exit code. Shell redirections apply left to right, so "2>&1" points stderr at the job's stdout first and only stdout is then written to $outlog. NOTE(review): if the log was meant to hold stderr as well, the order would have to be > '$outlog' 2>&1; confirm the intent. --> | |
12 <command detect_errors="exit_code"><![CDATA[ | |
13 #set bin = 'cluster.R' | |
14 Rscript '$__tool_directory__/scripts/$bin' '$userconf' 2>&1 > '$outlog' | |
15 ]]></command> | |
16 | |
17 <configfiles> <!-- Builds the file passed to cluster.R as '$userconf'; its body consists only of the two tokens below, which are presumably defined in the imported macro files (not shown here) --> | |
18 <configfile name="userconf" ><![CDATA[ | |
19 @STRING2VECTOR@ | |
20 | |
21 @CLUSTER_CHEETAH@ | |
22 | |
23 ]]></configfile> | |
24 </configfiles> | |
25 <inputs> | |
26 <param name="inputrds" type="data" format="rdata" label="Input RaceID RDS" help="Requires the RDS output from the normalisation stage" /> <!-- NOTE(review): declared with format="rdata" while the label and help refer to RDS; confirm which serialisation cluster.R actually reads --> | |
27 <section name="clust" title="Clustering" expanded="true" > | |
28 <!-- CompDist: distance metric selection --> | |
29 <param name="metric" type="select" label="Distance Metric" > | |
30 <option value="pearson" selected="true" >Pearson</option> | |
31 <option value="spearman">Spearman</option> | |
32 <option value="logpearson">Log Pearson</option> | |
33 <option value="euclidean">Euclidean</option> | |
34 </param> | |
35 <!-- ClustExp: clustering method selection --> | |
36 <param name="funcluster" type="select" label="Clustering method" > | |
37 <option value="kmedoids" selected="true" >K-medoids</option> | |
38 <option value="kmeans">K-means</option> | |
39 <option value="hclust">H-Clust</option> | |
40 </param> | |
41 <expand macro="use_defaults_no" > | |
42 <!-- CompDist (wrapped by use_defaults_no). Boolean defaults are set with checked=, as for "sat" below; a value= attribute is not the documented way to preselect a boolean --> | |
43 <param name="fselect" type="boolean" checked="true" label="Perform feature selection" /> | |
44 <param name="knn" type="integer" min="0" optional="true" label="KNN" help="Number of nearest neighbours for imputing gene expression" /><!-- 0: NULL --> | |
45 <!-- ClustExp (wrapped by use_defaults_no) --> | |
46 <param name="sat" type="boolean" checked="true" label="Saturation-based clustering?" help="Determine number of clusters on saturation point of the mean within-cluster dispersion as a function of the cluster number." /> | |
47 <param name="clustnr" type="integer" min="0" value="30" label="Max number of clusters using Saturation-by-mean" help="Max number of clusters for the derivation of the cluster number by the saturation of mean within-cluster-dispersion." /> | |
48 <param name="samp" type="integer" min="0" optional="true" label="Sample random number of cells" help="Number of random sample of cells used for the inference of cluster number and Jaccard similarity" /><!-- 0:NULL --> | |
49 <param name="cln" type="integer" min="0" optional="true" label="Number of clusters" /><!-- 0:Null --> | |
50 <param name="bootnr" type="integer" min="0" value="50" label="Number of bootstrapping runs" /> | |
51 <param name="rseed" type="integer" value="17000" label="Random seed" /> | |
52 </expand> | |
53 </section> | |
54 <section name="outlier" title="Outliers" expanded="true" > | |
55 <!-- Find Outliers: thresholds shown directly in the section --> | |
56 <param name="outminc" type="integer" min="0" value="5" label="Minimum Transcripts" help="minimal transcript count of a gene in a cluster to be tested for being an outlier gene" /> | |
57 <param name="outlg" type="integer" min="1" value="2" label="Minimum Genes" help="Minimum number of outlier genes required for being an outlier cell" /> | |
58 <!-- RFCorrect. NOTE(review): "final" sets value= rather than checked=, unlike the other preselected booleans in this tool; confirm which default is intended before changing it, since the tests in this file rely on the current behaviour --> | |
59 <param name="final" type="boolean" value="true" label="Plot Final Clusters?" help="Reclassification of cell types using out-of-bag analysis is performed based on the final clusters after outlier identification. If 'FALSE', then the cluster partition prior to outlier identification is used for reclassification." /> | |
60 <expand macro="use_defaults_no" > | |
61 <!-- Find Outliers (wrapped by use_defaults_no) --> | |
62 <param name="probthr" type="float" min="0" value="0.001" label="Outlier Probability Threshold" help="Probability threshold for the above specified minimum number of genes to be an outlier cell. This probability is computed from a negative binomial background model of expression in a cluster" /> | |
63 <param name="outdistquant" type="float" min="0" max="1" value="0.95" label="Outlier Distance Quantile" help="Outlier cells are merged to outlier clusters if their distance is smaller than the outdistquant-quantile of the distance distribution of pairs of cells in the original clusters after outlier removal" /> | |
64 <!-- RFCorrect (wrapped by use_defaults_no) --> | |
65 <param name="nbtree" type="integer" optional="true" label="Number of trees to be built" /><!-- 0:Null --> | |
66 <param name="nbfactor" type="integer" min="0" value="5" label="Tree Factor" help="Number of trees based on the number of cells multiplied by this factor. Effective only if the number of trees parameter is set to 0" /> | |
67 <param name="rfseed" type="integer" value="12345" label="Random Seed" /> | |
68 </expand> | |
69 </section> | |
70 <section name="tsne" title="tSNE and FR" expanded="true" > | |
71 <!-- CompTSNE: t-SNE setting shown directly in the section --> | |
72 <param name="perplexity" type="integer" min="0" value="30" label="Perplexity" help="Perplexity of the t-SNE map" /> | |
73 <!-- CompFR: Fruchterman-Rheingold layout setting shown directly in the section; shares the name "knn" with a parameter of the clust section but lives in this section's own scope --> | |
74 <param name="knn" type="integer" min="0" value="10" label="KNN" help="Number of nearest neighbours used for the inference of the Fruchterman-Rheingold layout" /> | |
75 <expand macro="use_defaults_no" > | |
76 <!-- CompTSNE: settings wrapped by the use_defaults_no macro --> | |
77 <param name="initial_cmd" type="boolean" checked="true" label="tSNE map initialised by classical multidimensional scaling" /> | |
78 <param name="rseed_tsne" type="integer" value="15555" label="Random Seed (tSNE)" /> | |
79 <!-- CompFR: seed wrapped by the use_defaults_no macro; unlike rseed_tsne it declares min="0" --> | |
80 <param name="rseed_fr" type="integer" min="0" value="15555" label="Random Seed (FR)" /> | |
81 </expand> | |
82 </section> | |
83 <section name="extra" title="Extra Parameters" expanded="false" > <!-- Collapsed by default. Limits and cutoffs for the per-cluster gene table and plots; the help text asks for plotlim to be no larger than tablelim, but no validator enforcing that is declared here --> | |
84 <param name="tablelim" type="integer" min="1" value="25" label="Table Limit" help="Top N genes to print per cluster" /> | |
85 <param name="plotlim" type="integer" min="1" value="10" label="Plot Limit" help="Top N genes to plot. Must be less than or equal to the Table Limit" /> | |
86 <param name="foldchange" type="float" min="0" value="1" label="Fold change" /> | |
87 <param name="pvalue" type="float" min="0" max="1" value="0.01" label="P-value Cutoff" help="P-value cutoff for the inference of differential gene expression" /> | |
88 </section> | |
89 </inputs> | |
90 <outputs> <!-- Three unconditional datasets (PDF report, RDS object, per-cluster gene table) plus a log that is only kept when its filter expression is true --> | |
91 <data name="outpdf" format="pdf" label="${tool.name} on ${on_string}: PDF Report" /> | |
92 <data name="outrdat" format="rdata" label="${tool.name} on ${on_string}: RDS" /> | |
93 <data name="outgenelist" format="tabular" label="${tool.name} on ${on_string}: Cluster - Genes per Cluster" /> | |
94 <data name="outlog" format="txt" label="${tool.name} on ${on_string}: Log" > | |
95 <filter>use_log</filter> <!-- NOTE(review): no input named use_log is declared among the inputs visible in this file; confirm it is supplied by an imported macro, otherwise this filter cannot be evaluated --> | |
96 </data> | |
97 </outputs> | |
98 | |
99 <tests> | |
100 <test> <!-- Sets only the input dataset and leaves every other parameter unspecified; outrdat and outpdf are compared by file size within the given delta, outgenelist without an explicit compare mode --> | |
101 <param name="inputrds" value="matrix.filter.rdat" /> | |
102 <output name="outgenelist" value="matrix2.genelist" /> | |
103 <output name="outrdat" value="matrix2.rdat" compare="sim_size" delta="15" /> | |
104 <output name="outpdf" value="matrix2.pdf" compare="sim_size" delta="10" /> | |
105 </test> | |
106 <test> | |
107 <!-- defaults, but manually specified. No opts, no CC. Generates identical to above. Every param name here has to match an input declared in the corresponding section, otherwise the value is not applied. --> | |
108 <param name="inputrds" value="matrix.filter.rdat" /> | |
109 <section name="clust" > | |
110 <param name="metric" value="pearson" /> | |
111 <param name="funcluster" value="kmedoids" /> | |
112 <expand macro="test_nondef" > | |
113 <param name="fselect" value="true" /> | |
114 <param name="sat" value="true" /> | |
115 <param name="clustnr" value="30" /> | |
116 <param name="bootnr" value="50" /> | |
117 <param name="rseed" value="17000" /> | |
118 </expand> | |
119 </section> | |
120 <section name="outlier" > | |
121 <param name="outminc" value="5" /> | |
122 <param name="outlg" value="2" /> | |
123 <param name="final" value="false" /> | |
124 <expand macro="test_nondef" section_name="outlier" > | |
125 <param name="probthr" value="0.001" /> | |
126 <param name="outdistquant" value="0.95" /> | |
127 <param name="rfseed" value="12345" /> | |
128 <param name="nbfactor" value="5" /> | |
129 </expand> | |
130 </section> | |
131 <section name="tsne" > | |
132 <param name="perplexity" value="30" /> | |
133 <param name="knn" value="10" /> | |
134 <expand macro="test_nondef" section_name="tsne" > | |
135 <param name="initial_cmd" value="true" /> | |
136 <param name="rseed_tsne" value="15555" /> | |
137 <param name="rseed_fr" value="15555" /> <!-- FR layout seed; the tsne section declares it as rseed_fr --> | |
138 </expand> | |
139 </section> | |
140 <output name="outgenelist" value="intestinal.genelist" /> | |
141 <output name="outpdf" value="intestinal.pdf" compare="sim_size" delta="50" /> | |
142 </test> | |
143 <test> | |
144 <!-- Advanced. Opts, CC used. Overrides the metric, the clustering method and most numeric settings; every param name has to match an input declared in the corresponding section, otherwise the value is not applied. --> | |
145 <param name="inputrds" value="matrix.filter.rdat" /> | |
146 <section name="clust" > | |
147 <param name="metric" value="euclidean" /> | |
148 <param name="funcluster" value="hclust" /> | |
149 <expand macro="test_nondef" > | |
150 <param name="fselect" value="false" /> | |
151 <param name="knn" value="5" /> | |
152 <param name="sat" value="false" /> | |
153 <param name="samp" value="10" /> | |
154 <param name="cln" value="10" /> | |
155 <param name="clustnr" value="10" /> | |
156 <param name="bootnr" value="30" /> | |
157 <param name="rseed" value="17000" /> | |
158 </expand> | |
159 </section> | |
160 <section name="outlier" > | |
161 <param name="outminc" value="3" /> | |
162 <param name="outlg" value="5" /> | |
163 <param name="final" value="true" /> | |
164 <expand macro="test_nondef" > | |
165 <param name="probthr" value="0.01" /> | |
166 <param name="outdistquant" value="0.5" /> | |
167 <param name="rfseed" value="12345" /> | |
168 <param name="nbfactor" value="5" /> | |
169 <param name="nbtree" value="10" /> | |
170 </expand> | |
171 </section> | |
172 <section name="tsne" > | |
173 <param name="perplexity" value="20" /> | |
174 <param name="knn" value="6" /> | |
175 <expand macro="test_nondef" > | |
176 <param name="initial_cmd" value="false" /> | |
177 <param name="rseed_tsne" value="15555" /> | |
178 <param name="rseed_fr" value="15555" /> <!-- FR layout seed; the tsne section declares it as rseed_fr --> | |
179 </expand> | |
180 </section> | |
181 <output name="outgenelist" value="intestinal_advanced.genelist" /> | |
182 <output name="outpdf" value="intestinal_advanced.pdf" compare="sim_size" delta="150" /> | |
183 </test> | |
184 </tests> | |
185 <help><![CDATA[ | |
186 RaceID3 | |
187 ========= | |
188 | |
189 RaceID is a clustering algorithm for the identification of cell types from single-cell RNA-sequencing data. It was specifically designed for the detection of rare cells which correspond to outliers in conventional clustering methods. | |
190 | |
191 This module performs clustering, and outlier detection and ultimately tells you how well defined your clusters are (even if the resultant tSNE plots look messy). | |
192 | |
193 The tool generates the following: | |
194 | |
195 * A list of the most differentially expressed genes in each cluster | |
196 * Cluster stability plots: | |
197 * The mean within-cluster dispersion as a function of the cluster number, with the saturation point inferred from the saturation criterion applied by RaceID3. The saturation point, i.e. the number of clusters at which the change in within-cluster dispersion upon adding further clusters approaches linear behaviour, is highlighted in blue. | |
198 * The point where this flattens out gives you a rough estimate of how many clusters there are in your analysis. | |
199 * A scatter plot showing the gene expression variance as a function of the mean and the inferred polynomial fit of the background model, as well as a local regression. | |
200 * This tells you which genes are the most significant against a background model of random expression. | |
201 * A Jaccard stability plot which tells you how well defined your clusters are prior to outlier identification. | |
202 * Good stable clusters should be near the 0.8 mark, but 0.6 is acceptable, and for a large number of clusters, one or two low scoring clusters are also acceptable. | |
203 * Heatmaps: | |
204 * The initial and final clustering (as determined using random forest) | |
205 * For each of the most differentially expressed genes in each cluster | |
206 | |
207 The tool requires the RData input from the previous filtering / normalisation / confounder removal step to work. | |
208 | |
209 | |
210 ]]> | |
211 </help> | |
212 <expand macro="citations" /> | |
213 </tool> |