Mercurial > repos > artbio > gsc_high_dimensions_visualisation
comparison high_dim_visu.R @ 0:cad0001b9cfb draft
planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author | artbio |
---|---|
date | Mon, 24 Jun 2019 13:39:11 -0400 |
parents | |
children | 7e7a2a4cfce2 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:cad0001b9cfb |
---|---|
1 # load packages that are provided in the conda env | |
2 options( show.error.messages=F, | |
3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } ) | |
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8") | |
5 requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify') | |
6 warnings() | |
7 library(optparse) | |
8 library(FactoMineR) | |
9 library(factoextra) | |
10 library(Rtsne) | |
11 library(ggplot2) | |
12 library(ggfortify) | |
13 library(RColorBrewer) | |
14 | |
15 | |
16 # Arguments | |
17 option_list = list( | |
18 make_option( | |
19 "--data", | |
20 default = NA, | |
21 type = 'character', | |
22 help = "Input file that contains expression value to visualise" | |
23 ), | |
24 make_option( | |
25 "--sep", | |
26 default = '\t', | |
27 type = 'character', | |
28 help = "File separator [default : '%default' ]" | |
29 ), | |
30 make_option( | |
31 "--colnames", | |
32 default = TRUE, | |
33 type = 'logical', | |
34 help = "Consider first line as header ? [default : '%default' ]" | |
35 ), | |
36 make_option( | |
37 "--out", | |
38 default = "res.tab", | |
39 type = 'character', | |
40 help = "Output name [default : '%default' ]" | |
41 ), | |
42 make_option( | |
43 "--labels", | |
44 default = FALSE, | |
45 type = 'logical', | |
46 help = "add labels in scatter plots [default : '%default' ]" | |
47 ), | |
48 make_option( | |
49 "--factor", | |
50 default = '', | |
51 type = 'character', | |
52 help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]" | |
53 ), | |
54 make_option( | |
55 "--visu_choice", | |
56 default = 'PCA', | |
57 type = 'character', | |
58 help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]" | |
59 ), | |
60 make_option( | |
61 "--table_coordinates", | |
62 default = '', | |
63 type = 'character', | |
64 help = "Table with plot coordinates [default : '%default' ]" | |
65 ), | |
66 make_option( | |
67 "--Rtsne_seed", | |
68 default = 42, | |
69 type = 'integer', | |
70 help = "Seed value for reproducibility [default : '%default' ]" | |
71 ), | |
72 make_option( | |
73 "--Rtsne_dims", | |
74 default = 2, | |
75 type = 'integer', | |
76 help = "Output dimensionality [default : '%default' ]" | |
77 ), | |
78 make_option( | |
79 "--Rtsne_initial_dims", | |
80 default = 50, | |
81 type = 'integer', | |
82 help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]" | |
83 ), | |
84 make_option( | |
85 "--Rtsne_perplexity", | |
86 default = 5.0, | |
87 type = 'numeric', | |
88 help = "perplexity [default : '%default' ]" | |
89 ), | |
90 make_option( | |
91 "--Rtsne_theta", | |
92 default = 1.0, | |
93 type = 'numeric', | |
94 help = "theta [default : '%default' ]" | |
95 ), | |
96 make_option( | |
97 "--Rtsne_max_iter", | |
98 default = 1000, | |
99 type = 'integer', | |
100 help = "max_iter [default : '%default' ]" | |
101 ), | |
102 make_option( | |
103 "--Rtsne_pca", | |
104 default = TRUE, | |
105 type = 'logical', | |
106 help = "Whether an initial PCA step should be performed [default : '%default' ]" | |
107 ), | |
108 make_option( | |
109 "--Rtsne_pca_center", | |
110 default = TRUE, | |
111 type = 'logical', | |
112 help = "Should data be centered before pca is applied? [default : '%default' ]" | |
113 ), | |
114 make_option( | |
115 "--Rtsne_pca_scale", | |
116 default = FALSE, | |
117 type = 'logical', | |
118 help = "Should data be scaled before pca is applied? [default : '%default' ]" | |
119 ), | |
120 make_option( | |
121 "--Rtsne_normalize", | |
122 default = TRUE, | |
123 type = 'logical', | |
124 help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]" | |
125 ), | |
126 make_option( | |
127 "--Rtsne_exaggeration_factor", | |
128 default = 12.0, | |
129 type = 'numeric', | |
130 help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]" | |
131 ), | |
132 make_option( | |
133 "--PCA_npc", | |
134 default = 5, | |
135 type = 'integer', | |
136 help = "number of dimensions kept in the results [default : '%default' ]" | |
137 ), | |
138 make_option( | |
139 "--HCPC_ncluster", | |
140 default = -1, | |
141 type = 'numeric', | |
142 help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]" | |
143 ), | |
144 make_option( | |
145 "--HCPC_npc", | |
146 default = 5, | |
147 type = 'integer', | |
148 help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]" | |
149 ), | |
150 make_option( | |
151 "--HCPC_metric", | |
152 default = 'euclidian', | |
153 type = 'character', | |
154 help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]" | |
155 ), | |
156 make_option( | |
157 "--HCPC_method", | |
158 default = 'ward', | |
159 type = 'character', | |
160 help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']" | |
161 ), | |
162 make_option( | |
163 "--pdf_out", | |
164 default = "out.pdf", | |
165 type = 'character', | |
166 help = "pdf of plots [default : '%default' ]" | |
167 ), | |
168 make_option( | |
169 "--HCPC_consol", | |
170 default = 'TRUE', | |
171 type = 'logical', | |
172 help = "If TRUE, a k-means consolidation is performed [default :'%default']" | |
173 ), | |
174 make_option( | |
175 "--HCPC_itermax", | |
176 default = '10', | |
177 type = 'integer', | |
178 help = "The maximum number of iterations for the consolidation [default :'%default']" | |
179 ), | |
180 make_option( | |
181 "--HCPC_min", | |
182 default = '3', | |
183 type = 'integer', | |
184 help = "The least possible number of clusters suggested [default :'%default']" | |
185 ), | |
186 make_option( | |
187 "--HCPC_max", | |
188 default = -1, | |
189 type = 'integer', | |
190 help = "The higher possible number of clusters suggested [default :'%default']" | |
191 ), | |
192 make_option( | |
193 "--HCPC_clusterCA", | |
194 default = 'rows', | |
195 type = 'character', | |
196 help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']" | |
197 ), | |
198 make_option( | |
199 "--HCPC_kk", | |
200 default = -1, | |
201 type = 'numeric', | |
202 help = "The maximum number of iterations for the consolidation [default :'%default']" | |
203 ) | |
204 ) | |
205 | |
206 opt = parse_args(OptionParser(option_list = option_list), | |
207 args = commandArgs(trailingOnly = TRUE)) | |
208 | |
209 if (opt$sep == "tab") {opt$sep <- "\t"} | |
210 if (opt$sep == "comma") {opt$sep <- ","} | |
211 if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL} | |
212 if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf} | |
213 | |
214 data = read.table( | |
215 opt$data, | |
216 check.names = FALSE, | |
217 header = opt$colnames, | |
218 row.names = 1, | |
219 sep = opt$sep | |
220 ) | |
221 | |
222 # Contrasting factor and its colors | |
223 if (opt$factor != '') { | |
224 contrasting_factor <- read.delim( | |
225 opt$factor, | |
226 header = TRUE | |
227 ) | |
228 rownames(contrasting_factor) <- contrasting_factor[,1] | |
229 contrasting_factor <- contrasting_factor[colnames(data),] | |
230 colnames(contrasting_factor) <- c("id","factor") | |
231 contrasting_factor$factor <- as.factor(contrasting_factor$factor) | |
232 factorColors <- | |
233 with(contrasting_factor, | |
234 data.frame(factor = levels(factor), | |
235 data.frame(factor = levels(factor), | |
236 color = I(brewer.pal(nlevels(factor), name = 'Paired')))) | |
237 ) | |
238 factor_cols <- factorColors$color[match(contrasting_factor$factor, | |
239 factorColors$factor)] | |
240 } else { | |
241 factor_cols <- rep("deepskyblue4", length(rownames(data))) | |
242 } | |
243 | |
244 ################ t-SNE #################### | |
245 if (opt$visu_choice == 'tSNE') { | |
246 # filter and transpose df for tsne and pca | |
247 tdf = t(data) | |
248 # make tsne and plot results | |
249 set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility | |
250 | |
251 tsne_out <- Rtsne(tdf, | |
252 dims = opt$Rtsne_dims, | |
253 initial_dims = opt$Rtsne_initial_dims, | |
254 perplexity = opt$Rtsne_perplexity , | |
255 theta = opt$Rtsne_theta, | |
256 max_iter = opt$Rtsne_max_iter, | |
257 pca = opt$Rtsne_pca, | |
258 pca_center = opt$Rtsne_pca_center, | |
259 pca_scale = opt$Rtsne_pca_scale, | |
260 normalize = opt$Rtsne_normalize, | |
261 exaggeration_factor=opt$Rtsne_exaggeration_factor) | |
262 | |
263 embedding <- as.data.frame(tsne_out$Y[,1:2]) | |
264 embedding$Class <- as.factor(rownames(tdf)) | |
265 gg_legend = theme(legend.position="right") | |
266 if (opt$factor == '') { | |
267 ggplot(embedding, aes(x=V1, y=V2)) + | |
268 geom_point(size=1, color='deepskyblue4') + | |
269 gg_legend + | |
270 xlab("") + | |
271 ylab("") + | |
272 ggtitle('t-SNE') + | |
273 if (opt$labels) { | |
274 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4') | |
275 } | |
276 } else { | |
277 embedding$factor <- as.factor(contrasting_factor$factor) | |
278 ggplot(embedding, aes(x=V1, y=V2, color=factor)) + | |
279 geom_point(size=1) + #, color=factor_cols | |
280 gg_legend + | |
281 xlab("") + | |
282 ylab("") + | |
283 ggtitle('t-SNE') + | |
284 if (opt$labels) { | |
285 geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5) | |
286 } | |
287 } | |
288 ggsave(file=opt$pdf_out, device="pdf") | |
289 | |
290 #save coordinates table | |
291 if(opt$table_coordinates != ''){ | |
292 coord_table <- cbind(rownames(tdf),as.data.frame(tsne_out$Y)) | |
293 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims))) | |
294 } | |
295 } | |
296 | |
297 | |
298 ######### make PCA with FactoMineR ################# | |
299 if (opt$visu_choice == 'PCA') { | |
300 pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE) | |
301 pdf(opt$pdf_out) | |
302 if (opt$labels == FALSE) { | |
303 plot(pca, label="none" , col.ind = factor_cols) | |
304 } else { | |
305 plot(pca, cex=0.2 , col.ind = factor_cols) | |
306 } | |
307 if (opt$factor != '') { | |
308 legend(x = 'topright', | |
309 legend = as.character(factorColors$factor), | |
310 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) | |
311 } | |
312 dev.off() | |
313 | |
314 #save coordinates table | |
315 if(opt$table_coordinates != ''){ | |
316 coord_table <- cbind(rownames(pca$ind$coord),as.data.frame(pca$ind$coord)) | |
317 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc))) | |
318 } | |
319 | |
320 } | |
321 | |
322 ########### make HCPC with FactoMineR ########## | |
323 if (opt$visu_choice == 'HCPC') { | |
324 | |
325 # HCPC starts with a PCA | |
326 pca <- PCA( | |
327 t(data), | |
328 ncp = opt$HCPC_npc, | |
329 graph = FALSE, | |
330 scale.unit = FALSE | |
331 ) | |
332 | |
333 PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA | |
334 | |
335 # Hierarchical Clustering On Principal Components Followed By Kmean Clustering | |
336 res.hcpc <- HCPC(pca, | |
337 nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method, | |
338 graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max, | |
339 cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk) | |
340 # HCPC plots | |
341 dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2 | |
342 pdf(opt$pdf_out) | |
343 plot(res.hcpc, choice="tree") | |
344 plot(res.hcpc, choice="bar") | |
345 plot(res.hcpc, choice="3D.map") | |
346 if (opt$labels == FALSE) { | |
347 plot(res.hcpc, choice="map", label="none") | |
348 } else { | |
349 plot(res.hcpc, choice="map") | |
350 } | |
351 | |
352 # user contrasts on the pca | |
353 if (opt$factor != '') { | |
354 plot(pca, label="none", habillage="ind", col.hab=factor_cols) | |
355 legend(x = 'topright', | |
356 legend = as.character(factorColors$factor), | |
357 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7) | |
358 } | |
359 ## Clusters to which individual observations belong # used ? | |
360 # Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)], | |
361 # Observation = rownames(res.hcpc$data.clust)) | |
362 # metadata <- data.frame(Observation=colnames(data), row.names=colnames(data)) | |
363 # metadata = merge(y = metadata, | |
364 # x = Clust, | |
365 # by = "Observation") | |
366 | |
367 # unclear utility | |
368 # ObsNumberPerCluster = as.data.frame(table(metadata$Cluster)) | |
369 # colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber") | |
370 # | |
371 # ## Silhouette Plot # not used | |
372 # hc.cut = hcut(PCA_IndCoord, | |
373 # k = nlevels(metadata$Cluster), | |
374 # hc_method = "ward.D2") | |
375 # | |
376 # Sil = fviz_silhouette(hc.cut) | |
377 # sil1 = as.data.frame(Sil$data) | |
378 | |
379 ## Normalized Mutual Information # to be implemented later | |
380 # sink(opt$mutual_info) | |
381 # res = external_validation( | |
382 # as.numeric(factor(metadata[, Patient])), | |
383 # as.numeric(metadata$Cluster), | |
384 # method = "adjusted_rand_index", | |
385 # summary_stats = TRUE | |
386 # ) | |
387 # sink() | |
388 dev.off() | |
389 | |
390 if(opt$table_coordinates != ''){ | |
391 coord_table <- cbind(Cell=rownames(res.hcpc$call$X),as.data.frame(res.hcpc$call$X)) | |
392 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster") | |
393 } | |
394 } | |
395 | |
396 ## Return coordinates file to user | |
397 | |
398 if(opt$table_coordinates != ''){ | |
399 write.table( | |
400 coord_table, | |
401 file = opt$table_coordinates, | |
402 sep = "\t", | |
403 quote = F, | |
404 col.names = T, | |
405 row.names = F | |
406 ) | |
407 } | |
408 | |
409 | |
410 | |
411 | |
412 | |
413 | |
414 | |
415 | |
416 | |
417 | |
418 | |
419 | |
420 | |
421 |