comparison high_dim_visu.R @ 0:cad0001b9cfb draft

planemo upload for repository https://github.com/ARTbio/tools-artbio/tree/master/tools/gsc_high_dimension_visualization commit 09dcd74dbc01f448518cf3db3e646afb0675a6fe
author artbio
date Mon, 24 Jun 2019 13:39:11 -0400
parents
children 7e7a2a4cfce2
comparison
equal deleted inserted replaced
-1:000000000000 0:cad0001b9cfb
1 # load packages that are provided in the conda env
2 options( show.error.messages=F,
3 error = function () { cat( geterrmessage(), file=stderr() ); q( "no", 1, F ) } )
4 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
5 requiredPackages = c('optparse', 'Rtsne', 'ggplot2', 'ggfortify')
6 warnings()
7 library(optparse)
8 library(FactoMineR)
9 library(factoextra)
10 library(Rtsne)
11 library(ggplot2)
12 library(ggfortify)
13 library(RColorBrewer)
14
15
16 # Arguments
17 option_list = list(
18 make_option(
19 "--data",
20 default = NA,
21 type = 'character',
22 help = "Input file that contains expression value to visualise"
23 ),
24 make_option(
25 "--sep",
26 default = '\t',
27 type = 'character',
28 help = "File separator [default : '%default' ]"
29 ),
30 make_option(
31 "--colnames",
32 default = TRUE,
33 type = 'logical',
34 help = "Consider first line as header ? [default : '%default' ]"
35 ),
36 make_option(
37 "--out",
38 default = "res.tab",
39 type = 'character',
40 help = "Output name [default : '%default' ]"
41 ),
42 make_option(
43 "--labels",
44 default = FALSE,
45 type = 'logical',
46 help = "add labels in scatter plots [default : '%default' ]"
47 ),
48 make_option(
49 "--factor",
50 default = '',
51 type = 'character',
52 help = "A two column table that specifies factor levels for contrasting data [default : '%default' ]"
53 ),
54 make_option(
55 "--visu_choice",
56 default = 'PCA',
57 type = 'character',
58 help = "visualisation method ('PCA', 'tSNE', 'HCPC') [default : '%default' ]"
59 ),
60 make_option(
61 "--table_coordinates",
62 default = '',
63 type = 'character',
64 help = "Table with plot coordinates [default : '%default' ]"
65 ),
66 make_option(
67 "--Rtsne_seed",
68 default = 42,
69 type = 'integer',
70 help = "Seed value for reproducibility [default : '%default' ]"
71 ),
72 make_option(
73 "--Rtsne_dims",
74 default = 2,
75 type = 'integer',
76 help = "Output dimensionality [default : '%default' ]"
77 ),
78 make_option(
79 "--Rtsne_initial_dims",
80 default = 50,
81 type = 'integer',
82 help = "The number of dimensions that should be retained in the initial PCA step [default : '%default' ]"
83 ),
84 make_option(
85 "--Rtsne_perplexity",
86 default = 5.0,
87 type = 'numeric',
88 help = "perplexity [default : '%default' ]"
89 ),
90 make_option(
91 "--Rtsne_theta",
92 default = 1.0,
93 type = 'numeric',
94 help = "theta [default : '%default' ]"
95 ),
96 make_option(
97 "--Rtsne_max_iter",
98 default = 1000,
99 type = 'integer',
100 help = "max_iter [default : '%default' ]"
101 ),
102 make_option(
103 "--Rtsne_pca",
104 default = TRUE,
105 type = 'logical',
106 help = "Whether an initial PCA step should be performed [default : '%default' ]"
107 ),
108 make_option(
109 "--Rtsne_pca_center",
110 default = TRUE,
111 type = 'logical',
112 help = "Should data be centered before pca is applied? [default : '%default' ]"
113 ),
114 make_option(
115 "--Rtsne_pca_scale",
116 default = FALSE,
117 type = 'logical',
118 help = "Should data be scaled before pca is applied? [default : '%default' ]"
119 ),
120 make_option(
121 "--Rtsne_normalize",
122 default = TRUE,
123 type = 'logical',
124 help = "Should data be normalized internally prior to distance calculations? [default : '%default' ]"
125 ),
126 make_option(
127 "--Rtsne_exaggeration_factor",
128 default = 12.0,
129 type = 'numeric',
130 help = " Exaggeration factor used to multiply the P matrix in the first part of the optimization [default : '%default' ]"
131 ),
132 make_option(
133 "--PCA_npc",
134 default = 5,
135 type = 'integer',
136 help = "number of dimensions kept in the results [default : '%default' ]"
137 ),
138 make_option(
139 "--HCPC_ncluster",
140 default = -1,
141 type = 'numeric',
142 help = "nb.clust, number of clusters to consider in the hierarchical clustering. [default : -1 let HCPC to optimize the number]"
143 ),
144 make_option(
145 "--HCPC_npc",
146 default = 5,
147 type = 'integer',
148 help = "npc, number of dimensions which are kept for HCPC analysis [default : '%default' ]"
149 ),
150 make_option(
151 "--HCPC_metric",
152 default = 'euclidian',
153 type = 'character',
154 help = "Metric to be used for calculating dissimilarities between observations, available 'euclidian' or 'manhattan' [default : '%default' ]"
155 ),
156 make_option(
157 "--HCPC_method",
158 default = 'ward',
159 type = 'character',
160 help = "Clustering method between 'ward','average','single', 'complete', 'weighted' [default :'%default']"
161 ),
162 make_option(
163 "--pdf_out",
164 default = "out.pdf",
165 type = 'character',
166 help = "pdf of plots [default : '%default' ]"
167 ),
168 make_option(
169 "--HCPC_consol",
170 default = 'TRUE',
171 type = 'logical',
172 help = "If TRUE, a k-means consolidation is performed [default :'%default']"
173 ),
174 make_option(
175 "--HCPC_itermax",
176 default = '10',
177 type = 'integer',
178 help = "The maximum number of iterations for the consolidation [default :'%default']"
179 ),
180 make_option(
181 "--HCPC_min",
182 default = '3',
183 type = 'integer',
184 help = "The least possible number of clusters suggested [default :'%default']"
185 ),
186 make_option(
187 "--HCPC_max",
188 default = -1,
189 type = 'integer',
190 help = "The higher possible number of clusters suggested [default :'%default']"
191 ),
192 make_option(
193 "--HCPC_clusterCA",
194 default = 'rows',
195 type = 'character',
196 help = "A string equals to 'rows' or 'columns' for the clustering of Correspondence Analysis results [default :'%default']"
197 ),
198 make_option(
199 "--HCPC_kk",
200 default = -1,
201 type = 'numeric',
202 help = "The maximum number of iterations for the consolidation [default :'%default']"
203 )
204 )
205
206 opt = parse_args(OptionParser(option_list = option_list),
207 args = commandArgs(trailingOnly = TRUE))
208
209 if (opt$sep == "tab") {opt$sep <- "\t"}
210 if (opt$sep == "comma") {opt$sep <- ","}
211 if(opt$HCPC_max == -1) {opt$HCPC_max <- NULL}
212 if(opt$HCPC_kk == -1) {opt$HCPC_kk <- Inf}
213
214 data = read.table(
215 opt$data,
216 check.names = FALSE,
217 header = opt$colnames,
218 row.names = 1,
219 sep = opt$sep
220 )
221
222 # Contrasting factor and its colors
223 if (opt$factor != '') {
224 contrasting_factor <- read.delim(
225 opt$factor,
226 header = TRUE
227 )
228 rownames(contrasting_factor) <- contrasting_factor[,1]
229 contrasting_factor <- contrasting_factor[colnames(data),]
230 colnames(contrasting_factor) <- c("id","factor")
231 contrasting_factor$factor <- as.factor(contrasting_factor$factor)
232 factorColors <-
233 with(contrasting_factor,
234 data.frame(factor = levels(factor),
235 data.frame(factor = levels(factor),
236 color = I(brewer.pal(nlevels(factor), name = 'Paired'))))
237 )
238 factor_cols <- factorColors$color[match(contrasting_factor$factor,
239 factorColors$factor)]
240 } else {
241 factor_cols <- rep("deepskyblue4", length(rownames(data)))
242 }
243
244 ################ t-SNE ####################
245 if (opt$visu_choice == 'tSNE') {
246 # filter and transpose df for tsne and pca
247 tdf = t(data)
248 # make tsne and plot results
249 set.seed(opt$Rtsne_seed) ## Sets seed for reproducibility
250
251 tsne_out <- Rtsne(tdf,
252 dims = opt$Rtsne_dims,
253 initial_dims = opt$Rtsne_initial_dims,
254 perplexity = opt$Rtsne_perplexity ,
255 theta = opt$Rtsne_theta,
256 max_iter = opt$Rtsne_max_iter,
257 pca = opt$Rtsne_pca,
258 pca_center = opt$Rtsne_pca_center,
259 pca_scale = opt$Rtsne_pca_scale,
260 normalize = opt$Rtsne_normalize,
261 exaggeration_factor=opt$Rtsne_exaggeration_factor)
262
263 embedding <- as.data.frame(tsne_out$Y[,1:2])
264 embedding$Class <- as.factor(rownames(tdf))
265 gg_legend = theme(legend.position="right")
266 if (opt$factor == '') {
267 ggplot(embedding, aes(x=V1, y=V2)) +
268 geom_point(size=1, color='deepskyblue4') +
269 gg_legend +
270 xlab("") +
271 ylab("") +
272 ggtitle('t-SNE') +
273 if (opt$labels) {
274 geom_text(aes(label=Class),hjust=-0.2, vjust=-0.5, size=1.5, color='deepskyblue4')
275 }
276 } else {
277 embedding$factor <- as.factor(contrasting_factor$factor)
278 ggplot(embedding, aes(x=V1, y=V2, color=factor)) +
279 geom_point(size=1) + #, color=factor_cols
280 gg_legend +
281 xlab("") +
282 ylab("") +
283 ggtitle('t-SNE') +
284 if (opt$labels) {
285 geom_text(aes(label=Class, colour=factor),hjust=-0.2, vjust=-0.5, size=1.5)
286 }
287 }
288 ggsave(file=opt$pdf_out, device="pdf")
289
290 #save coordinates table
291 if(opt$table_coordinates != ''){
292 coord_table <- cbind(rownames(tdf),as.data.frame(tsne_out$Y))
293 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$Rtsne_dims)))
294 }
295 }
296
297
298 ######### make PCA with FactoMineR #################
299 if (opt$visu_choice == 'PCA') {
300 pca <- PCA(t(data), ncp=opt$PCA_npc, graph=FALSE)
301 pdf(opt$pdf_out)
302 if (opt$labels == FALSE) {
303 plot(pca, label="none" , col.ind = factor_cols)
304 } else {
305 plot(pca, cex=0.2 , col.ind = factor_cols)
306 }
307 if (opt$factor != '') {
308 legend(x = 'topright',
309 legend = as.character(factorColors$factor),
310 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
311 }
312 dev.off()
313
314 #save coordinates table
315 if(opt$table_coordinates != ''){
316 coord_table <- cbind(rownames(pca$ind$coord),as.data.frame(pca$ind$coord))
317 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$PCA_npc)))
318 }
319
320 }
321
322 ########### make HCPC with FactoMineR ##########
323 if (opt$visu_choice == 'HCPC') {
324
325 # HCPC starts with a PCA
326 pca <- PCA(
327 t(data),
328 ncp = opt$HCPC_npc,
329 graph = FALSE,
330 scale.unit = FALSE
331 )
332
333 PCA_IndCoord = as.data.frame(pca$ind$coord) # coordinates of observations in PCA
334
335 # Hierarchical Clustering On Principal Components Followed By Kmean Clustering
336 res.hcpc <- HCPC(pca,
337 nb.clust=opt$HCPC_ncluster, metric=opt$HCPC_metric, method=opt$HCPC_method,
338 graph=F,consol=opt$HCPC_consol,iter.max=opt$HCPC_itermax,min=opt$HCPC_min,max=opt$HCPC_max,
339 cluster.CA=opt$HCPC_clusterCA,kk=opt$HCPC_kk)
340 # HCPC plots
341 dims <- head(as.data.frame(res.hcpc$call$t$res$eig),2) # dims variances in column 2
342 pdf(opt$pdf_out)
343 plot(res.hcpc, choice="tree")
344 plot(res.hcpc, choice="bar")
345 plot(res.hcpc, choice="3D.map")
346 if (opt$labels == FALSE) {
347 plot(res.hcpc, choice="map", label="none")
348 } else {
349 plot(res.hcpc, choice="map")
350 }
351
352 # user contrasts on the pca
353 if (opt$factor != '') {
354 plot(pca, label="none", habillage="ind", col.hab=factor_cols)
355 legend(x = 'topright',
356 legend = as.character(factorColors$factor),
357 col = factorColors$color, pch = 16, bty = 'n', xjust = 1, cex=0.7)
358 }
359 ## Clusters to which individual observations belong # used ?
360 # Clust <- data.frame(Cluster = res.hcpc$data.clust[, (nrow(data) + 1)],
361 # Observation = rownames(res.hcpc$data.clust))
362 # metadata <- data.frame(Observation=colnames(data), row.names=colnames(data))
363 # metadata = merge(y = metadata,
364 # x = Clust,
365 # by = "Observation")
366
367 # unclear utility
368 # ObsNumberPerCluster = as.data.frame(table(metadata$Cluster))
369 # colnames(ObsNumberPerCluster) = c("Cluster", "ObsNumber")
370 #
371 # ## Silhouette Plot # not used
372 # hc.cut = hcut(PCA_IndCoord,
373 # k = nlevels(metadata$Cluster),
374 # hc_method = "ward.D2")
375 #
376 # Sil = fviz_silhouette(hc.cut)
377 # sil1 = as.data.frame(Sil$data)
378
379 ## Normalized Mutual Information # to be implemented later
380 # sink(opt$mutual_info)
381 # res = external_validation(
382 # as.numeric(factor(metadata[, Patient])),
383 # as.numeric(metadata$Cluster),
384 # method = "adjusted_rand_index",
385 # summary_stats = TRUE
386 # )
387 # sink()
388 dev.off()
389
390 if(opt$table_coordinates != ''){
391 coord_table <- cbind(Cell=rownames(res.hcpc$call$X),as.data.frame(res.hcpc$call$X))
392 colnames(coord_table)=c("Cells",paste0("DIM",(1:opt$HCPC_npc)),"Cluster")
393 }
394 }
395
396 ## Return coordinates file to user
397
398 if(opt$table_coordinates != ''){
399 write.table(
400 coord_table,
401 file = opt$table_coordinates,
402 sep = "\t",
403 quote = F,
404 col.names = T,
405 row.names = F
406 )
407 }
408
409
410
411
412
413
414
415
416
417
418
419
420
421