Mercurial > repos > melpetera > corr_table
comparison CorrTable/Corr_Script_samples_row.R @ 1:29ec7e3afdd4 draft
Uploaded
author | melpetera |
---|---|
date | Thu, 01 Aug 2019 11:30:58 -0400 |
parents | b22c453e4cf4 |
children | 2173ad5e7750 |
comparison
equal
deleted
inserted
replaced
0:b22c453e4cf4 | 1:29ec7e3afdd4 |
---|---|
1 ################################################################################################# | 1 ################################################################################################# |
2 # CORRELATION TABLE # | 2 # CORRELATION TABLE # |
3 # # | 3 # # |
4 # # | 4 # # |
5 # Input : 2 tables with common samples # | 5 # Input : 2 tables with shared samples # |
6 # Output : Correlation table ; Heatmap (pdf) # | 6 # Output : Correlation table ; Heatmap (pdf) # |
7 # # | 7 # # |
8 # Dependencies : Libraries "ggplot2" and "reshape2" # | 8 # Dependencies : Libraries "ggplot2" and "reshape2" # |
9 # # | 9 # # |
10 ################################################################################################# | 10 ################################################################################################# |
11 | 11 |
12 | 12 |
13 # Parameters (for dev) | 13 # Parameters (for dev) |
14 if(FALSE){ | 14 if(FALSE){ |
15 | 15 |
16 rm(list = ls()) | 16 tab1.name <- "test/ressources/inputs/CT/CT2_DM.tabular" |
17 setwd(dir = "Y:/Developpement") | 17 tab2.name <- "test/ressources/inputs/CT/CT2_base_Diapason_14ClinCES_PRIN.txt" |
18 | |
19 tab1.name <- "Test/Ressources/Inputs/CT2_DM.tabular" | |
20 tab2.name <- "Test/Ressources/Inputs/CT2_base_Diapason_14ClinCES_PRIN.txt" | |
21 param1.samples <- "column" | 18 param1.samples <- "column" |
22 param2.samples <- "row" | 19 param2.samples <- "row" |
23 corr.method <- "pearson" | 20 corr.method <- "pearson" |
24 test.corr <- "yes" | 21 test.corr <- "yes" |
25 alpha <- 0.05 | 22 alpha <- 0.05 |
26 multi.name <- "none" | 23 multi.name <- "none" |
27 filter <- "yes" | 24 filter <- "yes" |
28 filters.choice <- "filters_0_thr" | 25 filters.choice <- "filters_0_thr" |
29 threshold <- 0.2 | 26 threshold <- 0.2 |
30 reorder.var <- "yes" | 27 reorder.var <- "yes" |
28 plot.choice <- "auto" | |
31 color.heatmap <- "yes" | 29 color.heatmap <- "yes" |
32 type.classes <-"irregular" | 30 type.classes <-"irregular" |
33 reg.value <- 1/3 | 31 reg.value <- 1/3 |
34 irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) | 32 irreg.vect <- c(-0.3, -0.2, -0.1, 0, 0.3, 0.4) |
35 output1 <- "Correlation_table.txt" | 33 output1 <- "Correlation_table.txt" |
38 } | 36 } |
39 | 37 |
40 | 38 |
41 | 39 |
42 correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, | 40 correlation.tab <- function(tab1.name, tab2.name, param1.samples, param2.samples, corr.method, test.corr, alpha, |
43 multi.name, filter, filters.choice, threshold, reorder.var, color.heatmap, type.classes, | 41 multi.name, filter, filters.choice, threshold, reorder.var, plot.choice, color.heatmap, |
44 reg.value, irreg.vect, output1, output2){ | 42 type.classes, reg.value, irreg.vect, output1, output2){ |
45 | 43 |
46 # This function allows to visualize the correlation between two tables | 44 # This function allows to visualize the correlation between two tables |
47 # | 45 # |
48 # Parameters: | 46 # Parameters: |
49 # - tab1.name: table 1 file's access | 47 # - tab1.name: table 1 file's access |
57 # - filter ("yes", "no"): use filter.0 or/and filter.threshold | 55 # - filter ("yes", "no"): use filter.0 or/and filter.threshold |
58 # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 | 56 # - filters.choice ("filter_0" or "filters_0_thr"): zero filter removes variables with all their correlation coefficients = 0 |
59 # and threshold filter remove variables with all their correlation coefficients in abs < threshold | 57 # and threshold filter remove variables with all their correlation coefficients in abs < threshold |
60 # - threshold (value between 0 and 1): threshold for filter threshold | 58 # - threshold (value between 0 and 1): threshold for filter threshold |
61 # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA | 59 # - reorder.var ("yes" or "no"): reorder variables in the correlation table thanks to the HCA |
60 # - plot.choice ("auto", "forced" or "none"): determine whether a heatmap is plotted | |
62 # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user | 61 # - color.heatmap ("yes" or "no"): color the heatmap with classes defined by the user |
63 # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes | 62 # - type.classes ("regular" or "irregular"): choose to color the heatmap with regular or irregular classes |
64 # - reg.value (value between 0 and 1): value for regular classes | 63 # - reg.value (value between 0 and 1): value for regular classes |
65 # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) | 64 # - irreg.vect (vector with values between -1 and 1): vector which indicates values for intervals (irregular classes) |
66 # - output1: correlation table file's access | 65 # - output1: correlation table file's access |
83 | 82 |
84 # Sorting tables in alphabetical order of the samples | 83 # Sorting tables in alphabetical order of the samples |
85 tab1 <- tab1[order(rownames(tab1)),] | 84 tab1 <- tab1[order(rownames(tab1)),] |
86 tab2 <- tab2[order(rownames(tab2)),] | 85 tab2 <- tab2[order(rownames(tab2)),] |
87 | 86 |
88 | 87 |
88 # Checks --------------------------------------------------------------------------------------------- | |
89 | |
89 # Check if the 2 datasets match regarding samples identifiers | 90 # Check if the 2 datasets match regarding samples identifiers |
90 # Adapt from functions "check.err" and "match2", RcheckLibrary.R | 91 # Adapt from functions "check.err" and "match2", RcheckLibrary.R |
91 | 92 |
92 err.stock <- NULL | 93 err.stock <- NULL |
93 | 94 |
126 if(length(err.stock)!=0){ | 127 if(length(err.stock)!=0){ |
127 stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") | 128 stop("\n- - - - - - - - -\n",err.stock,"\n- - - - - - - - -\n\n") |
128 } | 129 } |
129 | 130 |
130 | 131 |
132 # Check whether tab1=tab2 | |
133 | |
134 err.msg <- NULL | |
135 | |
136 if((ncol(tab1)==ncol(tab2))&&(sum(tab1!=tab2,na.rm=TRUE)==0)){ | |
137 autocor <- TRUE | |
138 err.msg <- c(err.msg, "\nYou chose the same table for the two dataset inputs. \nTo allow filtering options,", | |
139 "we will turn the diagonal to 0 in the correlation matrix during the filtering process.\n") | |
140 }else{ | |
141 autocor <- FALSE | |
142 } | |
143 | |
144 | |
131 # Check qualitative variables in each input tables | 145 # Check qualitative variables in each input tables |
132 err.msg <- NULL | |
133 | 146 |
134 var1.quali <- vector() | 147 var1.quali <- vector() |
135 var2.quali <- vector() | 148 var2.quali <- vector() |
136 | 149 |
137 for (i in 1:dim(tab1)[2]){ | 150 for (i in 1:dim(tab1)[2]){ |
165 " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") | 178 " ",paste(colnames(tab2)[var2.quali[1:3]],collapse="\n "),"\n") |
166 } | 179 } |
167 } | 180 } |
168 | 181 |
169 if(length(var1.quali) != 0){ | 182 if(length(var1.quali) != 0){ |
170 tab1 <- tab1[,-var1.quali] | 183 tab1 <- tab1[,-var1.quali,drop=FALSE] |
171 } | 184 } |
172 if(length(var2.quali) != 0){ | 185 if(length(var2.quali) != 0){ |
173 tab2 <- tab2[,-var2.quali] | 186 tab2 <- tab2[,-var2.quali,drop=FALSE] |
174 } | 187 } |
188 | |
189 | |
190 # Check whether there are constant variables | |
191 | |
192 var1.cons <- vector() | |
193 var2.cons <- vector() | |
194 | |
195 for (i in 1:dim(tab1)[2]){ | |
196 if(length(levels(as.factor(tab1[,i])))==1){ var1.cons <- c(var1.cons,i) } | |
197 } | |
198 | |
199 for (j in 1:dim(tab2)[2]){ | |
200 if(length(levels(as.factor(tab2[,j])))==1){ var2.cons <- c(var2.cons, j) } | |
201 } | |
202 | |
203 if (length(var1.cons) != 0 | length(var2.cons) != 0){ | |
204 err.msg <- c(err.msg, "\nThere are constant variables in your input tables which have been removed to compute the correlation table.\n\n") | |
205 | |
206 if(length(var1.cons) != 0 && length(var1.cons) < 4){ | |
207 err.msg <- c(err.msg, "In table 1, the following constant variables have been removed:\n", | |
208 " ",paste(colnames(tab1)[var1.cons],collapse="\n "),"\n") | |
209 } else if(length(var1.cons) != 0 && length(var1.cons) > 3){ | |
210 err.msg <- c(err.msg, "For example, in table 1, the following constant variables have been removed:\n", | |
211 " ",paste(colnames(tab1)[var1.cons[1:3]],collapse="\n "),"\n") | |
212 } | |
213 | |
214 if(length(var2.cons) != 0 && length(var2.cons) < 4){ | |
215 err.msg <- c(err.msg, "In table 2, the following constant variables have been removed:\n", | |
216 " ",paste(colnames(tab2)[var2.cons],collapse="\n "),"\n") | |
217 } else if(length(var2.cons) != 0 && length(var2.cons) > 3){ | |
218 err.msg <- c(err.msg, "For example, in table 2, the following constant variables have been removed:\n", | |
219 " ",paste(colnames(tab2)[var2.cons[1:3]],collapse="\n "),"\n") | |
220 } | |
221 } | |
222 | |
223 if(length(var1.cons) != 0){ | |
224 tab1 <- tab1[,-var1.cons,drop=FALSE] | |
225 } | |
226 if(length(var2.cons) != 0){ | |
227 tab2 <- tab2[,-var2.cons,drop=FALSE] | |
228 } | |
229 | |
230 | |
231 # Print info message | |
175 | 232 |
176 if(length(err.msg) != 0){ | 233 if(length(err.msg) != 0){ |
177 cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") | 234 cat("\n- - - - - - - - -\n",err.msg,"\n- - - - - - - - -\n\n") |
178 } | 235 } |
179 | 236 |
237 rm(err.stock,var1.quali,var2.quali,var1.cons,var2.cons,err.msg) | |
238 | |
239 | |
180 # Correlation table --------------------------------------------------------------------------------- | 240 # Correlation table --------------------------------------------------------------------------------- |
181 | 241 |
182 tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) | 242 tab.corr <- matrix(nrow = dim(tab2)[2], ncol = dim(tab1)[2]) |
183 for (i in 1:dim(tab2)[2]){ | 243 for (i in 1:dim(tab2)[2]){ |
184 for (j in 1:dim(tab1)[2]){ | 244 for (j in 1:dim(tab1)[2]){ |
186 } | 246 } |
187 } | 247 } |
188 | 248 |
189 colnames(tab.corr) <- colnames(tab1) | 249 colnames(tab.corr) <- colnames(tab1) |
190 rownames(tab.corr) <- colnames(tab2) | 250 rownames(tab.corr) <- colnames(tab2) |
191 | |
192 | 251 |
193 | 252 |
194 # Significance of correlation test ------------------------------------------------------------------ | 253 # Significance of correlation test ------------------------------------------------------------------ |
195 | 254 |
196 if (test.corr == "yes"){ | 255 if (test.corr == "yes"){ |
216 | 275 |
217 | 276 |
218 # Filter settings ------------------------------------------------------------------------------------ | 277 # Filter settings ------------------------------------------------------------------------------------ |
219 | 278 |
220 if (filter == "yes"){ | 279 if (filter == "yes"){ |
280 | |
281 # Turn diagonal from 1 to 0 if autocorrelation | |
282 if(autocor){ | |
283 for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 0 } | |
284 } | |
221 | 285 |
222 # Remove variables with all their correlation coefficients = 0 : | 286 # Remove variables with all their correlation coefficients = 0 : |
223 if (filters.choice == "filter_0"){ | 287 if (filters.choice == "filter_0"){ |
224 threshold <- 0 | 288 threshold <- 0 |
225 } | 289 } |
230 var2.thres <- c(var2.thres, i) | 294 var2.thres <- c(var2.thres, i) |
231 } | 295 } |
232 } | 296 } |
233 | 297 |
234 if (length(var2.thres) != 0){ | 298 if (length(var2.thres) != 0){ |
235 tab.corr <- tab.corr[-var2.thres,] | 299 tab.corr <- tab.corr[-var2.thres,,drop=FALSE] |
236 tab2 <- tab2[, -var2.thres] | 300 tab2 <- tab2[, -var2.thres,drop=FALSE] |
237 } | 301 } |
238 | 302 |
239 var1.thres <- vector() | 303 var1.thres <- vector() |
240 for (i in 1:dim(tab.corr)[2]){ | 304 for (i in 1:dim(tab.corr)[2]){ |
241 if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ | 305 if (length(which(abs(tab.corr[,i]) <= threshold)) == dim(tab.corr)[1]){ |
242 var1.thres <- c(var1.thres, i) | 306 var1.thres <- c(var1.thres, i) |
243 } | 307 } |
244 } | 308 } |
245 | 309 |
246 if (length(var1.thres) != 0){ | 310 if (length(var1.thres) != 0){ |
247 tab.corr <- tab.corr[,-var1.thres] | 311 tab.corr <- tab.corr[,-var1.thres,drop=FALSE] |
248 tab1 <- tab1[,-var1.thres] | 312 tab1 <- tab1[,-var1.thres,drop=FALSE] |
249 } | 313 } |
314 | |
315 # Turn diagonal from 0 back to 1 if autocorrelation | |
316 if(autocor){ | |
317 for(i in 1:(ncol(tab.corr))){ tab.corr[i,i] <- 1 } | |
318 } | |
250 | 319 |
251 } | 320 } |
252 | 321 |
253 | 322 |
254 # Reorder variables in the correlation table (with the HCA) ------------------------------------------ | 323 # Reorder variables in the correlation table (with the HCA) ------------------------------------------ |
256 | 325 |
257 cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") | 326 cormat.tab2 <- cor(tab2, method = corr.method, use = "pairwise.complete.obs") |
258 dist.tab2 <- as.dist(1 - cormat.tab2) | 327 dist.tab2 <- as.dist(1 - cormat.tab2) |
259 hc.tab2 <- hclust(dist.tab2, method = "ward.D2") | 328 hc.tab2 <- hclust(dist.tab2, method = "ward.D2") |
260 tab.corr <- tab.corr[hc.tab2$order,] | 329 tab.corr <- tab.corr[hc.tab2$order,] |
330 rm(cormat.tab2) | |
261 | 331 |
262 cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") | 332 cormat.tab1 <- cor(tab1, method = corr.method, use = "pairwise.complete.obs") |
263 dist.tab1 <- as.dist(1 - cormat.tab1) | 333 dist.tab1 <- as.dist(1 - cormat.tab1) |
264 hc.tab1 <- hclust(dist.tab1, method = "ward.D2") | 334 hc.tab1 <- hclust(dist.tab1, method = "ward.D2") |
265 tab.corr <- tab.corr[,hc.tab1$order] | 335 tab.corr <- tab.corr[,hc.tab1$order] |
266 | 336 rm(cormat.tab1) |
267 } | 337 |
268 | 338 } |
269 | 339 |
270 | 340 |
271 # Output 1 : Correlation table ----------------------------------------------------------------------- | 341 # Output 1 : Correlation table ----------------------------------------------------------------------- |
342 | |
272 | 343 |
273 # Export correlation table | 344 # Export correlation table |
274 write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) | 345 write.table(x = data.frame(name = rownames(tab.corr), tab.corr), file = output1, sep = "\t", quote = FALSE, row.names = FALSE) |
275 | 346 |
276 | 347 |
277 | 348 |
278 # Create the heatmap --------------------------------------------------------------------------------- | 349 # Create the heatmap --------------------------------------------------------------------------------- |
279 | 350 |
351 if(plot.choice != "none"){ | |
352 | |
280 # A message if no variable kept | 353 # A message if no variable kept |
281 if(length(tab.corr)==0){ | 354 if(length(tab.corr)==0){ |
282 pdf(output2) | 355 pdf(output2) |
283 plot.new() | 356 plot.new() |
284 legend("center","Filtering leads to no remaining correlation coefficient.") | 357 legend("center","Filtering leads to no remaining correlation coefficient.") |
358 dev.off() | |
359 } else { | |
360 | |
361 # A message if more than 1000 variable in auto mode | |
362 if((plot.choice=="auto")&&(max(dim(tab.corr))>1000)){ | |
363 pdf(output2) | |
364 war.msg <- paste0("In 'default' mode, the colored table is not provided when\none of the tables contains more than ", | |
365 "a thousand\nvariables after the filter step.\n\nOne of your table still contains ",max(dim(tab.corr))," variables.\n", | |
366 "Please consider more filtering, or use the 'Always plot a\ncolored table' mode to obtain your colored table.") | |
367 plot.new() | |
368 legend("center",war.msg,adj=c(0.05,0.075)) | |
285 dev.off() | 369 dev.off() |
286 } else { | 370 } else { |
287 | 371 |
288 | 372 |
289 library(ggplot2) | 373 library(ggplot2) |
397 | 481 |
398 | 482 |
399 ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) | 483 ggsave(output2, device = "pdf", width = 10+0.075*dim(tab.corr)[2], height = 5+0.075*dim(tab.corr)[1], limitsize = FALSE) |
400 | 484 |
401 | 485 |
402 } # End if(length(tab.corr)==0)else | 486 } # End of if((plot.choice=="auto")&&(max(dim(tab.corr))>1000))else |
487 } # End of if(length(tab.corr)==0)else | |
488 } # End of if(plot.choice != "auto") | |
489 | |
403 | 490 |
404 } # End of correlation.tab | 491 } # End of correlation.tab |
405 | 492 |
406 | 493 |
407 # Function call | 494 # Function call |