comparison Intchecks/Script_intensity_check.R @ 1:4973a2104cfd draft

Uploaded
author melpetera
date Wed, 05 Dec 2018 10:27:45 -0500
parents c2c2e1be904a
children bdee2c2c484b
comparison
equal deleted inserted replaced
0:c2c2e1be904a 1:4973a2104cfd
1 #################################################################### 1 #########################################################################
2 # SCRIPT INTENSITY CHECK 2 # SCRIPT INTENSITY CHECK #
3 # V1: Fold and NA 3 # #
4 # 4 # Input: Data Matrix, VariableMetadata, SampleMetadata #
5 # Input: Data Matrix, VariableMetadata, SampleMetadata 5 # Output: VariableMetadata, Graphics (barplots and boxplots) #
6 # Output: VariableMetadata, Graphics (barplots and boxplots) 6 # #
7 # 7 # Dependencies: RcheckLibrary.R #
8 # Dependencies: RcheckLibrary.R 8 # #
9 # 9 #########################################################################
10 ####################################################################
11 10
12 11
13 # Parameters (for dev) 12 # Parameters (for dev)
14 if(FALSE){ 13 if(FALSE){
15 14
17 setwd("Y:\\Developpement\\Intensity check\\Pour tests") 16 setwd("Y:\\Developpement\\Intensity check\\Pour tests")
18 17
19 DM.name <- "DM_NA.tabular" 18 DM.name <- "DM_NA.tabular"
20 SM.name <- "SM_NA.tabular" 19 SM.name <- "SM_NA.tabular"
21 VM.name <- "vM_NA.tabular" 20 VM.name <- "vM_NA.tabular"
21 class.col <- "2"
22 type <- "One_class" 22 type <- "One_class"
23 class.col <- "2"
24 class1 <- "Blanks" 23 class1 <- "Blanks"
24 fold.frac <- "Top"
25 logarithm <- "log2"
25 VM.output <- "new_VM.txt" 26 VM.output <- "new_VM.txt"
26 graphs.output <- "Barplots_and_Boxplots.pdf" 27 graphs.output <- "Barplots_and_Boxplots.pdf"
27 } 28 }
28 29
29 intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){ 30
30 # This function allows to check the intensities considering classes with a fold calculation, the number and 31
31 # the proportion of NA 32
33 intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm,
34 VM.output, graphs.output){
35
36
37 # This function allows to check the intensities considering classes with a mean fold change calculation,
38 # the number and the proportion of missing values (NA) in dataMatrix
32 # 39 #
33 # Two options: 40 # Two options:
34 # - one class (selected by the user) against all the remaining samples ("One_class") 41 # - one class (selected by the user) against all the remaining samples ("One_class")
35 # - tests on each class ("Each_class") 42 # - tests on each class ("Each_class")
36 # 43 #
37 # Parameters: 44 # Parameters:
38 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access 45 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
46 # class.col: number of the sampleMetadata's column with classes
39 # type: "One_class" or "Each_class" 47 # type: "One_class" or "Each_class"
40 # class.col: number of the sampleMetadata's column with classes 48 # class1: name of the class, only if type="One_class"
41 # class1: name of the class if type="One_class" 49 # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom")
42 # VM.output: output file's access (VM with the new columns) 50 # logarithm: "log2", "log10" or "none" for log mean fold change
43 # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values 51 # VM.output: output file's access (VM with new columns)
44 52 # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values
45 53
46 54
47 55
48 # Input --------------------------------------------------------- 56
57
58 # Input ---------------------------------------------------------------------------------------------------
49 59
50 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) 60 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
51 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) 61 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
52 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) 62 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
53 63
54 64
55 65
56 # Table match check with Rchecklibrary 66 # Table match check with Rchecklibrary
57 table.check <- match3(DM, SM, VM) 67 table.check <- match3(DM, SM, VM)
58 check.err(table.check) 68 check.err(table.check)
69
59 70
60 71
61 rownames(DM) <- DM[,1] 72 rownames(DM) <- DM[,1]
62 var_names <- DM[,1] 73 var_names <- DM[,1]
63 DM <- DM[,-1] 74 DM <- DM[,-1]
64 DM <- data.frame(t(DM)) 75 DM <- data.frame(t(DM))
65 76
66 class.col <- colnames(SM)[as.numeric(class.col)] 77 class.col <- colnames(SM)[as.numeric(class.col)]
67 78
68 # check class.col, class1 and the number of classes--------------- 79
80 # check class.col, class1 and the number of classes ---------------------------------------------------------
69 81
70 if(!(class.col %in% colnames(SM))){ 82 if(!(class.col %in% colnames(SM))){
71 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") 83 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
72 } 84 }
73 85
74 c_class <- SM[,class.col] 86 c_class <- SM[,class.col]
75 c_class <- as.factor(c_class) 87 c_class <- as.factor(c_class)
76 nb_class <- nlevels(c_class) 88 nb_class <- nlevels(c_class)
77 classnames <- levels(c_class) 89 classnames <- levels(c_class)
78 90
91 if(nb_class < 2){
92 err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n")
93 cat(err.1class)
94 }
79 95
80 if((nb_class > (nrow(SM))/3)){ 96 if((nb_class > (nrow(SM))/3)){
81 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those 97 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
82 with few samples \n") 98 with few samples \n")
83 cat(class.err) 99 cat(class.err)
84 } 100 }
85 101
102
86 if(type == "One_class"){ 103 if(type == "One_class"){
87 if(!(class1 %in% classnames)){ 104 if(!(class1 %in% classnames)){
88 list.class1 <- c("\n Classes:",classnames,"\n") 105 list.class1 <- c("\n Classes:",classnames,"\n")
89 cat(list.class1) 106 cat(list.class1)
90 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) 107 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
91 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") 108 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
92 } 109 }
93 } 110 }
111
94 112
95 #If type is "one_class", change others classes in "other" 113 #If type is "one_class", change others classes in "other"
96 if(type == "One_class"){ 114 if(type == "One_class"){
97 for(i in 1:length(c_class)){ 115 for(i in 1:length(c_class)){
98 if(c_class[i]!=class1){ 116 if(c_class[i]!=class1){
99 c_class <- as.character(c_class) 117 c_class <- as.character(c_class)
100 c_class[i] <- "Other" 118 c_class[i] <- "Other"
101 c_class <- as.factor(c_class) 119 c_class <- as.factor(c_class)
102 nb_class <- nlevels(c_class) 120 nb_class <- nlevels(c_class)
103 classnames <- levels(c_class) 121 classnames <- c(class1,"Other")
104 122
105 } 123 }
106 } 124 }
107 } 125 }
108 126
109 DM <- cbind(DM,c_class) 127 DM <- cbind(DM,c_class)
110 128
111 # fold ------------------------------------------------------- 129
112 n <- 1 130
113 fold <- data.frame() 131 # fold calculation -------------------------------------------------------------------------------------------
114 for(j in 1:(nb_class-1)){ 132
115 for(k in (j+1):nb_class) { 133 if(nb_class >= 2){
116 for (i in 1:(length(DM)-1)){ 134
117 fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/ 135
118 mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)} 136 fold <- data.frame()
119 names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_") 137 n <- 1
120 n <- n + 1} 138 ratio1 <- NULL
121 } 139 ratio2 <- NULL
122 140
123 # NA --------------------------------------------------------- 141 if(type=="Each_class"){
142 fold.frac <- "Top"
143 }
144
145 for(j in 1:(nb_class-1)){
146 for(k in (j+1):nb_class) {
147
148 if(fold.frac=="Bottom"){
149 ratio1 <- classnames[k]
150 ratio2 <- classnames[j]
151 }else{
152 ratio1 <- classnames[j]
153 ratio2 <- classnames[k]
154 }
155
156 for (i in 1:(length(DM)-1)){
157 fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/
158 mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE)
159 if(logarithm=="log2"){
160 fold[i,n] <- log2(fold[i,n])
161 }else if(logarithm=="log10"){
162 fold[i,n] <- log10(fold[i,n])
163 }
164 }
165 names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_")
166 if(logarithm != "none"){
167 names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_")
168 }
169 n <- n + 1}
170 }
171
172 }
173
174 # number and proportion of NA ---------------------------------------------------------------------------------
175
124 calcul_NA <- data.frame() 176 calcul_NA <- data.frame()
125 pct_NA <- data.frame() 177 pct_NA <- data.frame()
126 for (i in 1:(length(DM)-1)){ 178 for (i in 1:(length(DM)-1)){
127 for (j in 1:nb_class){ 179 for (j in 1:nb_class){
128 n <- 0 180 n <- 0
132 n <- n + 1} 184 n <- n + 1}
133 calcul_NA[i,j] <- n 185 calcul_NA[i,j] <- n
134 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} 186 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
135 } 187 }
136 } 188 }
137 names(calcul_NA) <- paste("Nb_NA",classnames, sep="_") 189 names(calcul_NA) <- paste("NA",classnames, sep="_")
138 names(pct_NA) <- paste("Pct_NA", classnames, sep="_") 190 names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
139 191
140 # Alert message if there is no NA 192 # Alert message if there is no NA in data matrix
141 193
142 sumNA <- colSums(calcul_NA) 194 sumNA <- colSums(calcul_NA)
143 sum_total <- sum(sumNA) 195 sum_total <- sum(sumNA)
144
145 alerte <- NULL 196 alerte <- NULL
146 if(sum_total==0){ 197 if(sum_total==0){
147 alerte <- c(alerte, "Data Matrix contains no NA.\n") 198 alerte <- c(alerte, "Data Matrix contains no NA.\n")
148 } 199 }
149 200
150 if(length(alerte) != 0){ 201 if(length(alerte) != 0){
151 cat(alerte,"\n") 202 cat(alerte,"\n")
152 } 203 }
153
154 table_NA <- cbind(calcul_NA, pct_NA) 204 table_NA <- cbind(calcul_NA, pct_NA)
155 205
156 # check columns names ---------------------------------------- 206
207
208 # check columns names ---------------------------------------------------------------------------------------
209
210
211 VM.names <- colnames(VM)
157 212
158 # Fold 213 # Fold
159 VM.names <- colnames(VM) 214
160 fold.names <- colnames(fold) 215 if(nb_class >=2){
161 216 fold.names <- colnames(fold)
162 for (i in 1:length(VM.names)){ 217
163 for (j in 1:length(fold.names)){ 218 for (i in 1:length(VM.names)){
164 if (VM.names[i]==fold.names[j]){ 219 for (j in 1:length(fold.names)){
165 fold.names[j] <- paste(fold.names[j],"2", sep="_") 220 if (VM.names[i]==fold.names[j]){
166 } 221 fold.names[j] <- paste(fold.names[j],"2", sep="_")
167 } 222 }
168 } 223 }
169 colnames(fold) <- fold.names 224 }
225 colnames(fold) <- fold.names
226
227 VM <- cbind(VM,fold)
228 }
170 229
171 # NA 230 # NA
172 NA.names <- colnames(table_NA) 231 NA.names <- colnames(table_NA)
173 232
174 for (i in 1:length(VM.names)){ 233 for (i in 1:length(VM.names)){
177 NA.names[j] <- paste(NA.names[j],"2", sep="_") 236 NA.names[j] <- paste(NA.names[j],"2", sep="_")
178 } 237 }
179 } 238 }
180 } 239 }
181 colnames(table_NA) <- NA.names 240 colnames(table_NA) <- NA.names
182 241 VM <- cbind(VM,table_NA)
183 #for NA barplots --------------------------------------------- 242
243
244 #for NA barplots -------------------------------------------------------------------------------------------
184 245
185 data_bp <- data.frame() 246 data_bp <- data.frame()
186 247
187 for (j in 1:ncol(pct_NA)){ 248 for (j in 1:ncol(pct_NA)){
188 Nb_NA_0_20 <- 0 249 Nb_NA_0_20 <- 0
212 data_bp[2,j] <- Nb_NA_20_40 273 data_bp[2,j] <- Nb_NA_20_40
213 data_bp[3,j] <- Nb_NA_40_60 274 data_bp[3,j] <- Nb_NA_40_60
214 data_bp[4,j] <- Nb_NA_60_80 275 data_bp[4,j] <- Nb_NA_60_80
215 data_bp[5,j] <- Nb_NA_80_100 276 data_bp[5,j] <- Nb_NA_80_100
216 } 277 }
217 rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%") 278 rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
218 colnames(data_bp) <- classnames 279 colnames(data_bp) <- classnames
219 data_bp <- as.matrix(data_bp) 280 data_bp <- as.matrix(data_bp)
220 281
221 282
222 # Output-------------------------------------------------------- 283 # Output ---------------------------------------------------------------------------------------------------
223 284
224 VM <- cbind(VM,fold,table_NA)
225 285
226 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) 286 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
227 287
228 #graphics pdf 288 #graphics pdf
229 289
230 pdf(graphs.output) 290 pdf(graphs.output)
231 291
232 #Boxplots for NA 292 #Barplots for NA
233
234 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) 293 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
235 294
236 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") 295 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
237 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) 296 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
238 297
239
240 stock=0 298 stock=0
241 for (i in 1:nrow(data_bp)){ 299 for (i in 1:nrow(data_bp)){
242 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white") 300 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
243 stock <- stock+data_bp[i,] 301 stock <- stock+data_bp[i,]
244 } 302 }
245 303
246 304
247 #Boxplots for fold test 305 #Boxplots for fold test
248 for (j in 1:ncol(fold)){ 306
249 title=paste(fold.names[j]) 307 if(nb_class >= 2){
250 boxplot(fold[j], main=title) 308
251 } 309 clean_fold <- fold
310 for(i in 1:nrow(clean_fold)){
311 for(j in 1:ncol(clean_fold)){
312 if(is.infinite(clean_fold[i,j])){
313 clean_fold[i,j] <- NA
314 }
315 }
316 }
317 for (j in 1:ncol(clean_fold)){
318 title <- paste(fold.names[j])
319 boxplot(clean_fold[j], main=title)
320 }
321 }
252 322
253 dev.off() 323 dev.off()
254 324
255 } 325 }
256 326
257 327
258 # Function call---------------
259
260 #setwd("Y:\\Developpement\\Intensity check\\Pour tests")
261 #intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt",
262 #"Barplots_and_Boxplots")
263
264