Mercurial > repos > melpetera > intensity_checks
comparison Intchecks/Script_intensity_check.R @ 1:4973a2104cfd draft
Uploaded
author | melpetera |
---|---|
date | Wed, 05 Dec 2018 10:27:45 -0500 |
parents | c2c2e1be904a |
children | bdee2c2c484b |
comparison
equal
deleted
inserted
replaced
0:c2c2e1be904a | 1:4973a2104cfd |
---|---|
1 #################################################################### | 1 ######################################################################### |
2 # SCRIPT INTENSITY CHECK | 2 # SCRIPT INTENSITY CHECK # |
3 # V1: Fold and NA | 3 # # |
4 # | 4 # Input: Data Matrix, VariableMetadata, SampleMetadata # |
5 # Input: Data Matrix, VariableMetadata, SampleMetadata | 5 # Output: VariableMetadata, Graphics (barplots and boxplots) # |
6 # Output: VariableMetadata, Graphics (barplots and boxplots) | 6 # # |
7 # | 7 # Dependencies: RcheckLibrary.R # |
8 # Dependencies: RcheckLibrary.R | 8 # # |
9 # | 9 ######################################################################### |
10 #################################################################### | |
11 | 10 |
12 | 11 |
13 # Parameters (for dev) | 12 # Parameters (for dev) |
14 if(FALSE){ | 13 if(FALSE){ |
15 | 14 |
17 setwd("Y:\\Developpement\\Intensity check\\Pour tests") | 16 setwd("Y:\\Developpement\\Intensity check\\Pour tests") |
18 | 17 |
19 DM.name <- "DM_NA.tabular" | 18 DM.name <- "DM_NA.tabular" |
20 SM.name <- "SM_NA.tabular" | 19 SM.name <- "SM_NA.tabular" |
21 VM.name <- "vM_NA.tabular" | 20 VM.name <- "vM_NA.tabular" |
21 class.col <- "2" | |
22 type <- "One_class" | 22 type <- "One_class" |
23 class.col <- "2" | |
24 class1 <- "Blanks" | 23 class1 <- "Blanks" |
24 fold.frac <- "Top" | |
25 logarithm <- "log2" | |
25 VM.output <- "new_VM.txt" | 26 VM.output <- "new_VM.txt" |
26 graphs.output <- "Barplots_and_Boxplots.pdf" | 27 graphs.output <- "Barplots_and_Boxplots.pdf" |
27 } | 28 } |
28 | 29 |
29 intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){ | 30 |
30 # This function allows to check the intensities considering classes with a fold calculation, the number and | 31 |
31 # the proportion of NA | 32 |
33 intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm, | |
34 VM.output, graphs.output){ | |
35 | |
36 | |
37 # This function allows to check the intensities considering classes with a mean fold change calculation, | |
38 # the number and the proportion of missing values (NA) in dataMatrix | |
32 # | 39 # |
33 # Two options: | 40 # Two options: |
34 # - one class (selected by the user) against all the remaining samples ("One_class") | 41 # - one class (selected by the user) against all the remaining samples ("One_class") |
35 # - tests on each class ("Each_class") | 42 # - tests on each class ("Each_class") |
36 # | 43 # |
37 # Parameters: | 44 # Parameters: |
38 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access | 45 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access |
46 # class.col: number of the sampleMetadata's column with classes | |
39 # type: "One_class" or "Each_class" | 47 # type: "One_class" or "Each_class" |
40 # class.col: number of the sampleMetadata's column with classes | 48 # class1: name of the class, only if type="One_class" |
41 # class1: name of the class if type="One_class" | 49 # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom") |
42 # VM.output: output file's access (VM with the new columns) | 50 # logarithm: "log2", "log10" or "none" for log mean fold change |
43 # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values | 51 # VM.output: output file's access (VM with new columns) |
44 | 52 # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values |
45 | 53 |
46 | 54 |
47 | 55 |
48 # Input --------------------------------------------------------- | 56 |
57 | |
58 # Input --------------------------------------------------------------------------------------------------- | |
49 | 59 |
50 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) | 60 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) |
51 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) | 61 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) |
52 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) | 62 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) |
53 | 63 |
54 | 64 |
55 | 65 |
56 # Table match check with Rchecklibrary | 66 # Table match check with Rchecklibrary |
57 table.check <- match3(DM, SM, VM) | 67 table.check <- match3(DM, SM, VM) |
58 check.err(table.check) | 68 check.err(table.check) |
69 | |
59 | 70 |
60 | 71 |
61 rownames(DM) <- DM[,1] | 72 rownames(DM) <- DM[,1] |
62 var_names <- DM[,1] | 73 var_names <- DM[,1] |
63 DM <- DM[,-1] | 74 DM <- DM[,-1] |
64 DM <- data.frame(t(DM)) | 75 DM <- data.frame(t(DM)) |
65 | 76 |
66 class.col <- colnames(SM)[as.numeric(class.col)] | 77 class.col <- colnames(SM)[as.numeric(class.col)] |
67 | 78 |
68 # check class.col, class1 and the number of classes--------------- | 79 |
80 # check class.col, class1 and the number of classes --------------------------------------------------------- | |
69 | 81 |
70 if(!(class.col %in% colnames(SM))){ | 82 if(!(class.col %in% colnames(SM))){ |
71 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") | 83 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") |
72 } | 84 } |
73 | 85 |
74 c_class <- SM[,class.col] | 86 c_class <- SM[,class.col] |
75 c_class <- as.factor(c_class) | 87 c_class <- as.factor(c_class) |
76 nb_class <- nlevels(c_class) | 88 nb_class <- nlevels(c_class) |
77 classnames <- levels(c_class) | 89 classnames <- levels(c_class) |
78 | 90 |
91 if(nb_class < 2){ | |
92 err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n") | |
93 cat(err.1class) | |
94 } | |
79 | 95 |
80 if((nb_class > (nrow(SM))/3)){ | 96 if((nb_class > (nrow(SM))/3)){ |
81 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those | 97 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those |
82 with few samples \n") | 98 with few samples \n") |
83 cat(class.err) | 99 cat(class.err) |
84 } | 100 } |
85 | 101 |
102 | |
86 if(type == "One_class"){ | 103 if(type == "One_class"){ |
87 if(!(class1 %in% classnames)){ | 104 if(!(class1 %in% classnames)){ |
88 list.class1 <- c("\n Classes:",classnames,"\n") | 105 list.class1 <- c("\n Classes:",classnames,"\n") |
89 cat(list.class1) | 106 cat(list.class1) |
90 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) | 107 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) |
91 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") | 108 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") |
92 } | 109 } |
93 } | 110 } |
111 | |
94 | 112 |
95 #If type is "one_class", change others classes in "other" | 113 #If type is "one_class", change others classes in "other" |
96 if(type == "One_class"){ | 114 if(type == "One_class"){ |
97 for(i in 1:length(c_class)){ | 115 for(i in 1:length(c_class)){ |
98 if(c_class[i]!=class1){ | 116 if(c_class[i]!=class1){ |
99 c_class <- as.character(c_class) | 117 c_class <- as.character(c_class) |
100 c_class[i] <- "Other" | 118 c_class[i] <- "Other" |
101 c_class <- as.factor(c_class) | 119 c_class <- as.factor(c_class) |
102 nb_class <- nlevels(c_class) | 120 nb_class <- nlevels(c_class) |
103 classnames <- levels(c_class) | 121 classnames <- c(class1,"Other") |
104 | 122 |
105 } | 123 } |
106 } | 124 } |
107 } | 125 } |
108 | 126 |
109 DM <- cbind(DM,c_class) | 127 DM <- cbind(DM,c_class) |
110 | 128 |
111 # fold ------------------------------------------------------- | 129 |
112 n <- 1 | 130 |
113 fold <- data.frame() | 131 # fold calculation ------------------------------------------------------------------------------------------- |
114 for(j in 1:(nb_class-1)){ | 132 |
115 for(k in (j+1):nb_class) { | 133 if(nb_class >= 2){ |
116 for (i in 1:(length(DM)-1)){ | 134 |
117 fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/ | 135 |
118 mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)} | 136 fold <- data.frame() |
119 names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_") | 137 n <- 1 |
120 n <- n + 1} | 138 ratio1 <- NULL |
121 } | 139 ratio2 <- NULL |
122 | 140 |
123 # NA --------------------------------------------------------- | 141 if(type=="Each_class"){ |
142 fold.frac <- "Top" | |
143 } | |
144 | |
145 for(j in 1:(nb_class-1)){ | |
146 for(k in (j+1):nb_class) { | |
147 | |
148 if(fold.frac=="Bottom"){ | |
149 ratio1 <- classnames[k] | |
150 ratio2 <- classnames[j] | |
151 }else{ | |
152 ratio1 <- classnames[j] | |
153 ratio2 <- classnames[k] | |
154 } | |
155 | |
156 for (i in 1:(length(DM)-1)){ | |
157 fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/ | |
158 mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE) | |
159 if(logarithm=="log2"){ | |
160 fold[i,n] <- log2(fold[i,n]) | |
161 }else if(logarithm=="log10"){ | |
162 fold[i,n] <- log10(fold[i,n]) | |
163 } | |
164 } | |
165 names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_") | |
166 if(logarithm != "none"){ | |
167 names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_") | |
168 } | |
169 n <- n + 1} | |
170 } | |
171 | |
172 } | |
173 | |
174 # number and proportion of NA --------------------------------------------------------------------------------- | |
175 | |
124 calcul_NA <- data.frame() | 176 calcul_NA <- data.frame() |
125 pct_NA <- data.frame() | 177 pct_NA <- data.frame() |
126 for (i in 1:(length(DM)-1)){ | 178 for (i in 1:(length(DM)-1)){ |
127 for (j in 1:nb_class){ | 179 for (j in 1:nb_class){ |
128 n <- 0 | 180 n <- 0 |
132 n <- n + 1} | 184 n <- n + 1} |
133 calcul_NA[i,j] <- n | 185 calcul_NA[i,j] <- n |
134 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} | 186 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} |
135 } | 187 } |
136 } | 188 } |
137 names(calcul_NA) <- paste("Nb_NA",classnames, sep="_") | 189 names(calcul_NA) <- paste("NA",classnames, sep="_") |
138 names(pct_NA) <- paste("Pct_NA", classnames, sep="_") | 190 names(pct_NA) <- paste("Pct_NA", classnames, sep="_") |
139 | 191 |
140 # Alert message if there is no NA | 192 # Alert message if there is no NA in data matrix |
141 | 193 |
142 sumNA <- colSums(calcul_NA) | 194 sumNA <- colSums(calcul_NA) |
143 sum_total <- sum(sumNA) | 195 sum_total <- sum(sumNA) |
144 | |
145 alerte <- NULL | 196 alerte <- NULL |
146 if(sum_total==0){ | 197 if(sum_total==0){ |
147 alerte <- c(alerte, "Data Matrix contains no NA.\n") | 198 alerte <- c(alerte, "Data Matrix contains no NA.\n") |
148 } | 199 } |
149 | 200 |
150 if(length(alerte) != 0){ | 201 if(length(alerte) != 0){ |
151 cat(alerte,"\n") | 202 cat(alerte,"\n") |
152 } | 203 } |
153 | |
154 table_NA <- cbind(calcul_NA, pct_NA) | 204 table_NA <- cbind(calcul_NA, pct_NA) |
155 | 205 |
156 # check columns names ---------------------------------------- | 206 |
207 | |
208 # check columns names --------------------------------------------------------------------------------------- | |
209 | |
210 | |
211 VM.names <- colnames(VM) | |
157 | 212 |
158 # Fold | 213 # Fold |
159 VM.names <- colnames(VM) | 214 |
160 fold.names <- colnames(fold) | 215 if(nb_class >=2){ |
161 | 216 fold.names <- colnames(fold) |
162 for (i in 1:length(VM.names)){ | 217 |
163 for (j in 1:length(fold.names)){ | 218 for (i in 1:length(VM.names)){ |
164 if (VM.names[i]==fold.names[j]){ | 219 for (j in 1:length(fold.names)){ |
165 fold.names[j] <- paste(fold.names[j],"2", sep="_") | 220 if (VM.names[i]==fold.names[j]){ |
166 } | 221 fold.names[j] <- paste(fold.names[j],"2", sep="_") |
167 } | 222 } |
168 } | 223 } |
169 colnames(fold) <- fold.names | 224 } |
225 colnames(fold) <- fold.names | |
226 | |
227 VM <- cbind(VM,fold) | |
228 } | |
170 | 229 |
171 # NA | 230 # NA |
172 NA.names <- colnames(table_NA) | 231 NA.names <- colnames(table_NA) |
173 | 232 |
174 for (i in 1:length(VM.names)){ | 233 for (i in 1:length(VM.names)){ |
177 NA.names[j] <- paste(NA.names[j],"2", sep="_") | 236 NA.names[j] <- paste(NA.names[j],"2", sep="_") |
178 } | 237 } |
179 } | 238 } |
180 } | 239 } |
181 colnames(table_NA) <- NA.names | 240 colnames(table_NA) <- NA.names |
182 | 241 VM <- cbind(VM,table_NA) |
183 #for NA barplots --------------------------------------------- | 242 |
243 | |
244 #for NA barplots ------------------------------------------------------------------------------------------- | |
184 | 245 |
185 data_bp <- data.frame() | 246 data_bp <- data.frame() |
186 | 247 |
187 for (j in 1:ncol(pct_NA)){ | 248 for (j in 1:ncol(pct_NA)){ |
188 Nb_NA_0_20 <- 0 | 249 Nb_NA_0_20 <- 0 |
212 data_bp[2,j] <- Nb_NA_20_40 | 273 data_bp[2,j] <- Nb_NA_20_40 |
213 data_bp[3,j] <- Nb_NA_40_60 | 274 data_bp[3,j] <- Nb_NA_40_60 |
214 data_bp[4,j] <- Nb_NA_60_80 | 275 data_bp[4,j] <- Nb_NA_60_80 |
215 data_bp[5,j] <- Nb_NA_80_100 | 276 data_bp[5,j] <- Nb_NA_80_100 |
216 } | 277 } |
217 rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%") | 278 rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") |
218 colnames(data_bp) <- classnames | 279 colnames(data_bp) <- classnames |
219 data_bp <- as.matrix(data_bp) | 280 data_bp <- as.matrix(data_bp) |
220 | 281 |
221 | 282 |
222 # Output-------------------------------------------------------- | 283 # Output --------------------------------------------------------------------------------------------------- |
223 | 284 |
224 VM <- cbind(VM,fold,table_NA) | |
225 | 285 |
226 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) | 286 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) |
227 | 287 |
228 #graphics pdf | 288 #graphics pdf |
229 | 289 |
230 pdf(graphs.output) | 290 pdf(graphs.output) |
231 | 291 |
232 #Boxplots for NA | 292 #Barplots for NA |
233 | |
234 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) | 293 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) |
235 | 294 |
236 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") | 295 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") |
237 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) | 296 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) |
238 | 297 |
239 | |
240 stock=0 | 298 stock=0 |
241 for (i in 1:nrow(data_bp)){ | 299 for (i in 1:nrow(data_bp)){ |
242 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white") | 300 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7) |
243 stock <- stock+data_bp[i,] | 301 stock <- stock+data_bp[i,] |
244 } | 302 } |
245 | 303 |
246 | 304 |
247 #Boxplots for fold test | 305 #Boxplots for fold test |
248 for (j in 1:ncol(fold)){ | 306 |
249 title=paste(fold.names[j]) | 307 if(nb_class >= 2){ |
250 boxplot(fold[j], main=title) | 308 |
251 } | 309 clean_fold <- fold |
310 for(i in 1:nrow(clean_fold)){ | |
311 for(j in 1:ncol(clean_fold)){ | |
312 if(is.infinite(clean_fold[i,j])){ | |
313 clean_fold[i,j] <- NA | |
314 } | |
315 } | |
316 } | |
317 for (j in 1:ncol(clean_fold)){ | |
318 title <- paste(fold.names[j]) | |
319 boxplot(clean_fold[j], main=title) | |
320 } | |
321 } | |
252 | 322 |
253 dev.off() | 323 dev.off() |
254 | 324 |
255 } | 325 } |
256 | 326 |
257 | 327 |
258 # Function call--------------- | |
259 | |
260 #setwd("Y:\\Developpement\\Intensity check\\Pour tests") | |
261 #intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", | |
262 #"Barplots_and_Boxplots") | |
263 | |
264 |