1
|
1 #########################################################################
|
|
2 # SCRIPT INTENSITY CHECK #
|
|
3 # #
|
|
4 # Input: Data Matrix, VariableMetadata, SampleMetadata #
|
|
5 # Output: VariableMetadata, Graphics (barplots and boxplots) #
|
|
6 # #
|
|
7 # Dependencies: RcheckLibrary.R #
|
|
8 # #
|
|
9 #########################################################################
|
0
|
10
|
|
11
|
|
12 # Parameters (for dev)
|
|
13 if(FALSE){
|
|
14
|
|
15 rm(list = ls())
|
|
16 setwd("Y:\\Developpement\\Intensity check\\Pour tests")
|
|
17
|
|
18 DM.name <- "DM_NA.tabular"
|
|
19 SM.name <- "SM_NA.tabular"
|
|
20 VM.name <- "vM_NA.tabular"
|
1
|
21 class.col <- "2"
|
0
|
22 type <- "One_class"
|
|
23 class1 <- "Blanks"
|
1
|
24 fold.frac <- "Top"
|
|
25 logarithm <- "log2"
|
0
|
26 VM.output <- "new_VM.txt"
|
|
27 graphs.output <- "Barplots_and_Boxplots.pdf"
|
|
28 }
|
|
29
|
1
|
30
|
|
31
|
|
32
|
|
33 intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm,
|
|
34 VM.output, graphs.output){
|
|
35
|
|
36
|
|
37 # This function allows to check the intensities considering classes with a mean fold change calculation,
|
|
38 # the number and the proportion of missing values (NA) in dataMatrix
|
0
|
39 #
|
|
40 # Two options:
|
|
41 # - one class (selected by the user) against all the remaining samples ("One_class")
|
|
42 # - tests on each class ("Each_class")
|
|
43 #
|
|
44 # Parameters:
|
|
45 # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access
|
1
|
46 # class.col: number of the sampleMetadata's column with classes
|
0
|
47 # type: "One_class" or "Each_class"
|
1
|
48 # class1: name of the class, only if type="One_class"
|
|
49 # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom")
|
|
50 # logarithm: "log2", "log10" or "none" for log mean fold change
|
|
51 # VM.output: output file's access (VM with new columns)
|
|
52 # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values
|
|
53
|
0
|
54
|
|
55
|
|
56
|
|
57
|
1
|
58 # Input ---------------------------------------------------------------------------------------------------
|
0
|
59
|
|
60 DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
61 SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
62 VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE)
|
|
63
|
|
64
|
|
65
|
|
66 # Table match check with Rchecklibrary
|
|
67 table.check <- match3(DM, SM, VM)
|
|
68 check.err(table.check)
|
|
69
|
|
70
|
1
|
71
|
0
|
72 rownames(DM) <- DM[,1]
|
|
73 var_names <- DM[,1]
|
|
74 DM <- DM[,-1]
|
|
75 DM <- data.frame(t(DM))
|
|
76
|
|
77 class.col <- colnames(SM)[as.numeric(class.col)]
|
|
78
|
1
|
79
|
|
80 # check class.col, class1 and the number of classes ---------------------------------------------------------
|
0
|
81
|
|
82 if(!(class.col %in% colnames(SM))){
|
|
83 stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n")
|
|
84 }
|
|
85
|
|
86 c_class <- SM[,class.col]
|
|
87 c_class <- as.factor(c_class)
|
|
88 nb_class <- nlevels(c_class)
|
|
89 classnames <- levels(c_class)
|
|
90
|
1
|
91 if(nb_class < 2){
|
|
92 err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n")
|
|
93 cat(err.1class)
|
|
94 }
|
0
|
95
|
|
96 if((nb_class > (nrow(SM))/3)){
|
|
97 class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those
|
|
98 with few samples \n")
|
|
99 cat(class.err)
|
|
100 }
|
|
101
|
1
|
102
|
0
|
103 if(type == "One_class"){
|
|
104 if(!(class1 %in% classnames)){
|
|
105 list.class1 <- c("\n Classes:",classnames,"\n")
|
|
106 cat(list.class1)
|
|
107 err.class1 <- c("The class ",class1, " does not appear in the column ", class.col)
|
|
108 stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n")
|
|
109 }
|
|
110 }
|
|
111
|
1
|
112
|
0
|
113 #If type is "one_class", change others classes in "other"
|
|
114 if(type == "One_class"){
|
|
115 for(i in 1:length(c_class)){
|
|
116 if(c_class[i]!=class1){
|
|
117 c_class <- as.character(c_class)
|
|
118 c_class[i] <- "Other"
|
|
119 c_class <- as.factor(c_class)
|
|
120 nb_class <- nlevels(c_class)
|
1
|
121 classnames <- c(class1,"Other")
|
0
|
122
|
|
123 }
|
|
124 }
|
|
125 }
|
|
126
|
|
127 DM <- cbind(DM,c_class)
|
|
128
|
1
|
129
|
|
130
|
|
131 # fold calculation -------------------------------------------------------------------------------------------
|
|
132
|
|
133 if(nb_class >= 2){
|
|
134
|
|
135
|
|
136 fold <- data.frame()
|
|
137 n <- 1
|
|
138 ratio1 <- NULL
|
|
139 ratio2 <- NULL
|
|
140
|
|
141 if(type=="Each_class"){
|
|
142 fold.frac <- "Top"
|
|
143 }
|
|
144
|
|
145 for(j in 1:(nb_class-1)){
|
|
146 for(k in (j+1):nb_class) {
|
|
147
|
|
148 if(fold.frac=="Bottom"){
|
|
149 ratio1 <- classnames[k]
|
|
150 ratio2 <- classnames[j]
|
|
151 }else{
|
|
152 ratio1 <- classnames[j]
|
|
153 ratio2 <- classnames[k]
|
|
154 }
|
|
155
|
|
156 for (i in 1:(length(DM)-1)){
|
|
157 fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/
|
|
158 mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE)
|
|
159 if(logarithm=="log2"){
|
|
160 fold[i,n] <- log2(fold[i,n])
|
|
161 }else if(logarithm=="log10"){
|
|
162 fold[i,n] <- log10(fold[i,n])
|
|
163 }
|
|
164 }
|
|
165 names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_")
|
|
166 if(logarithm != "none"){
|
|
167 names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_")
|
|
168 }
|
|
169 n <- n + 1}
|
|
170 }
|
|
171
|
0
|
172 }
|
|
173
|
1
|
174 # number and proportion of NA ---------------------------------------------------------------------------------
|
|
175
|
0
|
176 calcul_NA <- data.frame()
|
|
177 pct_NA <- data.frame()
|
|
178 for (i in 1:(length(DM)-1)){
|
|
179 for (j in 1:nb_class){
|
|
180 n <- 0
|
|
181 new_DM <- DM[which(DM$c_class==classnames[j]),i]
|
|
182 for(k in 1:length(new_DM)){
|
|
183 if (is.na(new_DM[k])){
|
|
184 n <- n + 1}
|
|
185 calcul_NA[i,j] <- n
|
|
186 pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100}
|
|
187 }
|
|
188 }
|
1
|
189 names(calcul_NA) <- paste("NA",classnames, sep="_")
|
0
|
190 names(pct_NA) <- paste("Pct_NA", classnames, sep="_")
|
|
191
|
1
|
192 # Alert message if there is no NA in data matrix
|
0
|
193
|
|
194 sumNA <- colSums(calcul_NA)
|
|
195 sum_total <- sum(sumNA)
|
|
196 alerte <- NULL
|
|
197 if(sum_total==0){
|
|
198 alerte <- c(alerte, "Data Matrix contains no NA.\n")
|
|
199 }
|
|
200
|
|
201 if(length(alerte) != 0){
|
|
202 cat(alerte,"\n")
|
|
203 }
|
|
204 table_NA <- cbind(calcul_NA, pct_NA)
|
|
205
|
1
|
206
|
|
207
|
|
208 # check columns names ---------------------------------------------------------------------------------------
|
|
209
|
|
210
|
|
211 VM.names <- colnames(VM)
|
0
|
212
|
|
213 # Fold
|
1
|
214
|
|
215 if(nb_class >=2){
|
|
216 fold.names <- colnames(fold)
|
0
|
217
|
1
|
218 for (i in 1:length(VM.names)){
|
|
219 for (j in 1:length(fold.names)){
|
|
220 if (VM.names[i]==fold.names[j]){
|
|
221 fold.names[j] <- paste(fold.names[j],"2", sep="_")
|
|
222 }
|
0
|
223 }
|
|
224 }
|
1
|
225 colnames(fold) <- fold.names
|
|
226
|
|
227 VM <- cbind(VM,fold)
|
0
|
228 }
|
|
229
|
|
230 # NA
|
|
231 NA.names <- colnames(table_NA)
|
|
232
|
|
233 for (i in 1:length(VM.names)){
|
|
234 for (j in 1:length(NA.names)){
|
|
235 if (VM.names[i]==NA.names[j]){
|
|
236 NA.names[j] <- paste(NA.names[j],"2", sep="_")
|
|
237 }
|
|
238 }
|
|
239 }
|
|
240 colnames(table_NA) <- NA.names
|
1
|
241 VM <- cbind(VM,table_NA)
|
0
|
242
|
1
|
243
|
|
244 #for NA barplots -------------------------------------------------------------------------------------------
|
0
|
245
|
|
246 data_bp <- data.frame()
|
|
247
|
|
248 for (j in 1:ncol(pct_NA)){
|
|
249 Nb_NA_0_20 <- 0
|
|
250 Nb_NA_20_40 <- 0
|
|
251 Nb_NA_40_60 <- 0
|
|
252 Nb_NA_60_80 <- 0
|
|
253 Nb_NA_80_100 <- 0
|
|
254 for (i in 1:nrow(pct_NA)){
|
|
255
|
|
256 if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){
|
|
257 Nb_NA_0_20=Nb_NA_0_20+1
|
|
258 }
|
|
259
|
|
260 if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){
|
|
261 Nb_NA_20_40=Nb_NA_20_40+1}
|
|
262
|
|
263 if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){
|
|
264 Nb_NA_40_60=Nb_NA_40_60+1}
|
|
265
|
|
266 if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){
|
|
267 Nb_NA_60_80=Nb_NA_60_80+1}
|
|
268
|
|
269 if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){
|
|
270 Nb_NA_80_100=Nb_NA_80_100+1}
|
|
271 }
|
|
272 data_bp[1,j] <- Nb_NA_0_20
|
|
273 data_bp[2,j] <- Nb_NA_20_40
|
|
274 data_bp[3,j] <- Nb_NA_40_60
|
|
275 data_bp[4,j] <- Nb_NA_60_80
|
|
276 data_bp[5,j] <- Nb_NA_80_100
|
|
277 }
|
1
|
278 rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%")
|
0
|
279 colnames(data_bp) <- classnames
|
|
280 data_bp <- as.matrix(data_bp)
|
|
281
|
|
282
|
1
|
283 # Output ---------------------------------------------------------------------------------------------------
|
0
|
284
|
|
285
|
|
286 write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE)
|
|
287
|
|
288 #graphics pdf
|
|
289
|
|
290 pdf(graphs.output)
|
|
291
|
1
|
292 #Barplots for NA
|
0
|
293 par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE)
|
|
294
|
|
295 bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables")
|
|
296 legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0))
|
|
297
|
|
298 stock=0
|
|
299 for (i in 1:nrow(data_bp)){
|
1
|
300 text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7)
|
0
|
301 stock <- stock+data_bp[i,]
|
|
302 }
|
|
303
|
|
304
|
|
305 #Boxplots for fold test
|
1
|
306
|
|
307 if(nb_class >= 2){
|
|
308
|
|
309 clean_fold <- fold
|
|
310 for(i in 1:nrow(clean_fold)){
|
|
311 for(j in 1:ncol(clean_fold)){
|
|
312 if(is.infinite(clean_fold[i,j])){
|
|
313 clean_fold[i,j] <- NA
|
|
314 }
|
|
315 }
|
|
316 }
|
|
317 for (j in 1:ncol(clean_fold)){
|
|
318 title <- paste(fold.names[j])
|
|
319 boxplot(clean_fold[j], main=title)
|
|
320 }
|
|
321 }
|
0
|
322
|
|
323 dev.off()
|
|
324
|
|
325 }
|
|
326
|
|
327 |