Mercurial > repos > melpetera > intensity_checks
diff Intchecks/Script_intensity_check.R @ 0:c2c2e1be904a draft
Uploaded
author | melpetera |
---|---|
date | Thu, 11 Oct 2018 05:33:19 -0400 |
parents | |
children | 4973a2104cfd |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Intchecks/Script_intensity_check.R Thu Oct 11 05:33:19 2018 -0400 @@ -0,0 +1,264 @@ +#################################################################### +# SCRIPT INTENSITY CHECK +# V1: Fold and NA +# +# Input: Data Matrix, VariableMetadata, SampleMetadata +# Output: VariableMetadata, Graphics (barplots and boxplots) +# +# Dependencies: RcheckLibrary.R +# +#################################################################### + + +# Parameters (for dev) +if(FALSE){ + + rm(list = ls()) + setwd("Y:\\Developpement\\Intensity check\\Pour tests") + + DM.name <- "DM_NA.tabular" + SM.name <- "SM_NA.tabular" + VM.name <- "vM_NA.tabular" + type <- "One_class" + class.col <- "2" + class1 <- "Blanks" + VM.output <- "new_VM.txt" + graphs.output <- "Barplots_and_Boxplots.pdf" +} + +intens_check <- function(DM.name, SM.name, VM.name, type, class.col, class1, VM.output, graphs.output){ + # This function allows to check the intensities considering classes with a fold calculation, the number and + # the proportion of NA + # + # Two options: + # - one class (selected by the user) against all the remaining samples ("One_class") + # - tests on each class ("Each_class") + # + # Parameters: + # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access + # type: "One_class" or "Each_class" + # class.col: number of the sampleMetadata's column with classes + # class1: name of the class if type="One_class" + # VM.output: output file's access (VM with the new columns) + # graphs.output: pdf with barplots for the proportion of NA and boxplots with the folds values + + + + + # Input --------------------------------------------------------- + + DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) + SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) + VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) + + + + # Table match check with Rchecklibrary + table.check <- match3(DM, SM, VM) + check.err(table.check) + + + rownames(DM) <- DM[,1] + var_names <- DM[,1] + DM <- DM[,-1] + DM <- data.frame(t(DM)) + + class.col <- colnames(SM)[as.numeric(class.col)] + + # check class.col, class1 and the number of classes--------------- + + if(!(class.col %in% colnames(SM))){ + stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") + } + + c_class <- SM[,class.col] + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + + + if((nb_class > (nrow(SM))/3)){ + class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those + with few samples \n") + cat(class.err) + } + + if(type == "One_class"){ + if(!(class1 %in% classnames)){ + list.class1 <- c("\n Classes:",classnames,"\n") + cat(list.class1) + err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) + stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") + } + } + + #If type is "one_class", change others classes in "other" + if(type == "One_class"){ + for(i in 1:length(c_class)){ + if(c_class[i]!=class1){ + c_class <- as.character(c_class) + c_class[i] <- "Other" + c_class <- as.factor(c_class) + nb_class <- nlevels(c_class) + classnames <- levels(c_class) + + } + } + } + + DM <- cbind(DM,c_class) + + # fold ------------------------------------------------------- + n <- 1 + fold <- data.frame() + for(j in 1:(nb_class-1)){ + for(k in (j+1):nb_class) { + for (i in 1:(length(DM)-1)){ + fold[i,n] <- mean(DM[which(DM$c_class==classnames[k]),i], na.rm=TRUE)/ + mean(DM[which(DM$c_class==classnames[j]),i], na.rm=TRUE)} + names(fold)[n] <- paste("fold",classnames[k],"VS", classnames[j], sep="_") + n <- n + 1} + } + + # NA --------------------------------------------------------- + calcul_NA <- data.frame() + pct_NA <- data.frame() + for (i in 1:(length(DM)-1)){ + for (j in 1:nb_class){ + n <- 0 + new_DM <- DM[which(DM$c_class==classnames[j]),i] + for(k in 1:length(new_DM)){ + if (is.na(new_DM[k])){ + n <- n + 1} + calcul_NA[i,j] <- n + pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} + } + } + names(calcul_NA) <- paste("Nb_NA",classnames, sep="_") + names(pct_NA) <- paste("Pct_NA", classnames, sep="_") + + # Alert message if there is no NA + + sumNA <- colSums(calcul_NA) + sum_total <- sum(sumNA) + + alerte <- NULL + if(sum_total==0){ + alerte <- c(alerte, "Data Matrix contains no NA.\n") + } + + if(length(alerte) != 0){ + cat(alerte,"\n") + } + + table_NA <- cbind(calcul_NA, pct_NA) + + # check columns names ---------------------------------------- + + # Fold + VM.names <- colnames(VM) + fold.names <- colnames(fold) + + for (i in 1:length(VM.names)){ + for (j in 1:length(fold.names)){ + if (VM.names[i]==fold.names[j]){ + fold.names[j] <- paste(fold.names[j],"2", sep="_") + } + } + } + colnames(fold) <- fold.names + + # NA + NA.names <- colnames(table_NA) + + for (i in 1:length(VM.names)){ + for (j in 1:length(NA.names)){ + if (VM.names[i]==NA.names[j]){ + NA.names[j] <- paste(NA.names[j],"2", sep="_") + } + } + } + colnames(table_NA) <- NA.names + + #for NA barplots --------------------------------------------- + + data_bp <- data.frame() + + for (j in 1:ncol(pct_NA)){ + Nb_NA_0_20 <- 0 + Nb_NA_20_40 <- 0 + Nb_NA_40_60 <- 0 + Nb_NA_60_80 <- 0 + Nb_NA_80_100 <- 0 + for (i in 1:nrow(pct_NA)){ + + if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){ + Nb_NA_0_20=Nb_NA_0_20+1 + } + + if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){ + Nb_NA_20_40=Nb_NA_20_40+1} + + if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){ + Nb_NA_40_60=Nb_NA_40_60+1} + + if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){ + Nb_NA_60_80=Nb_NA_60_80+1} + + if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){ + Nb_NA_80_100=Nb_NA_80_100+1} + } + data_bp[1,j] <- Nb_NA_0_20 + data_bp[2,j] <- Nb_NA_20_40 + data_bp[3,j] <- Nb_NA_40_60 + data_bp[4,j] <- Nb_NA_60_80 + data_bp[5,j] <- Nb_NA_80_100 + } + rownames(data_bp) <- c("0-20%", "20-40%", "40-60%", "60-80%", "80-100%") + colnames(data_bp) <- classnames + data_bp <- as.matrix(data_bp) + + + # Output-------------------------------------------------------- + + VM <- cbind(VM,fold,table_NA) + + write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) + + #graphics pdf + + pdf(graphs.output) + + #Boxplots for NA + + par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) + + bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") + legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) + + + stock=0 + for (i in 1:nrow(data_bp)){ + text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white") + stock <- stock+data_bp[i,] + } + + + #Boxplots for fold test + for (j in 1:ncol(fold)){ + title=paste(fold.names[j]) + boxplot(fold[j], main=title) + } + + dev.off() + +} + + +# Function call--------------- + +#setwd("Y:\\Developpement\\Intensity check\\Pour tests") +#intens_check("DM_NA.tabular", "SM_NA.tabular", "VM_NA.tabular", "One_class", "class", "Blanks", "VM_oneclass_NA.txt", +#"Barplots_and_Boxplots") + + \ No newline at end of file