Mercurial > repos > melpetera > intensity_checks
view Intchecks/Script_intensity_check.R @ 2:a7553caa2572 draft
Uploaded
author | melpetera |
---|---|
date | Mon, 14 Jan 2019 08:17:26 -0500 |
parents | 4973a2104cfd |
children | bdee2c2c484b |
line wrap: on
line source
######################################################################### # SCRIPT INTENSITY CHECK # # # # Input: Data Matrix, VariableMetadata, SampleMetadata # # Output: VariableMetadata, Graphics (barplots and boxplots) # # # # Dependencies: RcheckLibrary.R # # # ######################################################################### # Parameters (for dev) if(FALSE){ rm(list = ls()) setwd("Y:\\Developpement\\Intensity check\\Pour tests") DM.name <- "DM_NA.tabular" SM.name <- "SM_NA.tabular" VM.name <- "vM_NA.tabular" class.col <- "2" type <- "One_class" class1 <- "Blanks" fold.frac <- "Top" logarithm <- "log2" VM.output <- "new_VM.txt" graphs.output <- "Barplots_and_Boxplots.pdf" } intens_check <- function(DM.name, SM.name, VM.name, class.col, type, class1, fold.frac, logarithm, VM.output, graphs.output){ # This function allows to check the intensities considering classes with a mean fold change calculation, # the number and the proportion of missing values (NA) in dataMatrix # # Two options: # - one class (selected by the user) against all the remaining samples ("One_class") # - tests on each class ("Each_class") # # Parameters: # DM.name, SM.name, VM.name: dataMatrix, sampleMetadata, variableMetadata files access # class.col: number of the sampleMetadata's column with classes # type: "One_class" or "Each_class" # class1: name of the class, only if type="One_class" # fold.frac: if type="One class": class1/other ("Top") or other/class1 ("Bottom") # logarithm: "log2", "log10" or "none" for log mean fold change # VM.output: output file's access (VM with new columns) # graphs.output: pdf file's access with barplots for the proportion of NA and boxplots with the folds values # Input --------------------------------------------------------------------------------------------------- DM <- read.table(DM.name, header=TRUE, sep="\t", check.names=FALSE) SM <- read.table(SM.name, header=TRUE, sep="\t", check.names=FALSE) VM <- read.table(VM.name, header=TRUE, sep="\t", check.names=FALSE) # Table match check with Rchecklibrary table.check <- match3(DM, SM, VM) check.err(table.check) rownames(DM) <- DM[,1] var_names <- DM[,1] DM <- DM[,-1] DM <- data.frame(t(DM)) class.col <- colnames(SM)[as.numeric(class.col)] # check class.col, class1 and the number of classes --------------------------------------------------------- if(!(class.col %in% colnames(SM))){ stop("\n- - - - - - - - -\n", "The column ",class.col, " is not a part of the specify sample Metadata","\n- - - - - - - - -\n") } c_class <- SM[,class.col] c_class <- as.factor(c_class) nb_class <- nlevels(c_class) classnames <- levels(c_class) if(nb_class < 2){ err.1class <- c("\n The column",class.col, "contains only one class, fold calculation could not be executed \n") cat(err.1class) } if((nb_class > (nrow(SM))/3)){ class.err <- c("\n There are too many classes, think about reducing the number of classes and excluding those with few samples \n") cat(class.err) } if(type == "One_class"){ if(!(class1 %in% classnames)){ list.class1 <- c("\n Classes:",classnames,"\n") cat(list.class1) err.class1 <- c("The class ",class1, " does not appear in the column ", class.col) stop("\n- - - - - - - - -\n", err.class1,"\n- - - - - - - - -\n") } } #If type is "one_class", change others classes in "other" if(type == "One_class"){ for(i in 1:length(c_class)){ if(c_class[i]!=class1){ c_class <- as.character(c_class) c_class[i] <- "Other" c_class <- as.factor(c_class) nb_class <- nlevels(c_class) classnames <- c(class1,"Other") } } } DM <- cbind(DM,c_class) # fold calculation ------------------------------------------------------------------------------------------- if(nb_class >= 2){ fold <- data.frame() n <- 1 ratio1 <- NULL ratio2 <- NULL if(type=="Each_class"){ fold.frac <- "Top" } for(j in 1:(nb_class-1)){ for(k in (j+1):nb_class) { if(fold.frac=="Bottom"){ ratio1 <- classnames[k] ratio2 <- classnames[j] }else{ ratio1 <- classnames[j] ratio2 <- classnames[k] } for (i in 1:(length(DM)-1)){ fold[i,n] <- mean(DM[which(DM$c_class==ratio1),i], na.rm=TRUE)/ mean(DM[which(DM$c_class==ratio2),i], na.rm=TRUE) if(logarithm=="log2"){ fold[i,n] <- log2(fold[i,n]) }else if(logarithm=="log10"){ fold[i,n] <- log10(fold[i,n]) } } names(fold)[n] <- paste("fold",ratio1,"VS", ratio2, sep="_") if(logarithm != "none"){ names(fold)[n] <- paste(logarithm,names(fold)[n], sep="_") } n <- n + 1} } } # number and proportion of NA --------------------------------------------------------------------------------- calcul_NA <- data.frame() pct_NA <- data.frame() for (i in 1:(length(DM)-1)){ for (j in 1:nb_class){ n <- 0 new_DM <- DM[which(DM$c_class==classnames[j]),i] for(k in 1:length(new_DM)){ if (is.na(new_DM[k])){ n <- n + 1} calcul_NA[i,j] <- n pct_NA[i,j] <- (calcul_NA[i,j]/length(new_DM))*100} } } names(calcul_NA) <- paste("NA",classnames, sep="_") names(pct_NA) <- paste("Pct_NA", classnames, sep="_") # Alert message if there is no NA in data matrix sumNA <- colSums(calcul_NA) sum_total <- sum(sumNA) alerte <- NULL if(sum_total==0){ alerte <- c(alerte, "Data Matrix contains no NA.\n") } if(length(alerte) != 0){ cat(alerte,"\n") } table_NA <- cbind(calcul_NA, pct_NA) # check columns names --------------------------------------------------------------------------------------- VM.names <- colnames(VM) # Fold if(nb_class >=2){ fold.names <- colnames(fold) for (i in 1:length(VM.names)){ for (j in 1:length(fold.names)){ if (VM.names[i]==fold.names[j]){ fold.names[j] <- paste(fold.names[j],"2", sep="_") } } } colnames(fold) <- fold.names VM <- cbind(VM,fold) } # NA NA.names <- colnames(table_NA) for (i in 1:length(VM.names)){ for (j in 1:length(NA.names)){ if (VM.names[i]==NA.names[j]){ NA.names[j] <- paste(NA.names[j],"2", sep="_") } } } colnames(table_NA) <- NA.names VM <- cbind(VM,table_NA) #for NA barplots ------------------------------------------------------------------------------------------- data_bp <- data.frame() for (j in 1:ncol(pct_NA)){ Nb_NA_0_20 <- 0 Nb_NA_20_40 <- 0 Nb_NA_40_60 <- 0 Nb_NA_60_80 <- 0 Nb_NA_80_100 <- 0 for (i in 1:nrow(pct_NA)){ if ((0<=pct_NA[i,j])&(pct_NA[i,j]<20)){ Nb_NA_0_20=Nb_NA_0_20+1 } if ((20<=pct_NA[i,j])&(pct_NA[i,j]<40)){ Nb_NA_20_40=Nb_NA_20_40+1} if ((40<=pct_NA[i,j])&(pct_NA[i,j]<60)){ Nb_NA_40_60=Nb_NA_40_60+1} if ((60<=pct_NA[i,j])&(pct_NA[i,j]<80)){ Nb_NA_60_80=Nb_NA_60_80+1} if ((80<=pct_NA[i,j])&(pct_NA[i,j]<=100)){ Nb_NA_80_100=Nb_NA_80_100+1} } data_bp[1,j] <- Nb_NA_0_20 data_bp[2,j] <- Nb_NA_20_40 data_bp[3,j] <- Nb_NA_40_60 data_bp[4,j] <- Nb_NA_60_80 data_bp[5,j] <- Nb_NA_80_100 } rownames(data_bp) <- c("0%-20%", "20%-40%", "40%-60%", "60%-80%", "80%-100%") colnames(data_bp) <- classnames data_bp <- as.matrix(data_bp) # Output --------------------------------------------------------------------------------------------------- write.table(VM, VM.output,sep="\t", quote=FALSE, row.names=FALSE) #graphics pdf pdf(graphs.output) #Barplots for NA par(mar=c(5.1, 4.1, 4.1, 8.1), xpd=TRUE) bp=barplot(data_bp, col=rainbow(nrow(data_bp)), main="Proportion of NA", xlab="Classes", ylab="Variables") legend("topright", fill=rainbow(nrow(data_bp)),rownames(data_bp), inset=c(-0.3,0)) stock=0 for (i in 1:nrow(data_bp)){ text(bp, stock+data_bp[i,]/2, data_bp[i,], col="white", cex=0.7) stock <- stock+data_bp[i,] } #Boxplots for fold test if(nb_class >= 2){ clean_fold <- fold for(i in 1:nrow(clean_fold)){ for(j in 1:ncol(clean_fold)){ if(is.infinite(clean_fold[i,j])){ clean_fold[i,j] <- NA } } } for (j in 1:ncol(clean_fold)){ title <- paste(fold.names[j]) boxplot(clean_fold[j], main=title) } } dev.off() }