Mercurial > repos > ynewton > matrix_normalization
view normalize.r @ 3:6e77048d4d88 draft default tip
Uploaded
author | ynewton |
---|---|
date | Mon, 17 Dec 2012 15:26:13 -0500 |
parents | f3fe0f64fe91 |
children |
line wrap: on
line source
#!/usr/bin/Rscript #Yulia Newton, last updated 20121217 v.3 #usage, options and doc goes here argspec <- c("normalize.r - takes any flat file and normalizes the rows or the columns using various normalizations (median_shift, mean_shift, t_statistic (z-score), exp_fit, normal_fit, weibull_0.5_fit, weibull_1_fit, weibull_1.5_fit, weibull_5_fit). Requires a single header line and a single cloumn of annotation. Usage: normalize.r input.tab norm_type norm_by > output.tab Example: Rscript normalize.r test_matrix.tab median_shift column > output.tab Rscript normalize.r test_matrix.tab mean_shift row normals.tab > output.tab Options: input matrix (annotated by row and column names) normalization type; available options: median_shift - shifts all values by the median or the row/column if no normals are specified, otherwise shifts by the median of normals mean_shift - shifts all values by the mean or the row/column if no normals are specified, otherwise shifts by the mean of normals t_statistic - converts all values to z-scores; if normals are specified then converts to z-scores within normal and non-normal classes separately exponential_fit - (only by column) ranks data and transforms exponential CDF normal_fit - (only by column) ranks data and transforms normal CDF weibull_0.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 0.5 weibull_1_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1 weibull_1.5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 1.5 weibull_5_fit - (only by column) ranks data and transforms Weibull CDF with scale parameter = 1 and shape parameter = 5 normalization by: row column normals_file is an optional parameter which contains either a list of column headers from the input matrix, which should be considered as normals, or a matrix of normal samples output file is specified through redirect character >") read_matrix <- function(in_file){ header <- strsplit(readLines(con=in_file, n=1), "\t")[[1]] cl.cols<- 1:length(header) > 1 data_matrix.df <- read.delim(in_file, header=TRUE, row.names=NULL, stringsAsFactors=FALSE, na.strings="NA", check.names=FALSE) data_matrix <- as.matrix(data_matrix.df[,cl.cols]) rownames(data_matrix) <- data_matrix.df[,1] return(data_matrix) #read_mtrx <- as.matrix(read.table(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA")) #separate on white characters #read_mtrx[,1] #return(as.matrix(read.table(in_file, header=TRUE, sep="", row.names=1))) #separate on white characters #mtrx <- read.delim(in_file, header=TRUE, sep="", row.names=NULL, stringsAsFactors=FALSE, na.strings="NA") #print(mtrx[1,]) } write_matrix <- function(data_matrix){ header <- append(c("Genes"), colnames(data_matrix)) write.table(t(header), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) write.table(data_matrix, stdout(), quote=FALSE, sep="\t", row.names=TRUE, col.names=FALSE) } read_normals <- function(in_file){ #return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))[, 1]) return(as.matrix(read.table(in_file, header=FALSE, sep="", as.is = TRUE))) } normalize <- function(data_matrix, norm_type, normals_list, tumors_list){ if(norm_type == 'MEDIAN_SHIFT'){ return(shift(data_matrix, 'MEDIAN', normals_list, tumors_list)) } else if(norm_type == 'MEAN_SHIFT'){ return(shift(data_matrix, 'MEAN', normals_list, tumors_list)) } else if(norm_type == 'T_STATISTIC'){ return(compute_z_score(data_matrix, normals_list, tumors_list)) } else if(norm_type == 'EXPONENTIAL_FIT'){ return(fit_distribution(data_matrix, 'EXPONENTIAL')) } else if(norm_type == 'NORMAL_FIT'){ return(fit_distribution(data_matrix, 'NORMAL')) } else if(norm_type == 'WEIBULL_0.5_FIT'){ return(fit_distribution(data_matrix, 'WEIBULL_0.5')) } else if(norm_type == 'WEIBULL_1_FIT'){ return(fit_distribution(data_matrix, 'WEIBULL_1')) } else if(norm_type == 'WEIBULL_1.5_FIT'){ return(fit_distribution(data_matrix, 'WEIBULL_1.5')) } else if(norm_type == 'WEIBULL_5_FIT'){ return(fit_distribution(data_matrix, 'WEIBULL_5')) }else{ write("ERROR: unknown normalization type", stderr()); q(); } } shift <- function(data_matrix, shift_type, normals_list, tumors_list){ return(t(apply(data_matrix, 1, shift_normalize_row, norm_type=shift_type, normals_list=normals_list, tumors_list=tumors_list))) } shift_normalize_row <- function(data_row, norm_type, normals_list, tumors_list){ if(length(normals_list) == 0){ #no normals are specified if(norm_type == 'MEDIAN'){ row_stat <- median(data_row) } else if(norm_type == 'MEAN'){ row_stat <- mean(data_row) } return(unlist(lapply(data_row, function(x){return(x - row_stat);}))) } else{ #normals are specified normal_values <- data_row[normals_list] tumor_columns <- data_row[tumors_list] if(norm_type == 'MEDIAN'){ row_stat <- median(normal_values) } else if(norm_type == 'MEAN'){ row_stat <- mean(normal_values) } return(unlist(lapply(tumor_columns, function(x){return(x - row_stat);}))) } } compute_z_score <- function(data_matrix, normals_list, tumors_list){ return(t(apply(data_matrix, 1, t_stat_normalize_row, normals_list=normals_list, tumors_list=tumors_list))) } t_stat_normalize_row <- function(data_row, normals_list, tumors_list){ if(length(normals_list) == 0){ #no normals are specified row_mean <- mean(data_row) row_sd <- sd(data_row) return(unlist(lapply(data_row, function(x){return((x - row_mean)/row_sd);}))) } else{ #normals are specified normal_values <- data_row[normals_list] normal_mean <- mean(normal_values) normal_sd <- sd(normal_values) normalized_normals <- unlist(lapply(normal_values, function(x){return((x - normal_mean)/normal_sd);})) tumor_values <- data_row[tumors_list] normalized_tumors <- unlist(lapply(tumor_values, function(x){return((x - normal_mean)/normal_sd);})) return(append(normalized_normals, normalized_tumors)) } } rankNA <- function(col){ #originally written by Dan Carlin col[!is.na(col)]<-(rank(col[!is.na(col)])/sum(!is.na(col)))-(1/sum(!is.na(col))) return(col) } fit_distribution <- function(data_matrix, dist){ if(dist == 'EXPONENTIAL'){ ranked_data_matrix <- apply(data_matrix,1,rankNA) #idea by Dan Carlin #write.table(c("ranked data:"), stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) #write.table(ranked_data_matrix, stdout(), quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) return(apply(ranked_data_matrix, 1, qexp)) } else if(dist == 'NORMAL'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) return(apply(ranked_data_matrix, c(1,2), qnorm, mean=0, sd=1)) } else if(dist == 'WEIBULL_0.5'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=0.5)) } else if(dist == 'WEIBULL_1'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1)) } else if(dist == 'WEIBULL_1.5'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=1.5)) } else if(dist == 'WEIBULL_5'){ ranked_data_matrix <- apply(data_matrix,2,rankNA) return(apply(ranked_data_matrix, c(1,2), qweibull, scale=1, shape=5)) } } main <- function(argv) { #determine if correct number of arguments are specified and if normals are specified with_normals = FALSE if(length(argv) == 1){ if(argv==c('--help')){ write(argspec, stderr()); q(); } } if(!(length(argv) == 3 || length(argv) == 4)){ write("ERROR: invalid number of arguments is specified", stderr()); q(); } if(length(argv) == 4){ with_normals = TRUE normals_file <- argv[4] } #store command line arguments in variables: input_file <- argv[1] norm_type <- toupper(argv[2]) norm_by <- toupper(argv[3]) #input_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix.tab" #norm_type <- "MEAN_SHIFT" #norm_by <- "ROW" #normals_file <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/test_matrix2.tab" #normals_file2 <- "/Users/ynewton/school/ucsc/projects/stuart_lab/data_normalization/normals.tab" #read the input file(s): data_matrix <- read_matrix(input_file) if(with_normals){ normals <- read_normals(normals_file) if(length(colnames(normals)) == 1){ normals_indices <- which(colnames(data_matrix) %in% normals) tumor_indices <- which(!(colnames(data_matrix) %in% normals)) }else{ normals_numeric <- normals[2:length(normals[,1]),2:length(normals[1,])] normals_numeric <- apply(normals_numeric, 2, as.numeric) rownames(normals_numeric) <- normals[,1][2:length(normals[,1])] colnames(normals_numeric) <- normals[1,][2:length(normals[1,])] #select only the intersection of the rows between the two matrices: normals_numeric <- normals_numeric[rownames(normals_numeric) %in% rownames(data_matrix),] data_matrix <- data_matrix[rownames(data_matrix) %in% rownames(normals_numeric),] normals_numeric <- normals_numeric[order(rownames(normals_numeric)),] data_matrix <- data_matrix[order(rownames(data_matrix)),] combined_matrix <- cbind(data_matrix, normals_numeric) tumor_indices <- c(1:length(data_matrix[1,])) normals_indices <- c(length(tumor_indices)+1:length(normals_numeric[1,])) data_matrix <- combined_matrix } }else{ normals_indices <- c() tumor_indices <- c() } #if normalize by columns then transpose the matrix: if(norm_by == 'COLUMN'){ data_matrix <- t(data_matrix) } #normalize: data_matrix <- normalize(data_matrix, norm_type, normals_indices, tumor_indices) #if normalize by columns then transpose the matrix again since we normalized the transposed matrix by row: if(norm_by == 'COLUMN'){ data_matrix <- t(data_matrix) } write_matrix(data_matrix) } main(commandArgs(TRUE))