Mercurial > repos > pieterlukasse > prims_metabolomics
diff Rscripts/ridb-regression.R @ 0:9d5f4f5f764b
Initial commit to toolshed
author | pieter.lukasse@wur.nl |
---|---|
date | Thu, 16 Jan 2014 13:10:00 +0100 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/Rscripts/ridb-regression.R Thu Jan 16 13:10:00 2014 +0100 @@ -0,0 +1,208 @@ +## +# +# Performs regression analysis using either 3rd degree polynomial- or linear-method +# +## + +# Commandline arguments +args <- commandArgs(TRUE) +if (length(args) < 7) + stop(cat("Missing arguments, usage:\n\tRscript ridb-regression.R RI-database ", + "ouput_file logfile min_residuals range_mod pvalue rsquared method ", + "plot(yes/no) plot_archive")) + +ridb <- args[1] +out_file <- args[2] +logfile <- args[3] +min_residuals <- as.integer(args[4]) +range_mod <- as.integer(args[5]) +pvalue <- as.double(args[6]) +rsquared <- as.double(args[7]) +method <- args[8] +plot <- tolower(args[9]) +if (plot == 'true') + plot_archive = args[10] + +# Do not show warnings etc. +sink(file='/dev/null') + +progress <- c() +logger <- function(logdata) { + ## Logs progress, adds a timestamp for each event + #cat(paste(Sys.time(), "\t", logdata, "\n", sep="")) ## DEBUG + progress <<- c(progress, paste(Sys.time(), "\t", logdata, sep="")) +} + +logger("Reading Retention Index Database..") + +# Read Retention Index Database +ridb <- read.csv(ridb, header=TRUE, sep="\t") +logger(paste("\t", nrow(ridb), "records read..")) +# Get a unique list +gc_columns <- unique(as.vector(as.matrix(ridb['Column.name'])[,1])) +cas_numbers <- unique(as.vector(as.matrix(ridb['CAS'])[,1])) + +add_poly_fit <- function(fit, gc1_index, gc2_index, range) { + pval = anova.lm(fit)$Pr + r.squared = summary(fit)$r.squared + + data = rep(NA, 11) + # Append results to matrix + data[1] = gc_columns[gc1_index] # Column 1 + data[2] = gc_columns[gc2_index] # Column 2 + data[3] = coefficients(fit)[1] # The 4 coefficients + data[4] = coefficients(fit)[2] + data[5] = coefficients(fit)[3] + data[6] = coefficients(fit)[4] + data[7] = range[1] # Left limit + data[8] = range[2] # Right limit + data[9] = length(fit$residuals) # Number of datapoints analysed + data[10] = pval[1] # p-value for resulting fitting + data[11] = r.squared # R-squared + return(data) +} + + +add_linear_fit <- function(fit, gc1_index, gc2_index, range) { + pval = anova.lm(fit)$Pr + r.squared = summary(fit)$r.squared + + data = rep(NA, 7) + # Append results to matrix + data[1] = gc_columns[gc1_index] # Column 1 + data[2] = gc_columns[gc2_index] # Column 2 + data[3] = coefficients(fit)[1] # The 4 coefficients + data[4] = coefficients(fit)[2] + data[7] = length(fit$residuals) # Number of datapoints analysed + data[8] = pval[1] # p-value for resulting fitting + data[9] = r.squared # R-squared + return(data) +} + + +add_fit <- function(fit, gc1_index, gc2_index, range, method) { + if (method == 'poly') + return(add_poly_fit(fit, gc1_index, gc2_index, range)) + else + return(add_linear_fit(fit, gc1_index, gc2_index, range)) +} + + +plot_fit <- function(ri1, ri2, gc1_index, gc2_index, coeff, range, method) { + if (method == 'poly') + pol <- function(x) coeff[4]*x^3 + coeff[3]*x^2 + coeff[2]*x + coeff[1] + else + pol <- function(x) coeff[2]*x + coeff[1] + pdf(paste('regression_model_', + make.names(gc_columns[gc1_index]), '_vs_', + make.names(gc_columns[gc2_index]), '.pdf', sep='')) + curve(pol, 250:3750, col="red", lwd=2.5, main='Regression Model', xlab=gc_columns[gc1_index], + ylab=gc_columns[gc2_index], xlim=c(250, 3750), ylim=c(250, 3750)) + points(ri1, ri2, lwd=0.4) + # Add vertical lines showing left- and right limits when using poly method + if (method == 'poly') + abline(v=range, col="grey", lwd=1.5) + dev.off() +} + +# Initialize output dataframe +if (method == 'poly') { + m <- data.frame(matrix(ncol = 11, nrow = 10)) +} else { + m <- data.frame(matrix(ncol = 9, nrow = 10)) +} + + +get_fit <- function(gc1, gc2, method) { + if (method == 'poly') + return(lm(gc1 ~ poly(gc2, 3, raw=TRUE))) + else + return(lm(gc1 ~ gc2)) +} + +# Permutate +k <- 1 +logger(paste("Permutating (with ", length(gc_columns), " GC-columns)..", sep="")) + +for (i in 1:(length(gc_columns)-1)) { + logger(paste("\tCalculating model for ", gc_columns[i], "..", sep="")) + breaks <- 0 + for (j in (i+1):length(gc_columns)) { + col1 = ridb[which(ridb['Column.name'][,1] == gc_columns[i]),] + col2 = ridb[which(ridb['Column.name'][,1] == gc_columns[j]),] + + # Find CAS numbers for which both columns have data (intersect) + cas_intersect = intersect(col1[['CAS']], col2[['CAS']]) + + # Skip if number of shared CAS entries is < cutoff + if (length(cas_intersect) < min_residuals) { + breaks = breaks + 1 + next + } + # Gather Retention Indices + col1_data = col1[['RI']][match(cas_intersect, col1[['CAS']])] + col2_data = col2[['RI']][match(cas_intersect, col2[['CAS']])] + + # Calculate the range within which regression is possible (and move if 'range_mod' != 0) + range = c(min(c(min(col1_data), min(col2_data))), max(c(max(col1_data), max(col2_data)))) + if (range_mod != 0) { + # Calculate percentage and add/subtract from range + perc = diff(range) / 100 + perc_cutoff = range_mod * perc + range = as.integer(range + c(perc_cutoff, -perc_cutoff)) + } + + # Calculate model for column1 vs column2 and plot if requested + fit = get_fit(col1_data, col2_data, method) + m[k,] = add_fit(fit, i, j, range, method) + + if (plot == 'true') + plot_fit(col1_data, col2_data, i, j, coefficients(fit), range, method) + + # Calculate model for column2 vs column1 and plot if requested + fit = get_fit(col2_data, col1_data, method) + m[k + 1,] = add_fit(fit, j, i, range, method) + + if (plot == 'true') + plot_fit(col2_data, col1_data, j, i, coefficients(fit), range, method) + + k = k + 2 + } + logger(paste("\t\t", breaks, " comparisons have been skipped due to nr. of datapoints < cutoff", sep="")) +} + +# Filter on pvalue and R-squared +logger("Filtering on pvalue and R-squared..") +if (method == 'poly') { + pval_index <- which(m[,10] < pvalue) + rsquared_index <- which(m[,11] > rsquared) +} else { + pval_index <- which(m[,8] < pvalue) + rsquared_index <- which(m[,9] > rsquared) +} +logger(paste(nrow(m) - length(pval_index), " models discarded due to pvalue > ", pvalue, sep="")) + +logger(paste(nrow(m) - length(rsquared_index), " models discarded due to R-squared < ", rsquared, sep="")) + +# Remaining rows +index = unique(c(pval_index, rsquared_index)) + +# Reduce dataset +m = m[index,] +sink() + +# Place plots in the history as a ZIP file +if (plot == 'true') { + logger("Creating archive with model graphics..") + system(paste("zip -9 -r models.zip *.pdf > /dev/null", sep="")) + system(paste("cp models.zip ", plot_archive, sep="")) +} + +# Save dataframe as tab separated file +logger("All done, saving data..") +header = c("Column1", "Column2", "Coefficient1", "Coefficient2", "Coefficient3", "Coefficient4", + "LeftLimit", "RightLimit", "Residuals", "pvalue", "Rsquared") +if (method != 'poly') + header = header[c(1:4, 7:11)] +write(progress, logfile) +write.table(m, file=out_file, sep="\t", quote=FALSE, col.names=header, row.names=FALSE)