diff find_qPCR_regions.R @ 0:fd3ea97a96bc draft
planemo upload commit 103cb51ec368438642504c3f98b76c363db478bb
author:    kyost
date:      Sat, 28 Apr 2018 15:07:26 -0400
parents:   (none)
children:  fbfe7b785ea7
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/find_qPCR_regions.R	Sat Apr 28 15:07:26 2018 -0400
@@ -0,0 +1,121 @@
+## Command to run tool:
+# Rscript --vanilla find_qPCR_regions.R o.coverage.bed f9.coverage.bed lib_sizes.txt cor_cutoff cov_cutoff output_file
+
+# Set up R error handling to go to stderr
+options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)})
+
+# Avoid crashing Galaxy with a UTF-8 error on German LC settings
+loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
+
+args <- commandArgs(TRUE)
+
+o.coverage <- args[1]
+f9.coverage <- args[2]
+lib_sizes <- args[3]
+corr_cutoff <- as.numeric(args[4])
+cov_cutoff <- as.numeric(args[5])
+output_file <- args[6]
+
+o.coverage.data <- read.delim(o.coverage, header=FALSE)
+f9.coverage.data <- read.delim(f9.coverage, header=FALSE)
+lib.sizes.data <- read.delim(lib_sizes, header=FALSE)
+
+m <- ncol(o.coverage.data)
+
+# normalize library sizes to 50 million reads
+lib.sizes.data <- lib.sizes.data/50000000
+# normalize peak reads and spanning reads by library size
+o.coverage.data[,5:m] <- t(apply(o.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))
+f9.coverage.data[,5:m] <- t(apply(f9.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))
+
+# returns a list of data frames containing the windows for each peak
+split_coverage <- function(a) {
+  returnlist <- list()
+  temp <- a[1,]
+  for (i in 2:nrow(a)) {
+    if (temp[1,4] == strsplit(as.character(a[i,4]),"_window")[[1]][1]){
+      temp <- rbind(temp, a[i,])
+    }else{
+      returnlist[[length(returnlist)+1]] <- temp
+      temp <- a[i,]
+    }
+  }
+  returnlist[[length(returnlist)+1]] <- temp
+  return(returnlist)
+}
+
+make_cor_plot <- function(a, b, corr_cut, cov_cut, output_file) { # a=o.coverage b=f9.coverage
+
+  asplit <- split_coverage(a)
+  bsplit <- split_coverage(b)
+
+  combined_regions <- data.frame()
+
+  for (i in 1:length(asplit)){
+    n <- nrow(asplit[[i]])-1
+    m <- ncol(asplit[[i]])
+
+    a_split <- asplit[[i]]
+    b_split <- bsplit[[i]]
+
+    # calculate correlation between total peak reads and spanning reads in each window
+    corb <- data.frame(cor(t(a_split[1,5:m])
+                           , t(b_split[-1,5:m])
+                           , method = "pearson"))
+
+    data <- data.frame(t(corb), row.names = NULL, stringsAsFactors = FALSE)
+
+    data <- cbind(rep(c(1:n))
+                  , a_split[-1,1:4]
+                  , data)
+    colnames(data) <- c("position","chr", "start", "stop", "name", "correlation")
+    rownames(data) <- NULL
+    a_split$average <- rowMeans(a_split[,5:m])
+    b_split$average <- rowMeans(b_split[,5:m])
+    data2 <- data.frame(cbind(c(1:n), a_split[-1,m+1]), stringsAsFactors=FALSE)
+    data3 <- data.frame(cbind(c(1:n), b_split[-1,m+1]), stringsAsFactors=FALSE)
+    colnames(data2) <- c("position","averageReadDepth")
+    colnames(data3) <- c("position","averageReadDepth")
+
+    # keep windows for which correlation with total peak reads is above the threshold and
+    # the number of spanning reads is above the cutoff
+    regions <- data[which(data$correlation > corr_cut & data3$averageReadDepth > cov_cut),]
+    regions <- regions[,2:4]
+
+    # collapse windows to non-overlapping regions
+    if (nrow(regions)>0){
+      regions$V4 <- paste(as.character(a_split[1,4]),"_region1",sep="")
+      collapsed_regions <- regions[1,]
+      nregion <- 1
+      if (nrow(regions)>=2){
+        for (j in 2:nrow(regions)) {
+          if (as.numeric(regions[j,2]) <= (as.numeric(regions[j-1,3]))) {
+            collapsed_regions[nrow(collapsed_regions),3] <- regions[j,3]
+          }else{
+            collapsed_regions <- rbind(collapsed_regions, regions[j,])
+            nregion <- nregion + 1
+            collapsed_regions[nrow(collapsed_regions),4] <-
+              paste(as.character(a_split[1,4])
+                    , "_region"
+                    , as.character(nregion)
+                    , sep="")
+          }
+        }
+      }
+      rownames(collapsed_regions) <- NULL
+
+      combined_regions <- rbind(combined_regions, collapsed_regions)
+    }
+
+  }
+
+  write.table(combined_regions
+              , output_file
+              , sep = "\t"
+              , col.names = FALSE
+              , row.names = FALSE
+              , quote = FALSE)
+  print("Successfully found optimal ATAC-qPCR regions.")
+}
+
+make_cor_plot(o.coverage.data, f9.coverage.data, corr_cutoff, cov_cutoff, output_file)
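
For context, the script expects its two coverage inputs to share the same layout: columns 1-4 hold chr, start, stop, and name (one "<peak>" row followed by its "<peak>_windowN" rows), and columns 5 onward hold one read-count column per sample; lib_sizes.txt supplies one total read count per sample, which the script uses to normalize counts to a 50-million-read library. The R sketch below is illustrative only; the object names and values are hypothetical and are not part of the tool.

# Hypothetical example of the coverage layout implied by the script's indexing
# (columns 1-4: chr, start, stop, name; columns 5+: one read-count column per sample).
example.coverage <- data.frame(
  V1 = c("chr1", "chr1", "chr1"),                      # chromosome
  V2 = c(1000, 1000, 1250),                            # start
  V3 = c(2000, 1500, 1750),                            # stop
  V4 = c("peak1", "peak1_window1", "peak1_window2"),   # peak row, then its windows
  V5 = c(120, 35, 60),                                 # read counts, sample 1
  V6 = c(98, 28, 51)                                   # read counts, sample 2
)

# Hypothetical lib_sizes.txt contents: one total read count per sample (tab-delimited);
# the script divides these by 50,000,000 before normalizing the count columns.
example.lib.sizes <- data.frame(V1 = 45000000, V2 = 52000000)

# split_coverage() would group the coverage rows into one data frame per peak.

The table written to output_file is headerless and tab-separated, with one row per collapsed region: chr, start, stop, and a "<peak>_regionN" label.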