view find_qPCR_regions.R @ 4:72571a30f17b draft

Add test for unique peak names.
author kyost
date Wed, 02 May 2018 13:20:21 -0400
parents fbfe7b785ea7
children 3cd53127a838
line wrap: on
line source

## Command to run tool:
# Rscript --vanilla find_qPCR_regions.R o.coverage.bed f9.coverage.bed lib_sizes.txt cor_cutoff cov_cutoff output_file

# Set up R error handling to go to stderr
options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)})

# Avoid crashing Galaxy with an UTF8 error on German LC settings
loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")

args <- commandArgs(TRUE)

o.coverage <- args[1]
f9.coverage <- args[2]
lib_sizes <- args[3]
corr_cutoff <- as.numeric(args[4])
cov_cutoff <- as.numeric(args[5])
output_file <- args[6]

o.coverage.data <- read.delim(o.coverage, header=FALSE)
f9.coverage.data <- read.delim(f9.coverage, header=FALSE)
lib.sizes.data <- read.delim(lib_sizes, header=FALSE)

m <- ncol(o.coverage.data)

#normalize library sizes to 50 mil reads
lib.sizes.data <- lib.sizes.data/50000000
#normalize peak reads and spanning reads by library size
o.coverage.data[,5:m] <- t(apply(o.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))
f9.coverage.data[,5:m] <- t(apply(f9.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))

#returns list of dataframes containing windows for each peak
split_coverage <- function(a) {
  returnlist <- list()
  temp <- a[1,]
  for (i in 2:nrow(a)) {
    if (temp[1,4] == strsplit(as.character(a[i,4]),"_window")[[1]][1]){
      temp <- rbind(temp, a[i,])
    }else{
      returnlist[[length(returnlist)+1]] <- temp
      temp <- a[i,]
    }
  }
  returnlist[[length(returnlist)+1]] <- temp
  return(returnlist)
}

make_cor_plot <- function(a,b, corr_cut, cov_cut, output_file) { #a=o.coverage b=f9.coverage
  
  asplit <- split_coverage(a)
  bsplit <- split_coverage(b)
  
  combined_regions <- data.frame()

  for (i in 1:length(asplit)){
    n <- nrow(asplit[[i]])-1
    m <- ncol(asplit[[i]])
	
	a_split <- asplit[[i]]
	b_split <- bsplit[[i]]
	
	if (sd(t(a_split[1,5:m])) == 0 || sd(t(b_split[-1,5:m])) == 0 ){
		next
	}

	#calculate correlation between total peak reads and spanning reads in each window
    corb <- data.frame(cor(t(a_split[1,5:m])
                           , t(b_split[-1,5:m])
						   , method = "pearson"))
    
    data <- data.frame(t(corb), row.names = NULL, stringsAsFactors = FALSE)
    
    data <- cbind(rep(c(1:n))
                  , a_split[-1,1:4]
                  , data)
    colnames(data)<-c("position","chr", "start", "stop", "name", "correlation")
    rownames(data)<-NULL
    a_split$average <- rowMeans(a_split[,5:m])
    b_split$average <- rowMeans(b_split[,5:m])
    data2 <- data.frame(cbind(c(1:n), a_split[-1,m+1]), stringsAsFactors=FALSE)
    data3 <- data.frame(cbind(c(1:n), b_split[-1,m+1]), stringsAsFactors=FALSE)
    colnames(data2)<-c("position","averageReadDepth")
    colnames(data3)<-c("position","averageReadDepth")
    
	#keep windows for which correlation with total peak reads is above threshold and 
	#number of spanning reads is above cutoff
    regions <- data[which(data$correlation > corr_cut & data3$averageReadDepth > cov_cut),]
    regions <- regions[,2:4]
	
	#collapse windows to non-overlappin regions
    if (nrow(regions)>0){
                regions$V4 <- paste(as.character(a_split[1,4]),"_region1",sep="")
    collapsed_regions <- regions[1,]
    nregion <- 1
    if (nrow(regions)>=2){
		for (j in 2:nrow(regions)) {
			if (as.numeric(regions[j,2]) <= (as.numeric(regions[j-1,3]))) {
				collapsed_regions[nrow(collapsed_regions),3] <- regions[j,3]
			}else{
				collapsed_regions <- rbind(collapsed_regions, regions[j,])
				nregion <- nregion + 1
				collapsed_regions[nrow(collapsed_regions),4] <- paste(as.character(a_split[1,4])
																	,"_region"
																	, as.character(nregion)
																	,sep="")	
			}
		}
	}	
	rownames(collapsed_regions) <- NULL
		
	combined_regions <- rbind(combined_regions, collapsed_regions)
}
    

 }
  
  write.table(combined_regions
              , output_file
              , sep = "\t"
              , col.names = FALSE
              , row.names = FALSE
              , quote = FALSE)	
  print("Successfully found optimal ATAC-qPCR regions.")		  
}

make_cor_plot(o.coverage.data, f9.coverage.data, corr_cutoff, cov_cutoff, output_file)