comparison find_qPCR_regions.R @ 0:fd3ea97a96bc draft

planemo upload commit 103cb51ec368438642504c3f98b76c363db478bb
author kyost
date Sat, 28 Apr 2018 15:07:26 -0400
parents
children fbfe7b785ea7
comparison
equal deleted inserted replaced
-1:000000000000 0:fd3ea97a96bc
1 ## Command to run tool:
2 # Rscript --vanilla find_qPCR_regions.R o.coverage.bed f9.coverage.bed lib_sizes.txt cor_cutoff cov_cutoff output_file
3
4 # Set up R error handling to go to stderr
5 options(show.error.messages=F, error=function(){cat(geterrmessage(),file=stderr());q("no",1,F)})
6
7 # Avoid crashing Galaxy with an UTF8 error on German LC settings
8 loc <- Sys.setlocale("LC_MESSAGES", "en_US.UTF-8")
9
10 args <- commandArgs(TRUE)
11
12 o.coverage <- args[1]
13 f9.coverage <- args[2]
14 lib_sizes <- args[3]
15 corr_cutoff <- as.numeric(args[4])
16 cov_cutoff <- as.numeric(args[5])
17 output_file <- args[6]
18
19 o.coverage.data <- read.delim(o.coverage, header=FALSE)
20 f9.coverage.data <- read.delim(f9.coverage, header=FALSE)
21 lib.sizes.data <- read.delim(lib_sizes, header=FALSE)
22
23 m <- ncol(o.coverage.data)
24
25 #normalize library sizes to 50 mil reads
26 lib.sizes.data <- lib.sizes.data/50000000
27 #normalize peak reads and spanning reads by library size
28 o.coverage.data[,5:m] <- t(apply(o.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))
29 f9.coverage.data[,5:m] <- t(apply(f9.coverage.data[,5:m], 1, function(x) x/t(lib.sizes.data)))
30
31 #returns list of dataframes containing windows for each peak
32 split_coverage <- function(a) {
33 returnlist <- list()
34 temp <- a[1,]
35 for (i in 2:nrow(a)) {
36 if (temp[1,4] == strsplit(as.character(a[i,4]),"_window")[[1]][1]){
37 temp <- rbind(temp, a[i,])
38 }else{
39 returnlist[[length(returnlist)+1]] <- temp
40 temp <- a[i,]
41 }
42 }
43 returnlist[[length(returnlist)+1]] <- temp
44 return(returnlist)
45 }
46
47 make_cor_plot <- function(a,b, corr_cut, cov_cut, output_file) { #a=o.coverage b=f9.coverage
48
49 asplit <- split_coverage(a)
50 bsplit <- split_coverage(b)
51
52 combined_regions <- data.frame()
53
54 for (i in 1:length(asplit)){
55 n <- nrow(asplit[[i]])-1
56 m <- ncol(asplit[[i]])
57
58 a_split <- asplit[[i]]
59 b_split <- bsplit[[i]]
60
61 #calculate correlation between total peak reads and spanning reads in each window
62 corb <- data.frame(cor(t(a_split[1,5:m])
63 , t(b_split[-1,5:m])
64 , method = "pearson"))
65
66 data <- data.frame(t(corb), row.names = NULL, stringsAsFactors = FALSE)
67
68 data <- cbind(rep(c(1:n))
69 , a_split[-1,1:4]
70 , data)
71 colnames(data)<-c("position","chr", "start", "stop", "name", "correlation")
72 rownames(data)<-NULL
73 a_split$average <- rowMeans(a_split[,5:m])
74 b_split$average <- rowMeans(b_split[,5:m])
75 data2 <- data.frame(cbind(c(1:n), a_split[-1,m+1]), stringsAsFactors=FALSE)
76 data3 <- data.frame(cbind(c(1:n), b_split[-1,m+1]), stringsAsFactors=FALSE)
77 colnames(data2)<-c("position","averageReadDepth")
78 colnames(data3)<-c("position","averageReadDepth")
79
80 #keep windows for which correlation with total peak reads is above threshold and
81 #number of spanning reads is above cutoff
82 regions <- data[which(data$correlation > corr_cut & data3$averageReadDepth > cov_cut),]
83 regions <- regions[,2:4]
84
85 #collapse windows to non-overlappin regions
86 if (nrow(regions)>0){
87 regions$V4 <- paste(as.character(a_split[1,4]),"_region1",sep="")
88 collapsed_regions <- regions[1,]
89 nregion <- 1
90 if (nrow(regions)>=2){
91 for (j in 2:nrow(regions)) {
92 if (as.numeric(regions[j,2]) <= (as.numeric(regions[j-1,3]))) {
93 collapsed_regions[nrow(collapsed_regions),3] <- regions[j,3]
94 }else{
95 collapsed_regions <- rbind(collapsed_regions, regions[j,])
96 nregion <- nregion + 1
97 collapsed_regions[nrow(collapsed_regions),4] <- paste(as.character(a_split[1,4])
98 ,"_region"
99 , as.character(nregion)
100 ,sep="")
101 }
102 }
103 }
104 rownames(collapsed_regions) <- NULL
105
106 combined_regions <- rbind(combined_regions, collapsed_regions)
107 }
108
109
110 }
111
112 write.table(combined_regions
113 , output_file
114 , sep = "\t"
115 , col.names = FALSE
116 , row.names = FALSE
117 , quote = FALSE)
118 print("Successfully found optimal ATAC-qPCR regions.")
119 }
120
121 make_cor_plot(o.coverage.data, f9.coverage.data, corr_cutoff, cov_cutoff, output_file)