annotate batch-consistency-analysis.r @ 5:48767bec000d draft

Uploaded
author modencode-dcc
date Thu, 17 Jan 2013 15:45:48 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
5
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
1 ##############################################################################
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
2
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
3 # Modified 06/29/12: Kar Ming Chu
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
4 # Modified to work with Galaxy
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
5
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
6 # Usage: Rscript batch-consistency-analysis.r peakfile1 peakfile2 half.width overlap.ratio is.broadpeak sig.value gtable r.output overlap.output npeaks.output em.sav.output uri.sav.output
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
7
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
8 # Changes:
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
9 # - Appended parameter for input gnome table called gtable
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
10 # - Appended parameter for specifying Rout output file name (required by Galaxy)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
11 # - Appended parameter for specifying Peak overlap output file name (required by Galaxy)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
12 # - Appended parameter for specifying Npeak above IDR output file name (required by Galaxy)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
13 # - Removed parameter outfile.prefix since main output files are replaced with strict naming
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
14 # - Appended parameter for specifying em.sav output file (for use with batch-consistency-plot.r)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
15 # - Appended parameter for specifying uri.sav output file (for use with batch-consistency-plot.r)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
16
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
17 ##############################################################################
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
18
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
19 # modified 3-29-10: Qunhua Li
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
20 # add 2 columns in the output of "-overlapped-peaks.txt": local.idr and IDR
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
21
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
22 # 01-20-2010 Qunhua Li
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
23 #
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
24 # This program performs consistency analysis for a pair of peak calling outputs
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
25 # It takes narrowPeak or broadPeak formats.
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
26 #
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
27 # usage: Rscript batch-consistency-analysis2.r peakfile1 peakfile2 half.width outfile.prefix overlap.ratio is.broadpeak sig.value
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
28 #
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
29 # peakfile1 and peakfile2 : the output from peak callers in narrowPeak or broadPeak format
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
30 # half.width: -1 if using the reported peak width,
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
31 # a numerical value to truncate the peaks to
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
32 # outfile.prefix: prefix of output file
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
33 # overlap.ratio: a value between 0 and 1. It controls how much overlaps two peaks need to have to be called as calling the same region. It is the ratio of overlap / short peak of the two. When setting at 0, it means as long as overlapped width >=1bp, two peaks are deemed as calling the same region.
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
34 # is.broadpeak: a logical value. If broadpeak is used, set as T; if narrowpeak is used, set as F
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
35 # sig.value: type of significant values, "q.value", "p.value" or "signal.value" (default, i.e. fold of enrichment)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
36
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
37 args <- commandArgs(trailingOnly=T)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
38
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
39 # consistency between peakfile1 and peakfile2
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
40 #input1.dir <- args[1]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
41 #input2.dir <- args[2] # directories of the two input files
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
42 peakfile1 <- args[1]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
43 peakfile2 <- args[2]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
44
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
45 if(as.numeric(args[3])==-1){ # enter -1 when using the reported length
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
46 half.width <- NULL
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
47 }else{
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
48 half.width <- as.numeric(args[3])
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
49 }
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
50
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
51 overlap.ratio <- args[4]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
52
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
53 if(args[5] == "T"){
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
54 is.broadpeak <- T
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
55 }else{
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
56 is.broadpeak <- F
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
57 }
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
58
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
59 sig.value <- args[6]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
60
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
61
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
62 #dir1 <- "~/ENCODE/anshul/data/"
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
63 #dir2 <- dir1
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
64 #peakfile1 <- "../data/SPP.YaleRep1Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
65 #peakfile2 <- "../data/SPP.YaleRep3Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
66 #half.width <- NULL
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
67 #overlap.ratio <- 0.1
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
68 #sig.value <- "signal.value"
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
69
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
70
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
71 source("/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/functions-all-clayton-12-13.r")
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
72
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
73 # read the length of the chromosomes, which will be used to concatenate chr's
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
74 # chr.file <- "genome_table.txt"
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
75 # args[7] is the gtable
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
76 chr.file <- args[7]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
77
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
78 chr.size <- read.table(chr.file)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
79
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
80 # setting output files
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
81 r.output <- args[8]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
82 overlap.output <- args[9]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
83 npeaks.output <- args[10]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
84 em.sav.output <- args[11]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
85 uri.sav.output <- args[12]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
86
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
87 # sink(paste(output.prefix, "-Rout.txt", sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
88 sink(r.output)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
89
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
90 ############# process the data
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
91 cat("is.broadpeak", is.broadpeak, "\n")
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
92 # process data, summit: the representation of the location of summit
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
93 rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
94 rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
95
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
96 cat(paste("read", peakfile1, ": ", nrow(rep1$data.ori), "peaks\n", nrow(rep1$data.cleaned), "peaks are left after cleaning\n", peakfile2, ": ", nrow(rep2$data.ori), "peaks\n", nrow(rep2$data.cleaned), " peaks are left after cleaning"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
97
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
98 if(args[3]==-1){
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
99 cat(paste("half.width=", "reported", "\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
100 }else{
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
101 cat(paste("half.width=", half.width, "\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
102 }
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
103 cat(paste("significant measure=", sig.value, "\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
104
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
105 # compute correspondence profile (URI)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
106 uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
107
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
108 #uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
109
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
110 cat(paste("URI is done\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
111
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
112 # save output
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
113 # save(uri.output, file=paste(output.prefix, "-uri.sav", sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
114 save(uri.output, file=uri.sav.output)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
115 cat(paste("URI is saved at: ", uri.sav.output))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
116
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
117
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
118 # EM procedure for inference
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
119 em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
120
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
121 #em.output <- fit.2copula.em(uri.output$data12.enrich, fix.rho2=T, "gaussian")
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
122
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
123 cat(paste("EM is done\n\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
124
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
125 save(em.output, file=em.sav.output)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
126 cat(paste("EM is saved at: ", em.sav.output))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
127
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
128
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
129 # write em output into a file
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
130
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
131 cat(paste("EM estimation for the following files\n", peakfile1, "\n", peakfile2, "\n", sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
132
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
133 print(em.output$em.fit$para)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
134
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
135 # add on 3-29-10
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
136 # output both local idr and IDR
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
137 idr.local <- 1-em.output$em.fit$e.z
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
138 IDR <- c()
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
139 o <- order(idr.local)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
140 IDR[o] <- cumsum(idr.local[o])/c(1:length(o))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
141
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
142
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
143 write.out.data <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
144 start1=em.output$data.pruned$sample1[, "start.ori"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
145 stop1=em.output$data.pruned$sample1[, "stop.ori"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
146 sig.value1=em.output$data.pruned$sample1[, "sig.value"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
147 chr2=em.output$data.pruned$sample2[, "chr"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
148 start2=em.output$data.pruned$sample2[, "start.ori"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
149 stop2=em.output$data.pruned$sample2[, "stop.ori"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
150 sig.value2=em.output$data.pruned$sample2[, "sig.value"],
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
151 idr.local=1-em.output$em.fit$e.z, IDR=IDR)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
152
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
153 # write.table(write.out.data, file=paste(output.prefix, "-overlapped-peaks.txt", sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
154 write.table(write.out.data, file=overlap.output)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
155 cat(paste("Write overlapped peaks and local idr to: ", overlap.output, sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
156
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
157 # number of peaks passing IDR range (0.01-0.25)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
158 IDR.cutoff <- seq(0.01, 0.25, by=0.01)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
159 idr.o <- order(write.out.data$idr.local)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
160 idr.ordered <- write.out.data$idr.local[idr.o]
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
161 IDR.sum <- cumsum(idr.ordered)/c(1:length(idr.ordered))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
162
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
163 IDR.count <- c()
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
164 n.cutoff <- length(IDR.cutoff)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
165 for(i in 1:n.cutoff){
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
166 IDR.count[i] <- sum(IDR.sum <= IDR.cutoff[i])
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
167 }
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
168
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
169
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
170 # write the number of peaks passing various IDR range into a file
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
171 idr.cut <- data.frame(peakfile1, peakfile2, IDR.cutoff=IDR.cutoff, IDR.count=IDR.count)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
172 write.table(idr.cut, file=npeaks.output, append=T, quote=F, row.names=F, col.names=F)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
173 cat(paste("Write number of peaks above IDR cutoff [0.01, 0.25]: ","npeaks-aboveIDR.txt\n", sep=""))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
174
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
175 mar.mean <- get.mar.mean(em.output$em.fit)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
176
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
177 cat(paste("Marginal mean of two components:\n"))
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
178 print(mar.mean)
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
179
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
180 sink()
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
181
48767bec000d Uploaded
modencode-dcc
parents:
diff changeset
182