diff batch-consistency-analysis.r @ 5:48767bec000d draft

Uploaded
author modencode-dcc
date Thu, 17 Jan 2013 15:45:48 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/batch-consistency-analysis.r	Thu Jan 17 15:45:48 2013 -0500
@@ -0,0 +1,182 @@
+##############################################################################
+
+# Modified 06/29/12: Kar Ming Chu
+# Modified to work with Galaxy
+
+# Usage: 	Rscript batch-consistency-analysis.r peakfile1 peakfile2 half.width overlap.ratio  is.broadpeak sig.value gtable r.output overlap.output npeaks.output em.sav.output uri.sav.output
+
+# Changes:
+# 	- Appended parameter for input gnome table called gtable
+#	- Appended parameter for specifying Rout output file name (required by Galaxy)
+#	- Appended parameter for specifying Peak overlap output file name (required by Galaxy)
+#	- Appended parameter for specifying Npeak above IDR output file name (required by Galaxy)
+#	- Removed parameter outfile.prefix since main output files are replaced with strict naming
+#	- Appended parameter for specifying em.sav output file (for use with batch-consistency-plot.r)
+#	- Appended parameter for specifying uri.sav output file (for use with batch-consistency-plot.r)
+
+##############################################################################
+
+# modified 3-29-10: Qunhua Li
+# add 2 columns in the output of "-overlapped-peaks.txt": local.idr and IDR
+
+# 01-20-2010 Qunhua Li
+#
+# This program performs consistency analysis for a pair of peak calling outputs
+# It takes narrowPeak or broadPeak formats.
+# 
+# usage: Rscript batch-consistency-analysis2.r peakfile1 peakfile2 half.width outfile.prefix overlap.ratio  is.broadpeak sig.value
+#
+# peakfile1 and peakfile2 : the output from peak callers in narrowPeak or broadPeak format
+# half.width: -1 if using the reported peak width, 
+#             a numerical value to truncate the peaks to
+# outfile.prefix: prefix of output file
+# overlap.ratio: a value between 0 and 1. It controls how much overlaps two peaks need to have to be called as calling the same region. It is the ratio of overlap / short peak of the two. When setting at 0, it means as long as overlapped width >=1bp, two peaks are deemed as calling the same region.
+# is.broadpeak: a logical value. If broadpeak is used, set as T; if narrowpeak is used, set as F
+# sig.value: type of significant values, "q.value", "p.value" or "signal.value" (default, i.e. fold of enrichment)
+
+args <- commandArgs(trailingOnly=T)
+
+# consistency between peakfile1 and peakfile2
+#input1.dir <- args[1]
+#input2.dir <- args[2] # directories of the two input files
+peakfile1 <- args[1]
+peakfile2 <- args[2]
+
+if(as.numeric(args[3])==-1){ # enter -1 when using the reported length 
+  half.width <- NULL
+}else{
+  half.width <- as.numeric(args[3])
+}
+
+overlap.ratio <- args[4]
+
+if(args[5] == "T"){
+  is.broadpeak <- T
+}else{
+  is.broadpeak <- F
+}
+
+sig.value <- args[6]
+
+
+#dir1 <- "~/ENCODE/anshul/data/"
+#dir2 <- dir1
+#peakfile1 <- "../data/SPP.YaleRep1Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
+#peakfile2 <- "../data/SPP.YaleRep3Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak"
+#half.width <- NULL
+#overlap.ratio <- 0.1
+#sig.value <- "signal.value"
+
+
+source("/mnt/galaxyTools/galaxy-central/tools/modENCODE_DCC_tools/idr/functions-all-clayton-12-13.r")
+
+# read the length of the chromosomes, which will be used to concatenate chr's
+# chr.file <- "genome_table.txt"
+# args[7] is the gtable
+chr.file <- args[7]
+
+chr.size <- read.table(chr.file)
+
+# setting output files
+r.output <- args[8]
+overlap.output <- args[9]
+npeaks.output <- args[10]
+em.sav.output <- args[11]
+uri.sav.output <- args[12]
+
+# sink(paste(output.prefix, "-Rout.txt", sep=""))
+sink(r.output)
+
+############# process the data
+cat("is.broadpeak", is.broadpeak, "\n")
+# process data, summit: the representation of the location of summit
+rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
+rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak)
+
+cat(paste("read", peakfile1, ": ", nrow(rep1$data.ori), "peaks\n", nrow(rep1$data.cleaned), "peaks are left after cleaning\n", peakfile2, ": ", nrow(rep2$data.ori), "peaks\n", nrow(rep2$data.cleaned), " peaks are left after cleaning"))
+
+if(args[3]==-1){
+  cat(paste("half.width=", "reported", "\n"))
+}else{
+  cat(paste("half.width=", half.width, "\n"))
+}  
+cat(paste("significant measure=", sig.value, "\n"))
+
+# compute correspondence profile (URI)
+uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio)
+
+#uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned)
+
+cat(paste("URI is done\n"))
+
+# save output
+# save(uri.output, file=paste(output.prefix, "-uri.sav", sep=""))
+save(uri.output, file=uri.sav.output)
+cat(paste("URI is saved at: ", uri.sav.output))
+
+
+# EM procedure for inference
+em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T)
+
+#em.output <- fit.2copula.em(uri.output$data12.enrich, fix.rho2=T, "gaussian")
+
+cat(paste("EM is done\n\n"))
+
+save(em.output, file=em.sav.output)
+cat(paste("EM is saved at: ", em.sav.output))
+
+
+# write em output into a file
+
+cat(paste("EM estimation for the following files\n", peakfile1, "\n", peakfile2, "\n", sep=""))
+
+print(em.output$em.fit$para)
+
+# add on 3-29-10
+# output both local idr and IDR
+idr.local <- 1-em.output$em.fit$e.z
+IDR <- c()
+o <- order(idr.local)
+IDR[o] <- cumsum(idr.local[o])/c(1:length(o))
+
+
+write.out.data <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"],
+                    start1=em.output$data.pruned$sample1[, "start.ori"],
+                    stop1=em.output$data.pruned$sample1[, "stop.ori"],
+                    sig.value1=em.output$data.pruned$sample1[, "sig.value"],   
+                    chr2=em.output$data.pruned$sample2[, "chr"],
+                    start2=em.output$data.pruned$sample2[, "start.ori"],
+                    stop2=em.output$data.pruned$sample2[, "stop.ori"],
+                    sig.value2=em.output$data.pruned$sample2[, "sig.value"],
+                    idr.local=1-em.output$em.fit$e.z, IDR=IDR)
+
+# write.table(write.out.data, file=paste(output.prefix, "-overlapped-peaks.txt", sep=""))
+write.table(write.out.data, file=overlap.output)
+cat(paste("Write overlapped peaks and local idr to: ", overlap.output, sep=""))
+
+# number of peaks passing IDR range (0.01-0.25)
+IDR.cutoff <- seq(0.01, 0.25, by=0.01)
+idr.o <- order(write.out.data$idr.local)
+idr.ordered <- write.out.data$idr.local[idr.o]
+IDR.sum <- cumsum(idr.ordered)/c(1:length(idr.ordered))
+
+IDR.count <- c()
+n.cutoff <- length(IDR.cutoff)
+for(i in 1:n.cutoff){
+  IDR.count[i] <- sum(IDR.sum <= IDR.cutoff[i])
+}
+
+
+# write the number of peaks passing various IDR range into a file
+idr.cut <- data.frame(peakfile1, peakfile2, IDR.cutoff=IDR.cutoff, IDR.count=IDR.count)
+write.table(idr.cut, file=npeaks.output, append=T, quote=F, row.names=F, col.names=F)
+cat(paste("Write number of peaks above IDR cutoff [0.01, 0.25]: ","npeaks-aboveIDR.txt\n", sep=""))
+
+mar.mean <- get.mar.mean(em.output$em.fit)
+
+cat(paste("Marginal mean of two components:\n"))
+print(mar.mean)
+
+sink()
+
+