Mercurial > repos > modencode-dcc > idr_package
comparison batch-consistency-analysis.r @ 20:6f6a9fbe264e draft default tip
Uploaded
| author | modencode-dcc | 
|---|---|
| date | Mon, 21 Jan 2013 13:36:24 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 19:11269f3b68a0 | 20:6f6a9fbe264e | 
|---|---|
| 1 ############################################################################## | |
| 2 | |
| 3 # Modified 06/29/12: Kar Ming Chu | |
| 4 # Modified to work with Galaxy | |
| 5 | |
| 6 # Usage: Rscript batch-consistency-analysis.r peakfile1 peakfile2 half.width overlap.ratio is.broadpeak sig.value gtable r.output overlap.output npeaks.output em.sav.output uri.sav.output | |
| 7 | |
| 8 # Changes: | |
| 9 # - Appended parameter for input gnome table called gtable | |
| 10 # - Appended parameter for specifying Rout output file name (required by Galaxy) | |
| 11 # - Appended parameter for specifying Peak overlap output file name (required by Galaxy) | |
| 12 # - Appended parameter for specifying Npeak above IDR output file name (required by Galaxy) | |
| 13 # - Removed parameter outfile.prefix since main output files are replaced with strict naming | |
| 14 # - Appended parameter for specifying em.sav output file (for use with batch-consistency-plot.r) | |
| 15 # - Appended parameter for specifying uri.sav output file (for use with batch-consistency-plot.r) | |
| 16 | |
| 17 ############################################################################## | |
| 18 | |
| 19 # modified 3-29-10: Qunhua Li | |
| 20 # add 2 columns in the output of "-overlapped-peaks.txt": local.idr and IDR | |
| 21 | |
| 22 # 01-20-2010 Qunhua Li | |
| 23 # | |
| 24 # This program performs consistency analysis for a pair of peak calling outputs | |
| 25 # It takes narrowPeak or broadPeak formats. | |
| 26 # | |
| 27 # usage: Rscript batch-consistency-analysis2.r peakfile1 peakfile2 half.width outfile.prefix overlap.ratio is.broadpeak sig.value | |
| 28 # | |
| 29 # peakfile1 and peakfile2 : the output from peak callers in narrowPeak or broadPeak format | |
| 30 # half.width: -1 if using the reported peak width, | |
| 31 # a numerical value to truncate the peaks to | |
| 32 # outfile.prefix: prefix of output file | |
| 33 # overlap.ratio: a value between 0 and 1. It controls how much overlaps two peaks need to have to be called as calling the same region. It is the ratio of overlap / short peak of the two. When setting at 0, it means as long as overlapped width >=1bp, two peaks are deemed as calling the same region. | |
| 34 # is.broadpeak: a logical value. If broadpeak is used, set as T; if narrowpeak is used, set as F | |
| 35 # sig.value: type of significant values, "q.value", "p.value" or "signal.value" (default, i.e. fold of enrichment) | |
| 36 | |
| 37 args <- commandArgs(trailingOnly=T) | |
| 38 | |
| 39 # consistency between peakfile1 and peakfile2 | |
| 40 #input1.dir <- args[1] | |
| 41 #input2.dir <- args[2] # directories of the two input files | |
| 42 script_path <- args[1] | |
| 43 peakfile1 <- args[2] | |
| 44 peakfile2 <- args[3] | |
| 45 | |
| 46 if(as.numeric(args[4])==-1){ # enter -1 when using the reported length | |
| 47 half.width <- NULL | |
| 48 }else{ | |
| 49 half.width <- as.numeric(args[4]) | |
| 50 } | |
| 51 | |
| 52 overlap.ratio <- args[5] | |
| 53 | |
| 54 if(args[6] == "T"){ | |
| 55 is.broadpeak <- T | |
| 56 }else{ | |
| 57 is.broadpeak <- F | |
| 58 } | |
| 59 | |
| 60 sig.value <- args[7] | |
| 61 | |
| 62 #dir1 <- "~/ENCODE/anshul/data/" | |
| 63 #dir2 <- dir1 | |
| 64 #peakfile1 <- "../data/SPP.YaleRep1Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak" | |
| 65 #peakfile2 <- "../data/SPP.YaleRep3Gm12878Cfos.VS.Gm12878Input.PointPeak.narrowPeak" | |
| 66 #half.width <- NULL | |
| 67 #overlap.ratio <- 0.1 | |
| 68 #sig.value <- "signal.value" | |
| 69 | |
| 70 | |
| 71 source(paste(script_path, "/functions-all-clayton-12-13.r", sep="")) | |
| 72 | |
| 73 # read the length of the chromosomes, which will be used to concatenate chr's | |
| 74 # chr.file <- "genome_table.txt" | |
| 75 # args[8] is the gtable | |
| 76 chr.file <- args[8] | |
| 77 | |
| 78 chr.size <- read.table(paste(script_path, "/genome_tables/", chr.file, sep="")) | |
| 79 | |
| 80 # setting output files | |
| 81 r.output <- args[9] | |
| 82 overlap.output <- args[10] | |
| 83 npeaks.output <- args[11] | |
| 84 em.sav.output <- args[12] | |
| 85 uri.sav.output <- args[13] | |
| 86 | |
| 87 # sink(paste(output.prefix, "-Rout.txt", sep="")) | |
| 88 sink(r.output) | |
| 89 | |
| 90 ############# process the data | |
| 91 cat("is.broadpeak", is.broadpeak, "\n") | |
| 92 # process data, summit: the representation of the location of summit | |
| 93 rep1 <- process.narrowpeak(paste(peakfile1, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak) | |
| 94 rep2 <- process.narrowpeak(paste(peakfile2, sep=""), chr.size, half.width=half.width, summit="offset", broadpeak=is.broadpeak) | |
| 95 | |
| 96 cat(paste("read", peakfile1, ": ", nrow(rep1$data.ori), "peaks\n", nrow(rep1$data.cleaned), "peaks are left after cleaning\n", peakfile2, ": ", nrow(rep2$data.ori), "peaks\n", nrow(rep2$data.cleaned), " peaks are left after cleaning")) | |
| 97 | |
| 98 if(args[4]==-1){ | |
| 99 cat(paste("half.width=", "reported", "\n")) | |
| 100 }else{ | |
| 101 cat(paste("half.width=", half.width, "\n")) | |
| 102 } | |
| 103 cat(paste("significant measure=", sig.value, "\n")) | |
| 104 | |
| 105 # compute correspondence profile (URI) | |
| 106 uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned, sig.value1=sig.value, sig.value2=sig.value, overlap.ratio=overlap.ratio) | |
| 107 | |
| 108 #uri.output <- compute.pair.uri(rep1$data.cleaned, rep2$data.cleaned) | |
| 109 | |
| 110 cat(paste("URI is done\n")) | |
| 111 | |
| 112 # save output | |
| 113 # save(uri.output, file=paste(output.prefix, "-uri.sav", sep="")) | |
| 114 save(uri.output, file=uri.sav.output) | |
| 115 cat(paste("URI is saved at: ", uri.sav.output)) | |
| 116 | |
| 117 | |
| 118 # EM procedure for inference | |
| 119 em.output <- fit.em(uri.output$data12.enrich, fix.rho2=T) | |
| 120 | |
| 121 #em.output <- fit.2copula.em(uri.output$data12.enrich, fix.rho2=T, "gaussian") | |
| 122 | |
| 123 cat(paste("EM is done\n\n")) | |
| 124 | |
| 125 save(em.output, file=em.sav.output) | |
| 126 cat(paste("EM is saved at: ", em.sav.output)) | |
| 127 | |
| 128 | |
| 129 # write em output into a file | |
| 130 | |
| 131 cat(paste("EM estimation for the following files\n", peakfile1, "\n", peakfile2, "\n", sep="")) | |
| 132 | |
| 133 print(em.output$em.fit$para) | |
| 134 | |
| 135 # add on 3-29-10 | |
| 136 # output both local idr and IDR | |
| 137 idr.local <- 1-em.output$em.fit$e.z | |
| 138 IDR <- c() | |
| 139 o <- order(idr.local) | |
| 140 IDR[o] <- cumsum(idr.local[o])/c(1:length(o)) | |
| 141 | |
| 142 | |
| 143 write.out.data <- data.frame(chr1=em.output$data.pruned$sample1[, "chr"], | |
| 144 start1=em.output$data.pruned$sample1[, "start.ori"], | |
| 145 stop1=em.output$data.pruned$sample1[, "stop.ori"], | |
| 146 sig.value1=em.output$data.pruned$sample1[, "sig.value"], | |
| 147 chr2=em.output$data.pruned$sample2[, "chr"], | |
| 148 start2=em.output$data.pruned$sample2[, "start.ori"], | |
| 149 stop2=em.output$data.pruned$sample2[, "stop.ori"], | |
| 150 sig.value2=em.output$data.pruned$sample2[, "sig.value"], | |
| 151 idr.local=1-em.output$em.fit$e.z, IDR=IDR) | |
| 152 | |
| 153 # write.table(write.out.data, file=paste(output.prefix, "-overlapped-peaks.txt", sep="")) | |
| 154 write.table(write.out.data, file=overlap.output) | |
| 155 cat(paste("Write overlapped peaks and local idr to: ", overlap.output, sep="")) | |
| 156 | |
| 157 # number of peaks passing IDR range (0.01-0.25) | |
| 158 IDR.cutoff <- seq(0.01, 0.25, by=0.01) | |
| 159 idr.o <- order(write.out.data$idr.local) | |
| 160 idr.ordered <- write.out.data$idr.local[idr.o] | |
| 161 IDR.sum <- cumsum(idr.ordered)/c(1:length(idr.ordered)) | |
| 162 | |
| 163 IDR.count <- c() | |
| 164 n.cutoff <- length(IDR.cutoff) | |
| 165 for(i in 1:n.cutoff){ | |
| 166 IDR.count[i] <- sum(IDR.sum <= IDR.cutoff[i]) | |
| 167 } | |
| 168 | |
| 169 | |
| 170 # write the number of peaks passing various IDR range into a file | |
| 171 idr.cut <- data.frame(peakfile1, peakfile2, IDR.cutoff=IDR.cutoff, IDR.count=IDR.count) | |
| 172 write.table(idr.cut, file=npeaks.output, append=T, quote=F, row.names=F, col.names=F) | |
| 173 cat(paste("Write number of peaks above IDR cutoff [0.01, 0.25]: ","npeaks-aboveIDR.txt\n", sep="")) | |
| 174 | |
| 175 mar.mean <- get.mar.mean(em.output$em.fit) | |
| 176 | |
| 177 cat(paste("Marginal mean of two components:\n")) | |
| 178 print(mar.mean) | |
| 179 | |
| 180 sink() | |
| 181 | |
| 182 | 
