Mercurial > repos > modencode-dcc > idr_package
changeset 9:c8f47481b704 draft
Uploaded
| author | modencode-dcc | 
|---|---|
| date | Thu, 17 Jan 2013 15:59:06 -0500 | 
| parents | 6a4297fa2df1 | 
| children | e9eb89c2e71d | 
| files | functions-all-clayton-12-13.r | 
| diffstat | 1 files changed, 3099 insertions(+), 0 deletions(-) [+] | 
line wrap: on
 line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/functions-all-clayton-12-13.r Thu Jan 17 15:59:06 2013 -0500 @@ -0,0 +1,3099 @@ +# revised on 2-20-10 +# - fix error in pass.structure: reverse rank.combined, so that big sig.value +# are ranked with small numbers (1, 2, ...) +# - fix error on get.ez.tt.all: get ez.cutoff from sorted e.z + +# +# modified EM procedure to compute empirical CDF more precisely - 09/2009 + + + +# this file contains the functions for +# 1. computing the correspondence profile (upper rank intersection and derivatives) +# 2. inference of copula mixture model +# +# It also has functions for +# 1. reading peak caller results +# 2. processing and matching called peaks +# 3. plotting results + + +################ read peak caller results + +# process narrow peak format +# some peak callers may not report q-values, p-values or fold of enrichment +# need further process before comparison +# +# stop.exclusive: Is the basepair of peak.list$stop exclusive? In narrowpeak and broadpeak format they are exclusive. +# If it is exclusive, we need subtract peak.list$stop by 1 to avoid the same basepair being both a start and a stop of two +# adjacent peaks, which creates trouble for finding correct intersect +process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){ + + + aa <- read.table(narrow.file) + + if(broadpeak){ + bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9) + }else{ + bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10) + } + + if(summit=="summit"){ + bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes + } + + bb <- concatenate.chr(bb.ori, chr.size) + + #bb <- bb.ori + + # remove the peaks that has the same start and stop value + bb <- bb[bb$start != bb$stop,] + + if(stop.exclusive==T){ + bb$stop <- bb$stop-1 + } + + if(!is.null(half.width)){ + bb$start.ori <- bb$start + bb$stop.ori <- bb$stop + + # if peak is narrower than the specified window, stay with its width + # otherwise chop wider peaks to specified width + width <- bb$stop-bb$start +1 + is.wider <- width > 2*half.width + + if(summit=="offset" | summit=="summit"){ # if summit is offset from start + bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width + bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width + } else { + if(summit=="unknown"){ + bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width + bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width + } + } + } + + bb <- clean.data(bb) + invisible(list(data.ori=bb.ori, data.cleaned=bb)) +} + +# clean data +# and concatenate chromosomes if needed +clean.data <- function(adata){ + + # remove the peaks that has the same start and stop value + adata <- adata[adata$start != adata$stop,] + + # if some stops and starts are the same, need fix them + stop.in.start <- is.element(adata$stop, adata$start) + n.fix <- sum(stop.in.start) + if(n.fix >0){ + print(paste("Fix", n.fix, "stops\n")) + adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1 + } + + return(adata) +} + +# concatenate peaks +# peaks: the dataframe to have all the peaks +# chr.file: the file to keep the length of each chromosome +# chr files should come from the species that the data is from +#concatenate.chr <- function(peaks, chr.size){ + + # chr.size <- read.table(chr.file) +# chr.o <- order(chr.size[,1]) +# chr.size <- chr.size[chr.o,] +# +# chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) +# chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) +# +# for(i in 1:nrow(chr.size)){ +# is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i]) +# if(sum(is.in)>0){ +# peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i] +# peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i] +# } +# } +# +# invisible(peaks) +#} + + + + +# concatenate peaks +# peaks: the dataframe to have all the peaks +# chr.file: the file to keep the length of each chromosome +# chr files should come from the species that the data is from +concatenate.chr <- function(peaks, chr.size){ + + # chr.size <- read.table(chr.file) + chr.o <- order(chr.size[,1]) + chr.size <- chr.size[chr.o,] + + chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) + chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) + + peaks$start.ori <- peaks$start + peaks$stop.ori <- peaks$stop + + for(i in 1:nrow(chr.size)){ + is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i]) + if(sum(is.in)>0){ + peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i] + peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i] + } + } + + invisible(peaks) +} + + +deconcatenate.chr <- function(peaks, chr.size){ + + chr.o <- order(chr.size[,1]) + chr.size <- chr.size[chr.o,] + + chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) + chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) + + peaks$chr <- rep(NA, nrow(peaks)) + + for(i in 1:(nrow(chr.size.cum)-1)){ + is.in <- peaks$start > chr.size.cum[i,2] & peaks$start <= chr.size.cum[i+1, 2] + if(sum(is.in)>0){ + peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2] + peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1 + peaks[is.in,]$chr <- chr.size[i,1] + } + } + + if(i == nrow(chr.size.cum)){ + is.in <- peaks$start > chr.size.cum[i, 2] + if(sum(is.in)>0){ + peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2] + peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1 + peaks[is.in,]$chr <- chr.size[i,1] + } + } + + invisible(peaks) +} + +################ preprocessing peak calling output + + +# +# read two calling results and sort by peak starting locations, +# then find overlap between peaks +# INPUT: +# rep1: the 1st replicate +# rep2: the 2nd replicate +# OUTPUT: +# id1, id2: the labels for the identified peaks on the replicates +find.overlap <- function(rep1, rep2){ + + o1 <- order(rep1$start) + rep1 <- rep1[o1,] + + o2 <- order(rep2$start) + rep2 <- rep2[o2,] + + n1 <- length(o1) + n2 <- length(o2) + + # assign common ID to peaks + id1 <- rep(0, n1) # ID assigned on rep1 + id2 <- rep(0, n2) # ID assigned on rep2 + id <- 1 # keep track common id's + + # check if two replicates overlap with each other + i <- 1 + j <- 1 + + while(i <= n1|| j <= n2){ + + # && (id1[n1] ==0 || id2[n2] ==0) + + # if one list runs out + if(i > n1 && j < n2){ + + j <- j+1 + id2[j] <- id + id <- id +1 + next + } else{ + if(j > n2 && i < n1){ + i <- i+1 + id1[i] <- id + id <- id +1 + next + } else { + if(i >= n1 && j >=n2) + break + } + } + + # if not overlap + + if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){ + + # at the start of loop, when both are not assigned an ID + # the one locates in front is assigned first + if(id1[i] ==0 && id2[j]==0){ + if(rep1$stop[i] < rep2$stop[j]){ + id1[i] <- id + } else { + id2[j] <- id + } + } else { # in the middle of the loop, when one is already assigned + # The one that has not assigned gets assigned + # if(id1[i] ==0){ # id1[i] is not assigned + # id1[i] <- id + # } else { # id2[i] is not assigned + # id2[j] <- id + # } + + # order the id according to location + if(rep1$stop[i] <= rep2$stop[j]){ + id1[i] <- max(id2[j], id1[i]) + id2[j] <- id + } else { + if(rep1$stop[i] > rep2$stop[j]){ + id2[j] <- max(id1[i], id2[j]) + id1[i] <- id + } + } + + } + + id <- id +1 + + } else { # if overlap + + if(id1[i] == 0 && id2[j] == 0){ # not assign label yet + id1[i] <- id + id2[j] <- id + id <- id +1 + } else { # one peak is already assigned label, the other is 0 + + id1[i] <- max(id1[i], id2[j]) # this is a way to copy the label of the assigned peak without knowing which one is already assigned + id2[j] <- id1[i] # syncronize the labels + } + + } + + if(rep1$stop[i] < rep2$stop[j]){ + i <- i+1 + } else { + j <- j+1 + } + + } + + invisible(list(id1=id1, id2=id2)) + +} + +# Impute the missing significant value for the peaks called only on one replicate. +# value +# INPUT: +# rep1, rep2: the two peak calling output +# id1, id2: the IDs assigned by function find.overlap, vectors +# If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2 +# p.value.impute: the significant value to impute for the missing peaks +# OUTPUT: +# rep1, rep2: peaks ordered by the start locations with imputed peaks +# id1, id2: the IDs with imputed peaks +fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){ + +# rep1 <- data.frame(chr=rep1$chr, start=rep1$start, stop=rep1$stop, sig.value=rep1$sig.value) +# rep2 <- data.frame(chr=rep2$chr, start=rep2$start, stop=rep2$stop, sig.value=rep2$sig.value) + + o1 <- order(rep1$start) + rep1 <- rep1[o1,] + + o2 <- order(rep2$start) + rep2 <- rep2[o2,] + + entry.in1.not2 <- !is.element(id1, id2) + entry.in2.not1 <- !is.element(id2, id1) + + if(sum(entry.in1.not2) > 0){ + + temp1 <- rep1[entry.in1.not2, ] + + # impute sig.value + temp1$sig.value <- p.value.impute + temp1$signal.value <- p.value.impute + temp1$p.value <- p.value.impute + temp1$q.value <- p.value.impute + + rep2.filled <- rbind(rep2, temp1) + id2.filled <- c(id2, id1[entry.in1.not2]) + } else { + id2.filled <- id2 + rep2.filled <- rep2 + } + + if(sum(entry.in2.not1) > 0){ + + temp2 <- rep2[entry.in2.not1, ] + + # fill in p.values to 1 + temp2$sig.value <- p.value.impute + temp2$signal.value <- p.value.impute + temp2$p.value <- p.value.impute + temp2$q.value <- p.value.impute + + + # append to the end + rep1.filled <- rbind(rep1, temp2) + + id1.filled <- c(id1, id2[entry.in2.not1]) + } else { + id1.filled <- id1 + rep1.filled <- rep1 + } + + # sort rep1 and rep2 by the same id + o1 <- order(id1.filled) + rep1.ordered <- rep1.filled[o1, ] + + o2 <- order(id2.filled) + rep2.ordered <- rep2.filled[o2, ] + + invisible(list(rep1=rep1.ordered, rep2=rep2.ordered, + id1=id1.filled[o1], id2=id2.filled[o2])) + } + +# Merge peaks with same ID on the same replicates +# (They are generated if two peaks on rep1 map to the same peak on rep2) +# need peak.list have 3 columns: start, stop and sig.value +merge.peaks <- function(peak.list, id){ + + i <- 1 + j <- 1 + dup.index <- c() + sig.value <- c() + start.new <- c() + stop.new <- c() + id.new <- c() + + # original data + chr <- c() + start.ori <- c() + stop.ori <- c() + + signal.value <- c() + p.value <- c() + q.value <- c() + + while(i < length(id)){ + + if(id[i] == id[i+1]){ + dup.index <- c(dup.index, i, i+1) # push on dup.index + } else { + if(length(dup.index)>0){ # pop from dup.index + sig.value[j] <- mean(peak.list$sig.value[unique(dup.index)]) # mean of -log(pvalue) + start.new[j] <- peak.list$start[min(dup.index)] + stop.new[j] <- peak.list$stop[max(dup.index)] + id.new[j] <- id[max(dup.index)] + + signal.value[j] <- mean(peak.list$signal.value[unique(dup.index)]) # mean of -log(pvalue) + p.value[j] <- mean(peak.list$p.value[unique(dup.index)]) # mean of -log(pvalue) + q.value[j] <- mean(peak.list$q.value[unique(dup.index)]) # mean of -log(pvalue) + + chr[j] <- as.character(peak.list$chr[min(dup.index)]) + start.ori[j] <- peak.list$start.ori[min(dup.index)] + stop.ori[j] <- peak.list$stop.ori[max(dup.index)] + + dup.index <- c() + } else { # nothing to pop + sig.value[j] <- peak.list$sig.value[i] + start.new[j] <- peak.list$start[i] + stop.new[j] <- peak.list$stop[i] + id.new[j] <- id[i] + + signal.value[j] <- peak.list$signal.value[i] + p.value[j] <- peak.list$p.value[i] + q.value[j] <- peak.list$q.value[i] + + chr[j] <- as.character(peak.list$chr[i]) + start.ori[j] <- peak.list$start.ori[i] + stop.ori[j] <- peak.list$stop.ori[i] + + } + j <- j+1 + } + i <- i+1 + } + + data.new <- data.frame(id=id.new, sig.value=sig.value, start=start.new, stop=stop.new, signal.value=signal.value, p.value=p.value, q.value=q.value, chr=chr, start.ori=start.ori, stop.ori=stop.ori) + invisible(data.new) +} + + + + + +# a wrap function to fill in missing peaks, merge peaks and impute significant values +# out1 and out2 are two peak calling outputs +pair.peaks <- function(out1, out2, p.value.impute=0){ + + aa <- find.overlap(out1, out2) + bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=0) + + cc1 <- merge.peaks(bb$rep1, bb$id1) + cc2 <- merge.peaks(bb$rep2, bb$id2) + + invisible(list(merge1=cc1, merge2=cc2)) +} + + + +# overlap.ratio is a parameter to define the percentage of overlap +# if overlap.ratio =0, 1 basepair overlap is counted as overlap +# if overlap.ratio between 0 and 1, it is the minimum proportion of +# overlap required to be called as a match +# it is computed as the overlap part/min(peak1.length, peak2.length) +pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){ + + aa <- find.overlap(out1, out2) + bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=0) + + cc1 <- merge.peaks(bb$rep1, bb$id1) + cc2 <- merge.peaks(bb$rep2, bb$id2) + + frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop) + + frag.ratio <- apply(frag12, 1, overlap.middle) + + frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0 + + cc1$frag.ratio <- frag.ratio + cc2$frag.ratio <- frag.ratio + + merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,] + merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,] + + invisible(list(merge1=merge1, merge2=merge2)) +} + +# x[1], x[2] are the start and end of the first fragment +# and x[3] and x[4] are the start and end of the 2nd fragment +# If there are two fragments, we can find the overlap by ordering the +# start and stop of all the ends and find the difference between the middle two +overlap.middle <- function(x){ + + x.o <- x[order(x)] + f1 <- x[2]-x[1] + f2 <- x[4]-x[3] + + f.overlap <- abs(x.o[3]-x.o[2]) + f.overlap.ratio <- f.overlap/min(f1, f2) + + return(f.overlap.ratio) +} + + + +####### +####### compute correspondence profile +####### + +# compute upper rank intersection for one t +# tv: the upper percentile +# x is sorted by the order of paired variable +comp.uri <- function(tv, x){ + n <- length(x) + qt <- quantile(x, prob=1-tv[1]) # tv[1] is t +# sum(x[1:ceiling(n*tv[2])] >= qt)/n/tv[2]- tv[1]*tv[2] #tv[2] is v + sum(x[1:ceiling(n*tv[2])] >= qt)/n + +} + +# compute the correspondence profile +# tt, vv: vector between (0, 1) for percentages +get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){ + + o <- order(x1, x2, decreasing=T) + + # sort x2 by the order of x1 + x2.ordered <- x2[o] + + tv <- cbind(tt, vv) + ntotal <- length(x1) # number of peaks + + uri <- apply(tv, 1, comp.uri, x=x2.ordered) + + # compute the derivative of URI vs t using small bins + uri.binned <- uri[seq(1, length(uri), by=4)] + tt.binned <- tt[seq(1, length(uri), by=4)] + uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)]) + + # smooth uri using spline + # first find where the jump is and don't fit the jump + # this is the index on the left + # jump.left.old <- which.max(uri[-1]-uri[-length(uri)]) + short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2)) + + if(short.list.length < max(tt)){ + jump.left <- which(tt>short.list.length)[1]-1 + } else { + jump.left <- which.max(tt) + } + +# reversed.index <- seq(length(tt), 1, by=-1) +# nequal <- sum(uri[reversed.index]== tt[reversed.index]) +# temp <- which(uri[reversed.index]== tt[reversed.index])[nequal] +# jump.left <- length(tt)-temp + + if(jump.left < 6){ + jump.left <- length(tt) + } + + + if(is.null(spline.df)) + uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4) + else{ + uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df) + } + # predict the first derivative + uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1) + + invisible(list(tv=tv, uri=uri, + uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)], + uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left, + ntotal=ntotal)) + } + + +# change the scale of uri from based on t (percentage) to n (number of peaks or basepairs) +# this is for plotting multiple pairwise URI's on the same plot +scale.t2n <- function(uri){ + + ntotal <- uri$ntotal + tv <- uri$tv*uri$ntotal + uri.uri <- uri$uri*uri$ntotal + jump.left <- uri$jump.left + uri.spl <- uri$uri.spl + uri.spl$x <- uri$uri.spl$x*uri$ntotal + uri.spl$y <- uri$uri.spl$y*uri$ntotal + + t.binned <- uri$t.binned*uri$ntotal + uri.slope <- uri$uri.slope + uri.der <- uri$uri.der + uri.der$x <- uri$uri.der$x*uri$ntotal + uri.der$y <- uri$uri.der$y + + uri.n <- list(tv=tv, uri=uri.uri, t.binned=t.binned, uri.slope=uri.slope, uri.spl=uri.spl, uri.der=uri.der, ntotal=ntotal, jump.left=jump.left) + return(uri.n) +} + + + + +# a wrapper for running URI for peaks from peak calling results +# both data1 and data2 are calling results in narrowpeak format +compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){ + + tt <- seq(0.01, 1, by=0.01) + vv <- tt + + if(sig.value1=="signal.value"){ + data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$signal.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) + } else { + if(sig.value1=="p.value"){ + data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$p.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) + } else { + if(sig.value1=="q.value"){ + data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$q.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) + } + } + } + + if(sig.value2=="signal.value"){ + data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$signal.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) + } else { + if(sig.value2=="p.value"){ + data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$p.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) + } else { + if(sig.value2=="q.value"){ + data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$q.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) + } + } + } + + ### by peaks + # data12.enrich <- pair.peaks(data.1.enrich, data.2.enrich) + data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio) + uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df) + uri.n <- scale.t2n(uri) + + return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2)) + + +} + + + +# compute uri for matched sample +get.uri.matched <- function(data12, df=10){ + + tt <- seq(0.01, 1, by=0.01) + vv <- tt + uri <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, tt, vv, spline.df=df) + + # change scale from t to n + uri.n <- scale.t2n(uri) + + return(list(uri=uri, uri.n=uri.n)) + +} + +# map.uv is a pair of significant values corresponding to specified consistency FDR +# assuming values in map.uv and qvalue are linearly related +# data.set is the original data set +# sig.value is the name of the significant value in map.uv, say enrichment +# nominal.value is the one we want to map to, say q-value +# +map.sig.value <- function(data.set, map.uv, nominal.value){ + + index.nominal <- which(names(data.set$merge1)==nominal.value) + nentry <- nrow(map.uv) + map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")]) + + for(i in 1:nentry){ + + map.nominal[i, "sig.value1"] <- data.set$merge1[unique(which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))), index.nominal] + map.nominal[i, "sig.value2"] <- data.set$merge2[unique(which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))), index.nominal] + } + + invisible(map.nominal) +} + + +############### plot correspondence profile + +# plot multiple comparison wrt one template +# uri.list contains the total number of peaks +# plot.missing=F: not plot the missing points on the right +plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){ + + if(is.null(col.txt)) + col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") + + n <- length(uri.n.list) + + ntotal <- c() + for(i in 1:n) + ntotal[i] <- uri.n.list[[i]]$ntotal + + jump.left <- c() + jump.left.der <- c() + ncommon <- c() + for(i in 1:n){ +# jump.left[i] <- which.max(uri.n.list[[i]]$uri[-1]-uri.n.list[[i]]$uri[-length(uri.n.list[[i]]$uri)]) +# if(jump.left[i] < 6) +# jump.left[i] <- length(uri.n.list[[i]]$uri) + +## reversed.index <- seq(length(uri.n.list[[i]]$tv[,1]), 1, by=-1) +## nequal <- sum(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1]) +## temp <- which(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])[nequal] +## jump.left[i] <- length(uri.n.list[[i]]$tv[,1])-temp +##print(uri.n.list[[i]]$uri) +##print(uri.n.list[[i]]$tv[,1]) +## jump.left[i] <- uri.n.list[[i]]$jump.left + +# jump.left.der[i] <- sum(uri.n.list[[i]]$t.binned < uri.n.list[[i]]$uri.der$x[length(uri.n.list[[i]]$uri.der$x)]) + + jump.left[i] <- uri.n.list[[i]]$jump.left + jump.left.der[i] <- jump.left[i] + ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1] + } + + + if(plot.missing){ + max.peak <- max(ntotal) + } else { + max.peak <- max(ncommon)*1.05 + } + + if(!is.null(file.name)){ + postscript(paste(plot.dir, "uri.", file.name, sep="")) + par(mfrow=c(1,1), mar=c(5,5,4,2)) + } + + plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2) + + for(i in 1:n){ + + if(plot.missing){ + points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5 ) + } else { + points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5) + } + lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4) + } + abline(coef=c(0,1), lty=3) + legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2) + if(!is.null(title)) + title(title.txt) + + if(!is.null(file.name)){ + dev.off() + } + + if(!is.null(file.name)){ + postscript(paste(plot.dir, "duri.", file.name, sep="")) + par(mfrow=c(1,1), mar=c(5,5,4,2)) + } + plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2) + + for(i in 1:n){ +# if(plot.missing){ +# points(uri.n.list[[i]]$t.binned, uri.n.list[[i]]$uri.slope, col=col.txt[i+col.start], cex=0.5) +# } else { +# points(uri.n.list[[i]]$t.binned[1:jump.left.der[i]], uri.n.list[[i]]$uri.slope[1:jump.left.der[i]], col=col.txt[i+col.start], cex=0.5) +# } + lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4) + } + abline(h=1, lty=3) + legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2) + + if(!is.null(title)) + title(title.txt) + + if(!is.null(file.name)){ + dev.off() + } + +} + + + +####################### +####################### copula fitting for matched peaks +####################### + +# estimation from mixed copula model + +# 4-5-09 +# A nonparametric estimation of mixed copula model + + +# updated + +# c1, c2, f1, f2, g1, g2 are vectors +# c1*f1*g1 and c2*f2*g2 are copula densities for the two components +# xd1 and yd1 are the values of marginals for the first component +# xd2 and yd2 are the values of marginals for the 2nd component +# +# ez is the prob for being in the consistent group +get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){ + + return(p*c1*xd1*yd1/(p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2)) +} + +# checked + +# this is C_12 not the copula density function c=C_12 * f1* f2 +# since nonparametric estimation is used here for f1 and f2, which +# are constant throughout the iterations, we don't need them for optimization +# +# bivariate gaussian copula function +# t and s are vectors of same length, both are percentiles +# return a vector +gaussian.cop.den <- function(t, s, rho){ + + A <- qnorm(t)^2 + qnorm(s)^2 + B <- qnorm(t)*qnorm(s) + + loglik <- -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A-2*B) + + return(exp(loglik)) +} + +clayton.cop.den <- function(t, s, rho){ + + if(rho > 0) + return(exp(log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1))) + + if(rho==0) + return(1) + + if(rho<0) + stop("Incorrect Clayton copula coefficient") + +} + + +# checked +# estimate rho from Gaussian copula +mle.gaussian.copula <- function(t, s, e.z){ + + # reparameterize to bound from rho=+-1 + l.c <- function(rho, t, s, e.z){ +# cat("rho=", rho, "\n") + sum(e.z*log(gaussian.cop.den(t, s, rho)))} + + rho.max <- optimize(f=l.c, c(-0.998, 0.998), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z) + +#print(rho.max$m) + +#cat("cor=", cor(qnorm(t)*e.z, qnorm(s)*e.z), "\t", "rho.max=", rho.max$m, "\n") +# return(sign(rho.max$m)/(1+rho.max$m)) + return(rho.max$m) +} + + +# estimate mle from Clayton copula, +mle.clayton.copula <- function(t, s, e.z){ + + l.c <- function(rho, t, s, e.z){ + lc <- sum(e.z*log(clayton.cop.den(t, s, rho))) +# cat("rho=", rho, "\t", "l.c=", lc, "\n") + return(lc) + } + + rho.max <- optimize(f=l.c, c(0.1, 20), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z) + + return(rho.max$m) +} + + + +# updated +# mixture likelihood of two gaussian copula +# nonparametric and ranked transformed +loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){ + + px.1 <- get.pdf.cdf(x, x.mar$f1) + px.2 <- get.pdf.cdf(x, x.mar$f2) + py.1 <- get.pdf.cdf(y, y.mar$f1) + py.2 <- get.pdf.cdf(y, y.mar$f2) + + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + + sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) +} + +loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){ + + px.1 <- pdf.cdf$px.1 + px.2 <- pdf.cdf$px.2 + py.1 <- pdf.cdf$py.1 + py.2 <- pdf.cdf$py.2 + + if(copula.txt=="gaussian"){ + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + } else { + if(copula.txt=="clayton"){ + c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) + } + } + sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) +} + + +# estimate the marginals of each component using histogram estimator in EM +# return the density, breaks, and cdf of the histogram estimator +est.mar.hist <- function(x, e.z, breaks){ + + binwidth <- c() + nbin <- length(breaks)-1 + nx <- length(x) + + # the histogram + x1.pdf <- c() + x2.pdf <- c() + x1.cdf <- c() + x2.cdf <- c() + + # the pdf for each point + x1.pdf.value <- rep(NA, nx) + x2.pdf.value <- rep(NA, nx) + + x1.cdf.value <- rep(NA, nx) + x2.cdf.value <- rep(NA, nx) + + for(i in 1:nbin){ + + binwidth[i] <- breaks[i+1] - breaks[i] + if(i < nbin) + in.bin <- x>= breaks[i] & x < breaks[i+1] + else # last bin + in.bin <- x>= breaks[i] & x <=breaks[i+1] + + # each bin add one observation to avoid empty bins + # multiple (nx+nbin)/(nx+nbin+1) to avoid blowup when looking up for + # quantiles + x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1) + x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1) + + +# x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1) +# x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1) + +# treat each bin as a value for a discrete variable +# x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i]) +# x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i]) + + + # cumulative density before reaching i + if(i>1){ + x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)]) + x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)]) + } else{ + x1.cdf[i] <- 0 + x2.cdf[i] <- 0 + } + + # make a vector of nx to store the values of pdf and cdf for each x + # this will speed up the computation dramatically + x1.pdf.value[in.bin] <- x1.pdf[i] + x2.pdf.value[in.bin] <- x2.pdf[i] + + x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i]) + x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i]) + } + +# x1.cdf <- cumsum(x1.pdf*binwidth) +# x2.cdf <- cumsum(x2.pdf*binwidth) + + f1 <-list(breaks=breaks, density=x1.pdf, cdf=x1.cdf) + f2 <-list(breaks=breaks, density=x2.pdf, cdf=x2.cdf) + + f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value) + f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value) + + return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value)) +} + +# estimate the marginal cdf from rank +est.cdf.rank <- function(x, conf.z){ + + # add 1 to prevent blow up + x1.cdf <- rank(x[conf.z==1])/(length(x[conf.z==1])+1) + + x2.cdf <- rank(x[conf.z==0])/(length(x[conf.z==0])+1) + + return(list(cdf1=x1.cdf, cdf2=x2.cdf)) +} + +# df is a density function with fields: density, cdf and breaks, x is a scalar +get.pdf <- function(x, df){ + + if(x < df$breaks[1]) + cat("x is out of the range of df\n") + + index <- which(df$breaks >= x)[1] + + if(index==1) + index <- index +1 + return(df$density[index-1]) +} + +# get cdf from histgram estimator for a single value +get.cdf <- function(x, df){ + + index <- which(df$breaks >= x)[1] + if(index==1) + index <- index +1 + return(df$cdf[index-1]) +} + +# df is a density function with fields: density, cdf and breaks +get.pdf.cdf <- function(x.vec, df){ + + x.pdf <- sapply(x.vec, get.pdf, df=df) + x.cdf <- sapply(x.vec, get.cdf, df=df) + return(list(cdf=x.cdf, pdf=x.pdf)) +} + +# E-step +# x and y are the original observations or ranks +# rho1 and rho2 are the parameters of each copula +# f1, f2, g1, g2 are functions, each is a histogram +e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){ + + # get pdf and cdf of each component from functions in the corresponding component + px.1 <- get.pdf.cdf(x, x.mar$f1) + px.2 <- get.pdf.cdf(x, x.mar$f2) + py.1 <- get.pdf.cdf(y, y.mar$f1) + py.2 <- get.pdf.cdf(y, y.mar$f2) + + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + + return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf)) +} + +# E-step +# rho1 and rho2 are the parameters of each copula +e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){ + + # get pdf and cdf of each component from functions in the corresponding component + px.1 <- get.pdf.cdf(x, x.mar$f1) + px.2 <- get.pdf.cdf(x, x.mar$f2) + py.1 <- get.pdf.cdf(y, y.mar$f1) + py.2 <- get.pdf.cdf(y, y.mar$f2) + + if(copula.txt=="gaussian"){ + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + } else { + if(copula.txt=="clayton"){ + c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) + } + } + return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf)) +} + + + + +# M-step +m.step.2gaussian <- function(x, y, e.z, breaks){ + + # compute f1, f2, g1 and g2 + x.mar <- est.mar.hist(x, e.z, breaks) + y.mar <- est.mar.hist(y, e.z, breaks) + + px.1 <- get.pdf.cdf(x, x.mar$f1) + px.2 <- get.pdf.cdf(x, x.mar$f2) + py.1 <- get.pdf.cdf(y, y.mar$f1) + py.2 <- get.pdf.cdf(y, y.mar$f2) + + rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) + rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) + + p <- sum(e.z)/length(e.z) + + return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar)) +} + +m.step.2copula <- function(x, y, e.z, breaks, copula.txt){ + + # compute f1, f2, g1 and g2 + x.mar <- est.mar.hist(x, e.z, breaks) + y.mar <- est.mar.hist(y, e.z, breaks) + + px.1 <- get.pdf.cdf(x, x.mar$f1) + px.2 <- get.pdf.cdf(x, x.mar$f2) + py.1 <- get.pdf.cdf(y, y.mar$f1) + py.2 <- get.pdf.cdf(y, y.mar$f2) + + if(copula.txt=="gaussian"){ + rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) + rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) + } else { + if(copula.txt=="clayton"){ + rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z) + rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z) + } + } + + p <- sum(e.z)/length(e.z) + + return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar)) +} + + + +# E-step: pass values +# x and y are the original observations or ranks +# rho1 and rho2 are the parameters of each copula +# f1, f2, g1, g2 are functions, each is a histogram +e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){ + + c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) + c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) + + e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, + pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf) + return(e.z) +} + + +e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){ + + if(copula.txt =="gaussian"){ + c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) + c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) + } else { + if(copula.txt =="clayton"){ + c1 <- clayton.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) + c2 <- clayton.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) + } + } + + e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, + pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf) + return(e.z) +} + + +# M-step: pass values +m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){ + + # compute f1, f2, g1 and g2 + x.mar <- est.mar.hist(x, e.z, breaks) + y.mar <- est.mar.hist(y, e.z, breaks) + +# px.1 <- get.pdf.cdf(x, x.mar$f1) +# px.2 <- get.pdf.cdf(x, x.mar$f2) +# py.1 <- get.pdf.cdf(y, y.mar$f1) +# py.2 <- get.pdf.cdf(y, y.mar$f2) + + px.1 <- x.mar$f1.value + px.2 <- x.mar$f2.value + py.1 <- y.mar$f1.value + py.2 <- y.mar$f2.value + + rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) + + if(!fix.rho2) + rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) + else + rho2 <- 0 + + p <- sum(e.z)/length(e.z) + + pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) + + return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, + pdf.cdf=pdf.cdf)) +} + +m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){ + + # compute f1, f2, g1 and g2 +# x.mar <- est.mar.hist(x, e.z, breaks) +# y.mar <- est.mar.hist(y, e.z, breaks) + +# px.1 <- get.pdf.cdf(x, x.mar$f1) +# px.2 <- get.pdf.cdf(x, x.mar$f2) +# py.1 <- get.pdf.cdf(y, y.mar$f1) +# py.2 <- get.pdf.cdf(y, y.mar$f2) + + px.1 <- x.mar$f1.value + px.2 <- x.mar$f2.value + py.1 <- y.mar$f1.value + py.2 <- y.mar$f2.value + + rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) + + if(!fix.rho2) + rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) + else + rho2 <- 0 + + p <- sum(e.z)/length(e.z) + + pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) + + return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, + pdf.cdf=pdf.cdf)) +} + + + +m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){ + + # compute f1, f2, g1 and g2 + x.mar <- est.mar.hist(x, e.z, breaks) + y.mar <- est.mar.hist(y, e.z, breaks) + +# px.1 <- get.pdf.cdf(x, x.mar$f1) +# px.2 <- get.pdf.cdf(x, x.mar$f2) +# py.1 <- get.pdf.cdf(y, y.mar$f1) +# py.2 <- get.pdf.cdf(y, y.mar$f2) + + px.1 <- x.mar$f1.value + px.2 <- x.mar$f2.value + py.1 <- y.mar$f1.value + py.2 <- y.mar$f2.value + + if(copula.txt=="gaussian"){ + rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) + + if(!fix.rho2) + rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) + else + rho2 <- 0 + } else { + + if(copula.txt=="clayton"){ + rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z) + + if(!fix.rho2) + rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z) + else + rho2 <- 0 + } + } + + p <- sum(e.z)/length(e.z) + + pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) + + return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, + pdf.cdf=pdf.cdf)) +} + + + + +# updated +# mixture likelihood of two gaussian copula +# nonparametric and ranked transformed +loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){ + + px.1 <- pdf.cdf$px.1 + px.2 <- pdf.cdf$px.2 + py.1 <- pdf.cdf$py.1 + py.2 <- pdf.cdf$py.2 + + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + + sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) +} + + + +# updated +# mixture likelihood of two gaussian copula +# nonparametric and ranked transformed +loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){ + + px.1 <- pdf.cdf$px.1 + px.2 <- pdf.cdf$px.2 + py.1 <- pdf.cdf$py.1 + py.2 <- pdf.cdf$py.2 + + if(copula.txt=="gaussian"){ + c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) + } else { + if(copula.txt=="clayton"){ + c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) + c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) + } + } + + sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) +} + + + +# EM for 2 Gaussian, speed up computation, unfinished + +em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){ + + x <- rank(x, tie="random") + y <- rank(y, tie="random") + +# x <- rank(x, tie="average") +# y <- rank(y, tie="average") + + # nbin=20 + xy.min <- min(x, y) + xy.max <- max(x, y) + binwidth <- (xy.max-xy.min)/50 + breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50) +# breaks <- seq(xy.min, xy.max, by=binwidth) + + + # initiate marginals + # initialization: first p0 data has +# e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped + + e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0))) + + if(!stoc) + para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2) + else + para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) + + + if(fix.p){ + p <- p0 + } else { + p <- para$p + } + + if(fix.rho2){ + rho2 <- rho2.0 + } else { + rho2 <- para$rho2 + } + +# rho1 <- 0.8 + rho1 <- para$rho1 + + l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf) + + loglik.trace <- c() + loglik.trace[1] <- l0 +# loglik.trace[2] <- l1 + to.run <- T + + i <- 2 + + # this two lines to remove +# x.mar <- est.mar.hist(x, e.z, breaks) +# y.mar <- est.mar.hist(y, e.z, breaks) + + while(to.run){ + + e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf) + if(!stoc) + para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2) + else + para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) + + # fix x.mar and y.mar : to remove +# if(!stoc) +# para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar) +# else +# para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) + + + if(fix.p){ + p <- p0 + } else { + p <- para$p + } + + if(fix.rho2){ + rho2 <- rho2.0 + } else { + rho2 <- para$rho2 + } + +# rho1 <- 0.8 + rho1 <- para$rho1 + + # l0 <- l1 + l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf) + loglik.trace[i] <- l1 + +#cat("l1=", l1, "\n") + + # Aitken acceleration criterion + if(i > 2){ + l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) + to.run <- abs(l.inf - loglik.trace[i]) > eps +#cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n") +#cat("l.inf=", l.inf, "\n") +#cat(l.inf-loglik.trace[i], "\n") + } + + i <- i+1 + } + + bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters + return(list(para=list(p=para$p, rho1=rho1, rho2=rho2), + loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, + loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar, + breaks=breaks)) +} + + + +em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){ + + x <- rank(x, tie="random") + y <- rank(y, tie="random") + +# x <- rank(x, tie="first") +# y <- rank(y, tie="first") + + # nbin=50 + xy.min <- min(x, y) + xy.max <- max(x, y) + binwidth <- (xy.max-xy.min)/50 + breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin) +# breaks <- seq(xy.min, xy.max, by=binwidth) + + # initiate marginals + # initialization: first p0 data has +# e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped + + e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0))) + + + if(!stoc) + para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt) + else + para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt) + + if(fix.p){ + p <- p0 + } else { + p <- para$p + } + + if(fix.rho2){ + rho2 <- rho2.0 + } else { + rho2 <- para$rho2 + } + + l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) + + loglik.trace <- c() + loglik.trace[1] <- l0 +# loglik.trace[2] <- l1 + to.run <- T + + i <- 2 + + while(to.run){ + + e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) + if(!stoc) + para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt) + else + para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt) + + if(fix.p){ + p <- p0 + } else { + p <- para$p + } + + if(fix.rho2){ + rho2 <- rho2.0 + } else { + rho2 <- para$rho2 + } + + + # l0 <- l1 + l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) + loglik.trace[i] <- l1 + +cat("l1=", l1, "\n") + + # Aitken acceleration criterion + if(i > 2){ + l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) + to.run <- abs(l.inf - loglik.trace[i]) > eps +cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n") +#cat("l.inf=", l.inf, "\n") +#cat(l.inf-loglik.trace[i], "\n") + } + + i <- i+1 + } + + bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters + return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2), + loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, + loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar, + breaks=breaks)) +} + + +####################### +####################### fit EM procedure for the matched peaks +####################### + +# remove the unmatched ones +#rm.unmatch <- function(sample1, sample2, p.value.impute=0){ +# +# sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] +# sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] +# +# invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value)) +#} + + +# fit 2-component model +#fit.em <- function(sample12, fix.rho2=T){ +# +# prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) +# +# em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2, +# p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2) +# +# invisible(list(em.fit=em.fit, data.pruned=prune.sample)) +#} + + +rm.unmatch <- function(sample1, sample2, p.value.impute=0){ + + sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] + sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] + + invisible(list(sample1=sample1.prune, sample2=sample2.prune)) +} + + +# fit 2-component model +fit.em <- function(sample12, fix.rho2=T){ + + prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) + + em.fit <- em.2gaussian.quick(-prune.sample$sample1$sig.value, -prune.sample$sample2$sig.value, + p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2) + + invisible(list(em.fit=em.fit, data.pruned=prune.sample)) +} + + + +fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){ + + prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) + +# o <- order(prune.sample$sample1) +# n <- length(prune.sample$sample1) + +# para <- init(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, c(rep(0, round(n/3)), rep(c(0,1), round(n/6)), rep(1, n-round(n/3)-round(n/6)))) + +# temp <- init.dist(f0, f1) + para <- list() + para$rho <- 0.6 + para$p <- 0.3 + para$mu <- 2.5 + para$sigma <- 1 +## para$mu <- -temp$mu +## para$sigma <- temp$sigma +#cat("mu=", para$mu, "sigma=", para$sigma, "\n") + +# em.fit <- em.transform.1loop(-prune.sample$sample1, -prune.sample$sample2, + cat("EM is running") + em.fit <- em.transform(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, para$mu, para$sigma, para$rho, para$p, eps=0.01) + + invisible(list(em.fit=em.fit, data.pruned=prune.sample)) +} + + + + +# fit 1-component model +fit.1.component <- function(data.pruned, breaks){ + +# gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks) +# clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks) + + gaussian.1 <- fit.gaussian.1(-data.pruned$sample1, -data.pruned$sample2, breaks) + clayton.1 <- fit.clayton.1(-data.pruned$sample1, -data.pruned$sample2, breaks) + + return(list(gaussian.1=gaussian.1, clayton.1=clayton.1)) +} + + + +################# +# Fit a single component +################# + +# a single gaussian copula +# if breaks=NULL, use empirical pdf, otherwise use histogram estimate +fit.gaussian.1 <- function(x, y, breaks=NULL){ + + # rank transformed and compute the empirical cdf + t <- emp.mar.cdf.rank(x) + s <- emp.mar.cdf.rank(y) + + mle.rho <- mle.gaussian.copula(t, s, rep(1, length(t))) + + c1 <- gaussian.cop.den(t, s, mle.rho) +cat("c1", sum(log(c1)), "\n") + + if(is.null(breaks)){ + f1 <- emp.mar.pdf.rank(t) + f2 <- emp.mar.pdf.rank(s) + } else { + x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks) + y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks) + + f1 <- x.mar$f1.value$pdf # only one component + f2 <- y.mar$f1.value$pdf + } + + +cat("f1", sum(log(f1)), "\n") +cat("f2", sum(log(f2)), "\n") + + loglik <- sum(log(c1)+log(f1)+log(f2)) + + bic <- -2*loglik + log(length(t))*(1+length(breaks)-1) + + return(list(rho=mle.rho, loglik=loglik, bic=bic)) +} + + +# a single Clayton copula +fit.clayton.1 <- function(x, y, breaks=NULL){ + + # rank transformed and compute the empirical cdf + t <- emp.mar.cdf.rank(x) + s <- emp.mar.cdf.rank(y) + + mle.rho <- mle.clayton.copula(t, s, rep(1, length(t))) + + c1 <- clayton.cop.den(t, s, mle.rho) + + if(is.null(breaks)){ + f1 <- emp.mar.pdf.rank(t) + f2 <- emp.mar.pdf.rank(s) + } else { + x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks) + y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks) + + f1 <- x.mar$f1.value$pdf # only one component + f2 <- y.mar$f1.value$pdf + } + + loglik <- sum(log(c1)+log(f1)+log(f2)) + + bic <- -2*loglik + log(length(t))*(1+length(breaks)-1) + + return(list(rho=mle.rho, tau=rho/(rho+2), loglik=loglik, bic=bic)) +} + +## obsolete function (01-06-2010) +## compute the average posterior probability to belong to the random component +## for peaks selected at different cutoffs +comp.uri.ez <- function(tt, u, v, e.z){ + + u.t <- quantile(u, prob=(1-tt)) + v.t <- quantile(v, prob=(1-tt)) + + # ez <- mean(e.z[u >= u.t & v >=u.t]) Is this wrong? + ez <- mean(e.z[u >= u.t & v >=v.t]) + + return(ez) +} + +## obsolete function (01-06-2010) +# compute the largest posterior error probability corresponding to +# the square centered at the origin and spanned top tt% on both coordinates +# so the consistent low rank ones are excluded +# boundary.txt: either "max" or "min", if it is error prob, use "max" +comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){ + + u.t <- quantile(u, prob=(1-tt)) + v.t <- quantile(v, prob=(1-tt)) + + if(boundary.txt == "max"){ + # ez.bound <- max(e.z[u >= u.t & v >=u.t]) + ez.bound <- max(e.z[u >= u.t & v >=v.t]) + } else { + # ez.bound <- min(e.z[u >= u.t & v >=u.t]) + ez.bound <- min(e.z[u >= u.t & v >=v.t]) + } + + return(ez.bound) + +} + +# obsolete functions: 01-06-2010 +# compute the error rate +# u.t and v.t are the quantiles +# this one is used for the plots generated initially in the brief writeup +# and it was used for processing merged data in July before the IDR definition +# is formalized +# It does not implement the current definition of IDR +get.ez.tt.old <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){ + + u <- em.fit$data.pruned$sample1 + v <- em.fit$data.pruned$sample2 + + tt <- seq(0.01, 0.99, by=0.01) + if(reverse){ + e.z <- 1-em.fit$em.fit$e.z # this is the error prob + uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) + ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") + } else { + e.z <- em.fit$em.fit$e.z + uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) + ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") + } + + u.t <- quantile(u, prob=(1-tt)) + v.t <- quantile(v, prob=(1-tt)) + + # find the levels on the two replicates + sig.value1 <- c() + sig.value2 <- c() + error.prob.cutoff <- c() + n.selected.match <- c() + + for(i in 1:length(fdr.level)){ + + # find which uri.ez is closet to fdr.level + index <- which.min(abs(uri.ez - fdr.level[i])) + sig.value1[i] <- u.t[index] + sig.value2[i] <- v.t[index] + error.prob.cutoff[i] <- ez.bound[index] + if(reverse){ + n.selected.match[i] <- sum(e.z<=ez.bound[index]) + } else { + n.selected.match[i] <- sum(e.z>=ez.bound[index]) + } + } + + # output the cutoff of posterior probability, signal values on two replicates + map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match) + + return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) +} + +# created: 01-06-2010 +# Output IDR at various number of selected peaks +# Find cutoff (idr cutoff, sig.value cutoff on each replicate) for specified IDR level +# IDR definition is similar to FDR +get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){ + +# u <- em.fit$data.pruned$sample1$sig.value +# v <- em.fit$data.pruned$sample2$sig.value + u <- em.fit$data.pruned$sample1 + v <- em.fit$data.pruned$sample2 + + e.z <- 1-em.fit$em.fit$e.z # this is the error prob + + o <- order(e.z) + e.z.ordered <- e.z[o] + n.select <- c(1:length(e.z)) + IDR <- cumsum(e.z.ordered)/n.select + + u.o <- u[o] + v.o <- v[o] + + n.level <- length(idr.level) +# sig.value1 <- rep(NA, n.level) +# sig.value2 <- rep(NA, n.level) + ez.cutoff <- rep(NA, n.level) + n.selected <- rep(NA, n.level) + + for(i in 1:length(idr.level)){ + + # find which uri.ez is closet to fdr.level + index <- which.min(abs(IDR - idr.level[i])) +# sig.value1[i] <- min(u.o[1:index]) +# sig.value2[i] <- min(v.o[1:index]) + ez.cutoff[i] <- e.z[index] + n.selected[i] <- sum(e.z<=ez.cutoff[i]) + } + + # output the cutoff of posterior probability, number of selected overlapped peaks +# map.uv <- cbind(ez.cutoff, sig.value1, sig.value2, n.selected) + + map.uv <- cbind(ez.cutoff, n.selected) + + return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv)) +} + +# return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) + + + + + +### compute the mean of the marginals +get.mar.mean <- function(em.out){ + + x.f1 <- em.out$x.mar$f1 + x.f2 <- em.out$x.mar$f2 + + y.f1 <- em.out$y.mar$f1 + y.f2 <- em.out$y.mar$f2 + + x.stat1 <- get.hist.mean(x.f1) + x.stat2 <- get.hist.mean(x.f2) + y.stat1 <- get.hist.mean(y.f1) + y.stat2 <- get.hist.mean(y.f2) + + return(list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean, + y.mean1=y.stat1$mean, y.mean2=y.stat2$mean, + x.sd1=x.stat1$sd, x.sd2=x.stat2$sd, + y.sd1=y.stat1$sd, y.sd2=y.stat2$sd + )) + +} + + +# compute the mean of marginals +get.hist.mean <- function(x.f){ + + nbreaks <- length(x.f$breaks) + x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks] + + x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2 + x.mean <- sum(x.mid*x.f$density*x.bin) + x.sd <- sqrt(sum(x.mid*x.mid*x.f$density*x.bin)-x.mean^2) + + return(list(mean=x.mean, sd=x.sd)) +} + +get.hist.var <- function(x.f){ + + nbreaks <- length(x.f$breaks) + x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks] + + x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2 + x.mean <- sum(x.mid*x.f$density*x.bin) + + return(mean=x.mean) +} + +# obsolete function (01-06-2010) +# plot +plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){ + + if(is.null(col.txt)) + col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") + + x <- c() + y <- c() + + for(i in 1:length(ez.list)){ + x <- c(x, ez.list[[i]]$n) + + y <- c(y, ez.list[[i]]$uri.ez) + } + + if(is.null(y.lim)) + y.lim <- c(0, max(y)) + + if(!is.null(file.name)){ + postscript(paste(plot.dir, "ez.", file.name, sep="")) + par(mfrow=c(1,1), mar=c(5,5,4,2)) + } + + plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) + + for(i in 1:length(ez.list)){ + lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5) + } + +# plot(ez.list[[1]]$u.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) +# plot(ez.list[[1]]$v.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) + + + legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2) + + if(!is.null(title)) + title(title.txt) + + if(!is.null(file.name)){ + dev.off() + } + +} + + +plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="IDR", col.txt=NULL, title.txt=NULL){ + + if(is.null(col.txt)) + col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") + + n.entry <- length(ez.list) + x <- rep(NA, n.entry) + y.max <- rep(NA, n.entry) + + for(i in 1:n.entry){ + x[i] <- max(ez.list[[i]]$n) + + y.max[i] <- max(ez.list[[i]]$IDR) + + } + + if(is.null(y.lim)) + y.lim <- c(0, max(y.max)) + + if(!is.null(file.name)){ + postscript(paste(plot.dir, "ez.", file.name, sep="")) + par(mfrow=c(1,1), mar=c(5,5,4,2)) + } + + + + plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) + + q <- seq(0.01, 0.99, by=0.01) + + for(i in 1:length(ez.list)){ + + n.plot <- round(quantile(ez.list[[i]]$n, prob=q)) + IDR.plot <- ez.list[[i]]$IDR[n.plot] + lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5) + } + + + legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2) + + if(!is.null(title)) + title(title.txt) + + if(!is.null(file.name)){ + dev.off() + } + +} + + + +############################################################################# +############################################################################# +# statistics about peaks selected on the individual replicates +# +# idr.level: the consistency cutoff, say 0.05 +# uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r +# ez.list : a list of IDRs computed from get.ez.tt using the same idr.level +# +################## + + +# obsolete? +# compute the error rate +# u.t and v.t are the quantiles +# +# map back to all peaks and report the number of peaks selected +get.ez.tt.all.old <- function(em.fit, all.data1, all.data2, idr.level){ + + u <- em.fit$data.pruned$sample1 + v <- em.fit$data.pruned$sample2 + + tt <- seq(0.01, 0.99, by=0.01) +# if(reverse){ + e.z <- 1-em.fit$em.fit$e.z # this is the error prob + uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) + ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") +# } else { +# e.z <- em.fit$em.fit$e.z +# uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) +# ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") +# } + + u.t <- quantile(u, prob=(1-tt)) + v.t <- quantile(v, prob=(1-tt)) + + # find the levels on the two replicates + sig.value1 <- c() + sig.value2 <- c() + error.prob.cutoff <- c() + n.selected.match <- c() + npeak.rep1 <- c() + npeak.rep2 <- c() + + for(i in 1:length(idr.level)){ + + # find which uri.ez is closet to idr.level + index <- which.min(abs(uri.ez - as.numeric(idr.level[i]))) + + sig.value1[i] <- u.t[index] + sig.value2[i] <- v.t[index] + error.prob.cutoff[i] <- ez.bound[index] + n.selected.match[i] <- sum(u>= u.t[index] & v>=v.t[index]) + + npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i]) + npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i]) + } + + + # output the cutoff of posterior probability, signal values on two replicates + map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2) + + return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) +} + + +get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){ + + u <- em.fit$data.pruned$sample1$sig.value + v <- em.fit$data.pruned$sample2$sig.value +# u <- em.fit$data.pruned$sample1 +# v <- em.fit$data.pruned$sample2 + + e.z <- 1-em.fit$em.fit$e.z # this is the error prob + + o <- order(e.z) + e.z.ordered <- e.z[o] + n.select <- c(1:length(e.z)) + IDR <- cumsum(e.z.ordered)/n.select + + u.o <- u[o] + v.o <- v[o] + + n.level <- length(idr.level) +# sig.value1 <- rep(NA, n.level) +# sig.value2 <- rep(NA, n.level) + ez.cutoff <- rep(NA, n.level) + n.selected <- rep(NA, n.level) + npeak.rep1 <- rep(NA, n.level) + npeak.rep2 <- rep(NA, n.level) + + for(i in 1:length(idr.level)){ + + # find which uri.ez is closet to fdr.level + index <- which.min(abs(IDR - idr.level[i])) +# sig.value1[i] <- min(u.o[1:index]) +# sig.value2[i] <- min(v.o[1:index]) + ez.cutoff[i] <- e.z.ordered[index] # fixed on 02/20/10 + n.selected[i] <- sum(e.z<=ez.cutoff[i]) +# npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i]) +# npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i]) + } + + # output the cutoff of posterior probability, number of selected overlapped peaks + map.uv <- cbind(ez.cutoff, n.selected) + + return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv)) +} + +# return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) + + + + + + +####### the following is for determining thresholds for merged dataset + +############# select peaks above a given threshold +# +# pass.threshold: a simple method, passing the threshold on the threshold on the individual replicate to the pooled sample +# +# sig.map.list: a list of matrix to include all the cutoff values, each row corresponds to a cutoff. The first column is idr.level +# the 2nd column is the cutoff of ez, the rest of columns are consistency analysis for other replicates +# sig.value.name: the name of the sig.value column +# combined: combined dataset +# nrep: number of pairs of comparisons +# +# Procedure: +# 1. Find the significant threshold corresponding to the idr cutoff on the matched peaks. +# 2. Each time we will get two or more (if >2 replicates) cutoffs and will report the most stringent and the least stringent +# cutoff and the number of peaks selected at those two cutoffs +############# + +pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){ + + sig.map <- c() + + # choose idr.level + idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level) + if(length(i) ==0){ + print("no level matches specified idr.level") + return(-1) + } + + for(i in 1:length(sig.map.list)) + sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")]) + + + npeak.tight <- c() + npeak.loose <- c() + + + max.sig <- max(sig.map) + min.sig <- min(sig.map) + selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ] + selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ] + + selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] + selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] + + npeak.tight <- nrow(selected.sig.tight) + npeak.loose <- nrow(selected.sig.loose) + + + npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose) + + invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose)) +} + +################# +# pass the regions selected from consistency analysis to combined data +# Threshold is determined on the replicates, the regions above the threshold are selected +# then peaks on the combined data are selected from the selected regions +# +# To avoid being too stringent, regions satisfying the following conditions are selected +# 1. regions above the significant threshold determined by consistency analysis on either replicate +# 2. regions that have consistent low peaks, i.e. posterior prob > threshold but not passing the significant threshold +# +# This method doesn't make a difference when using different thresholds +################# + +pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){ + + combined <- combined[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + npair <- length(uri.output) # number of pairs of consistency analysis + combined.region <- c() + + # choose idr.level + idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level) + if(length(idr.index) ==0){ + print("no level matches specified idr.level") + return(-1) + } + + for(j in 1:npair){ + # select peaks from individual replicates using individual cutoff + above.1 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"] + above.2 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"] + selected.sig.rep1 <- uri.output[[j]]$data12.enrich$merge1[above.1, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + selected.sig.rep2 <- uri.output[[j]]$data12.enrich$merge2[above.2, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + + # find the peaks that are overlapped with reliable peaks in the individual replicates + overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2 + overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2 + + # choose the ones with significant value > 0, which are the overlapped ones + + combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + + ## consistent low significant ones + ## first find consistenct ones, ie. high posterior prob + # is.consistent <- ez.list[[j]]$e.z < ez.list[[j]]$ez.cutoff + + # data.matched <- keep.match(uri.output[[j]]$data12.enrich$merge1[!above.1, ], uri.output[[j]]$data12.enrich$merge2[!above.2, ], sig.value.impute=0) + # data.matched$sample1 <- data.matched$sample1[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + # data.matched$sample2 <- data.matched$sample2[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + + # consistent.in1 <- data.matched$sample1[is.consistent, ] + # consistent.in2 <- data.matched$sample2[is.consistent, ] + + # overlap.consistent.1 <- pair.peaks(consistent.in1, combined)$merge2 + # overlap.consistent.2 <- pair.peaks(consistent.in2, combined)$merge2 + + ## choose the ones with significant value > 0, which are the overlapped ones + + # combined.consistent.in1 <- overlap.consistent.1[overlap.consistent.1$sig.value > sig.value.impute, ] + # combined.consistent.in2 <- overlap.consistent.2[overlap.consistent.2$sig.value > sig.value.impute, ] + + # combined.region <- rbind(combined.region, combined.in1, combined.in2, combined.consistent.in1, combined.consistent.in2) + + combined.region <- rbind(combined.region, combined.in1, combined.in2) + + is.repeated <- duplicated(combined.region$start) + combined.region <- combined.region[!is.repeated, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] + + } + npeak <- nrow(combined.region) + + sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T)) + + # idr.combined <- c(min(combined.region[,"q.value"], na.rm=T), max(combined.region[,"q.value"], na.rm=T)) + + npeak.stat <- list(idr.level=idr.level, npeak=npeak) + + combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] + + invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined)) +} + +################ +# pass structure: this method does another round of inference on the combined data +# +# To make the mixture structure comparable on the replicates and the combined data, the 2nd inference is done on the peaks +# at the reliable regions on the combined data, using rank transformed significant values. The mixture structure is estimated using my consistency analysis, which +# estimates marginal distributions of ranks using nonparametric ways. Then the significant values are found out. +# There are several advantages to do it this way: +# 1. The premise of passing structure is that the means and variance (i.e. distribution) of two replicates should be the same +# The significant values on the two replicates clearly have different distributions. The structure estimated from consistency +# analysis will generate similar rank distribution on two replicates by its setup (i.e. same number of peaks are paired up). +# 2. Because pooled sample is a black box, the structure is more likely to be followed in the matched regions than other locations, +# after all, we don't know what other things are. If even the structure doesn't hold on the matched regions, +# which is possible, let alone the other regions. Focusing on the reliable regions helps to get rid of those unknown noises. +# +# +# modified on 2-20-10: reverse rank.combined, make big sig.value with small +# ranks, to be consistent with f1 and f2 +################ + +pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){ + + columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori") + combined <- combined[, columns.keep] + combined.selected.all <- c() + + for(j in 1:npair){ + + sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep] + sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep] + + # find peaks on the matched region on the combined one + data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute) + + data.matched$sample1 <- data.matched$sample1[, columns.keep] + data.matched$sample2 <- data.matched$sample2[, columns.keep] + + overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2 + overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2 + + # choose the ones with significant value > sig.value.impute, which are the overlapped ones + + combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ] + combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ] + + combined.region <- rbind(combined.in1, combined.in2) + + is.repeated <- duplicated(combined.region$start) + combined.region <- combined.region[!is.repeated,] + + # now rank the peaks in matched region + rank.combined <- rank(-combined.region$sig.value) + + # now transform the parameters estimated into the new scale + npeaks.overlap <- nrow(combined.region) + npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1)) + + + # the breaks are the same for x and y + f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2) + # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1 + f1$breaks[1] <- min(f1$breaks[1], 0.95) + + f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2) + # the first break boundary goes up when changing scale, need set it back to be a bit smaller than 1 + f2$breaks[1] <- min(f2$breaks[1], 0.95) + + p <- em.output[[j]]$em.fit$para$p + + # find the posterior probability + errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2) + + # compute the FDR and find cutoff of posterior prob and the sig value + o <- order(errorprob.combined) + idr <- cumsum(errorprob.combined[o])/c(1:length(o)) + idr.index <- which(idr > idr.level)[1] + errorprob.cutoff <- errorprob.combined[o][idr.index] + + # find the minimum significant measure among selected peaks + sig.value <- min(combined.region$sig.value[o][1:idr.index]) + # sig.value <- quantile(combined.region$sig.value[o][1:idr.index], prob=0.05) +#sig.value <- quantile(combined.region$sig.value[errorprob.combined<=errorprob.cutoff], prob=0.05) + + # apply the significant value on the whole pooled list + combined.selected <- combined[combined$sig.value >= sig.value,] + + combined.selected.all <- rbind(combined.selected.all, combined.selected) + } + + is.repeated <- duplicated(combined.selected.all$start) + combined.selected.all <- combined.selected.all[!is.repeated,] + + npeak <- nrow(combined.selected.all) + + npeak.stat <- list(idr.level=idr.level, npeak=npeak) + + sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T)) + + # idr.combined <- c(min(combined.selected.all[,"q.value"], na.rm=T), max(combined.selected.all[,"q.value"], na.rm=T)) + # combined.selected.all <- deconcatenate.chr(combined.selected.all, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] + + combined.selected.all <- combined.selected.all[, c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")] + colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value") + + invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined)) +} + + + +# get the posterior probability of the 2nd component +get.comp2.prob <- function(x, p, f1, f2){ + + # get pdf and cdf of each component from functions in the corresponding component + px.1 <- sapply(x, get.pdf, df=f1) + px.2 <- sapply(x, get.pdf, df=f2) + + comp2prob <- 1 - p*px.1/(p*px.1+(1-p)*px.2) + + return(comp2prob) +} + +keep.match <- function(sample1, sample2, sig.value.impute=0){ + + sample1.prune <- sample1[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,] + sample2.prune <- sample2[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,] + + invisible(list(sample1=sample1.prune, sample2=sample2.prune)) +} + + +############################################## +# +# The following is for simulation +# +############################################## + + +# simulate gaussian copula +# u is the uniform random variable and rho is correlation coefficient +simu.gaussian.copula <- function(u, rho){ + + n <- length(u) + + # simulate y given x=qnorm(u) + y <- qnorm(u)*rho + rnorm(n)*sqrt(1-rho^2) + + v <- pnorm(y) + + invisible(v) +} + +## simulate Clayton copula from its generating function +## Genest and MacKay (1986) + +phi.ori <- function(t, s){ + + (t^(-s) -1)/s +} + + +phi.inv <- function(y, s){ + + exp(-log(s*y+1)/s) +} + +phi.der <- function(t, s){ + + -t^(-s-1) +} + +phi.der.inv <- function(y, s){ + + exp(log(-y)/(-s-1)) +} + +get.w <- function(u, t, s){ + + phi.der.inv(phi.der(u, s)/t, s) +} + +get.v <- function(w, u, s){ + + phi.inv(phi.ori(w, s) - phi.ori(u, s), s) +} + +# u is a uniform random variable, s is the association parameter +simu.clayton.copula <- function(u, s){ + + t <- runif(length(u)) + + if(s>0){ + w <- get.w(u, t, s) + v <- get.v(w, u, s) + return(v) + } + + if(s==0){ + return(t) + } + + if(s <0){ + print("Invalid association parameters for clayton copula") + } + +} + + + +###### 09-09-09 + +# simulate a two-component copula mixture: +# - marginal distributions for the two variables in each component are both +# normal and with the same parameters +# p is the mixing proportion of component 1 +# n is the total sample size +simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){ + + n1 <- round(n*p) + n2 <- n-n1 + + u1 <- runif(n1) + + if(copula.txt =="clayton") + v1 <- simu.clayton.copula(u1, s1) + else{ + if(copula.txt =="gaussian") + v1 <- simu.gaussian.copula(u1, s1) + } + + u2 <- runif(n2) + + if(copula.txt =="clayton") + v2 <- simu.clayton.copula(u2, s2) + else{ + if(copula.txt =="gaussian") + v2 <- simu.gaussian.copula(u2, s2) + } + + # generate test statistics + sample1.1 <- qnorm(u1, mu1, sd1) + sample1.2 <- qnorm(v1, mu1, sd1) + + sample2.1 <- qnorm(u2, mu2, sd2) + sample2.2 <- qnorm(v2, mu2, sd2) + + return(list(u=c(u1, u2), v=c(v1, v2), + u.inv=c(sample1.1, sample2.1), v.inv=c(sample1.2, sample2.2), + label=c(rep(1, n1), rep(2, n2)))) +} + +# using inverse of the cdf to generate original observations + +simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){ + + n1 <- round(n*p) + n2 <- n-n1 + + u1 <- runif(n1) + + if(copula.txt =="clayton") + v1 <- simu.clayton.copula(u1, s1) + else{ + if(copula.txt =="gaussian") + v1 <- simu.gaussian.copula(u1, s1) + } + + u2 <- runif(n2) + + if(copula.txt =="clayton") + v2 <- simu.clayton.copula(u2, s2) + else{ + if(copula.txt =="gaussian") + v2 <- simu.gaussian.copula(u2, s2) + } + + # generate test statistics +# sample1.1 <- qnorm(u1, mu1, sd1) +# sample1.2 <- qnorm(v1, mu1, sd1) + +# sample2.1 <- qnorm(u2, mu2, sd2) +# sample2.2 <- qnorm(v2, mu2, sd2) + + sample1.x <- inv.cdf.vec(u1, cdf1.x) + sample1.y <- inv.cdf.vec(v1, cdf1.y) + + sample2.x <- inv.cdf.vec(u2, cdf2.x) + sample2.y <- inv.cdf.vec(v2, cdf2.y) + + + return(list(u=c(u1, u2), v=c(v1, v2), + u.inv=c(sample1.x, sample2.x), v.inv=c(sample1.y, sample2.y), + label=c(rep(1, n1), rep(2, n2)))) +} + +# obtain original observation by converting cdf into quantiles +# u is one cdf +# u.cdf is a cdf (assuming it is a histogram) and has the break points (cdf$cdf and cdf$breaks) +# the smallest value of cdf=0 and the largest =1 +inv.cdf <- function(u, u.cdf){ + + # which bin it falls into + i <- which(u.cdf$cdf> u)[1] + q.u <- (u - u.cdf$cdf[i-1])/(u.cdf$cdf[i] - u.cdf$cdf[i-1])* (u.cdf$breaks[i]-u.cdf$breaks[i-1]) + u.cdf$breaks[i-1] + + return(q.u) +} + +inv.cdf.vec <- function(u, u.cdf){ + + # check if cdf has the right range (0, 1) + ncdf <- length(u.cdf$cdf) + nbreaks <- length(u.cdf$breaks) + + if(ncdf == nbreaks-1 & u.cdf$cdf[ncdf]< 1) + u.cdf[ncdf] <- 1 + + q.u <- sapply(u, inv.cdf, u.cdf) + + return(q.u) +} + +# here we simulate a likely real situation +# the test statistics from two normal distributions +# according to their labels, then convert them into p-values w.r.t H0 using +# one-sided test. +# The test statistics are correlated for the signal component and independent +# for the noise component +# For the signal component, Y = X + eps, where eps ~ N(0, sigma^2) +simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){ + + # first component - signal + n.signal <- round(n*p) + n.noise <- n - n.signal + + # labels + labels <- c(rep(1, n.signal), rep(0, n.noise)) + + # test statistics for signal and noise + mu.signal <- rnorm(n.signal, mu1, sd1) + x.signal <- mu.signal + rnorm(n.signal, 0, sd.e) + x.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e) + + y.signal <- mu.signal + rnorm(n.signal, 0, sd.e) + # sd.e can be dependent on signal + y.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e) + + # concatenate + x <- c(x.signal, x.noise) + y <- c(y.signal, y.noise) + + # convert to p-values based on H0 + p.x <- 1-pnorm(x, mu0, sqrt(sd0^2+sd.e^2)) + p.y <- 1-pnorm(y, mu0, sqrt(sd0^2+sd.e^2)) + + return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels)) + +} + +# compute the tradeoff and calibration +forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){ + + xx <- round(xx, 5) + o <- order(xx, decreasing=T) + + rand <- 1-labels # if rand==0, consistent + # order the random indicator in the same order + rand.o <- rand[o] + + if(sum(rand.o) > ndecoy){ + index.decoy <- which(cumsum(rand.o)==ndecoy) + } else { + index.decoy <- which(cumsum(rand.o)==sum(rand.o)) + } + + cutoff.decoy <- xx[o][index.decoy] + + # only consider the unique ones + cutoff.unique <- unique(xx[o]) + + cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]] + + get.decoy.count <- function(cut.off){ + above <- rep(0, length(xx)) + above[xx >= cut.off] <- 1 + decoy.count <- sum(above==1 & rand==1) + return(decoy.count) + } + + get.forward.count <- function(cut.off){ + above <- rep(0, length(xx)) + above[xx >= cut.off] <- 1 + forward.count <- sum(above==1 & rand==0) + return(forward.count) + } + + get.est.fdr <- function(cut.off){ + above <- rep(0, length(xx)) + above[xx >= cut.off] <- 1 + est.fdr <- 1-mean(xx[above==1]) + return(est.fdr) + } + + # assuming rand=0 is right + get.false.neg.count <- function(cut.off){ + below <- rep(0, length(xx)) + below[xx < cut.off] <- 1 + false.neg.count <- sum(below==1 & rand==0) + return(false.neg.count) + } + + get.false.pos.count <- function(cut.off){ + above <- rep(0, length(xx)) + above[xx >= cut.off] <- 1 + false.pos.count <- sum(above==1 & rand==1) + return(false.pos.count) + } + + decoy <- sapply(cutoff, get.decoy.count) + forward <- sapply(cutoff, get.forward.count) + + est.fdr <- sapply(cutoff, get.est.fdr) + emp.fdr <- decoy/(decoy+forward) + + # compute specificity and sensitivity + # assuming rand=1 is wrong and rand=0 is right + false.neg <- sapply(cutoff, get.false.neg.count) + false.pos <- sapply(cutoff, get.false.pos.count) + + true.pos <- sum(rand==0)-false.neg + true.neg <- sum(rand==1)-false.pos + + sensitivity <- true.pos/(true.pos+false.neg) + specificity <- true.neg/(true.neg+false.pos) + + return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity)) +} + + +# compute the em for jackknife and all data, and find FDR +get.emp.jack <- function(a, p0){ + + nobs <- length(a$labels) + est <- list() + est.all <- list() + + temp.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01) +# temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7, +# rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") + + est.all$p <- temp.all$para$p + est.all$rho1 <- temp.all$para$rho1 + est.all$FDR <- get.FDR(temp.all$e.z) + + FDR <- list() + p <- c() + rho1 <- c() + + + for(i in 1:nobs){ + + temp <- em.transform(-a$p.x[-i], -a$p.y[-i], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01) +# temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7, +# rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") + + est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z)) + + FDR[[i]] <- est[[i]]$FDR # this is the FDR for top n peaks + p[i] <- est[[i]]$p + rho1[i] <- est[[i]]$rho1 + } + + est.jack <- list(FDR=FDR, p=p, rho1=rho1) + return(list(est.jack=est.jack, est.all=est.all)) +} + + +# get the npeaks corresponding to the nominal FDR estimated from the sample +# and find the corresponding FDR from the entire data +get.FDR.jack <- function(est, FDR.nominal){ + + nobs <- length(est$est.jack$FDR) + FDR.all <- c() + top.n <- c() + + for(i in 1:nobs){ + top.n[i] <- max(which(est$est.jack$FDR[[i]] <= FDR.nominal)) + FDR.all[i] <- est$est.all$FDR[top.n[i]] + } + + invisible(list(FDR.all=FDR.all, top.n=top.n)) +} + +# compute Jackknife peudonumber +# a is the dataset +get.emp.IF <- function(a, p0){ + + nobs <- length(a$labels) + est <- list() + est.all <- list() + + temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7, + rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") + + est.all$p <- temp.all$para$p + est.all$rho1 <- temp.all$para$rho1 + est.all$FDR <- get.FDR(temp.all$e.z) + + IF.FDR <- list() + IF.p <- c() + IF.rho1 <- c() + + for(i in 1:nobs){ + + temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7, + rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") + + est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z)) + + IF.FDR[[i]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[i]]$FDR) # this is the FDR for top n peaks + IF.p[i] <- (nobs-1)*(est.all$p - est[[i]]$p) + IF.rho1[i] <- (nobs-1)*(est.all$rho1 - est[[i]]$rho1) + } + + emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1) + + invisible(list(emp.IF=emp.IF, est.all=est.all, est=est)) +} + +# e.z is the posterior probability of being in signal component +get.FDR <- function(e.z){ + + e.z.o <- order(1-e.z) + FDR <- cumsum(1-e.z[e.z.o])/c(1:length(e.z.o)) + + invisible(FDR) +} + +# get the FDR of selecting the top n peaks +# IF.est is the sample influence function +# top.n +get.IF.FDR <- function(IF.est, top.n){ + + nobs <- length(IF.est$emp.IF$FDR) + FDR <- c() + + # influence function of p + for(i in 1:nobs) + FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n] + + invisible(FDR) +} + +# get the sample influence function for FDR at a given FDR size +# 1. find the number of peaks selected at a given FDR computed from all obs +# 2. use the number to find the sample influence function for FDR +# IF.est$est.all is the FDR with all peaks +get.IF.FDR.all <- function(IF.est, FDR.size){ + + top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size)) + nobs <- length(IF.est$est.all$FDR) + FDR <- c() + + # influence function of p + for(i in 1:nobs) + FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n] + + invisible(list(FDR=FDR, top.n=top.n)) +} + +plot.simu.uri <- function(x, y){ + + tt <- seq(0.01, 0.99, by=0.01) + uri <- sapply(tt, comp.uri.prob, u=x, v=y) + uri.thin <- uri[seq(1, length(tt), by=3)] + tt.thin <- tt[seq(1, length(tt), by=3)] + duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)]) + uri.spl <- smooth.spline(tt, uri, df=6.4) + uri.der <- predict(uri.spl, tt, deriv=1) + + par(mfrow=c(2,2)) + plot(x[1:n0], y[1:n0]) + points(x[(n0+1):n], y[(n0+1):n], col=2) + plot(rank(-x)[1:n0], rank(-y)[1:n0]) + points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n]) + plot(tt, uri) + lines(c(0,1), c(0,1), lty=2) + title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep="")) + plot(tt.thin[-1], duri) + lines(uri.der) + abline(h=1) + invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der)) + +} + + +###### new fitting procedure + + + + +# 1. rank pairs + +# 2. initialization +# 3. convert to pseudo-number + +# 4. EM + +# need plugin and test +# find the middle point between the bins +get.pseudo.mix <- function(x, mu, sigma, rho, p){ + + + # first compute cdf for points on the grid + # generate 200 points between [-3, mu+3*sigma] + nw <- 1000 + w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw) + w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1) + + i <- 1 + + quan.x <- rep(NA, length(x)) + + for(i in c(1:nw)){ + index <- which(x >= w.cdf[i] & x < w.cdf[i+1]) + quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i] + } + + index <- which(x < w.cdf[1]) + if(length(index)>0) + quan.x[index] <- w[1] + + index <- which(x > w.cdf[nw]) + if(length(index)>0) + quan.x[index] <- w[nw] + +# linear.ext <- function(x, w, w.cdf){ + # linear interpolation +# index.up <- which(w.cdf>= x)[1] +# left.index <- which(w.cdf <=x) +# index.down <- left.index[length(left.index)] +# quan.x <- (w[index.up] + w[index.down])/2 +# } + +# x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf) + +# invisible(x.pseudo) + invisible(quan.x) +} + + +# EM to compute the latent structure +# steps: +# 1. raw values are first transformed into pseudovalues +# 2. EM is used to compute the underlining structure, which is a mixture +# of two normals +em.transform <- function(x, y, mu, sigma, rho, p, eps){ + + x.cdf.func <- ecdf(x) + y.cdf.func <- ecdf(y) + afactor <- length(x)/(length(x)+1) + x.cdf <- x.cdf.func(x)*afactor + y.cdf <- y.cdf.func(y)*afactor + + # initialization + para <- list() + para$mu <- mu + para$sigma <- sigma + para$rho <- rho + para$p <- p + + j <- 1 + to.run <- T + loglik.trace <- c() + loglik.inner.trace <- c() + + #to.run.inner <- T + z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) + z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) + +# cat("length(z1)", length(z.1), "\n") + while(to.run){ + + # get pseudo value in each cycle +# z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) +# z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) + + i <- 1 + while(to.run){ + + # EM for latent structure + e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) + para <- m.step.2normal(z.1, z.2, e.z) +#para$rho <- rho +#para$p <- p +#para$mu <- mu +#para$sigma <- sigma + if(i > 1) + l.old <- l.new + + # this is just the mixture likelihood of two-component Gaussian + l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) + + loglik.inner.trace[i] <- l.new + + if(i > 1){ + to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps + } + + +# if(i > 2){ +# l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2])) + +# if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2]) +# to.run <- abs(l.inf - loglik.inner.trace[i]) > eps +# else +# to.run <- F + +# } + + cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n") + cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n") + + i <- i+1 + } + + + # get pseudo value in each cycle + z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) + z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) + + if(j > 1) + l.old.outer <- l.new.outer + + l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) + + loglik.trace[j] <- l.new.outer + + if(j == 1) + to.run <- T + else{ # stop when iteration>100 + if(j > 100) + to.run <- F + else + to.run <- l.new.outer - l.old.outer > eps + } + +# if(j %% 10==0) + cat("loglik.trace[", j, "]=", loglik.trace[j], "\n") + cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n") + + j <- j+1 + } + + bic <- -2*l.new + 4*log(length(z.1)) + + return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma), + loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace)) +} + + + + +# compute log-likelihood for mixture of two bivariate normals +loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){ + + l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p))) + +# l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0))) + return(l.m) +} + +# check this when rho=1 + +# density of binomial distribution with equal mean and sigma on both dimensions +d.binormal <- function(z.1, z.2, mu, sigma, rho){ + + loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2)) + + return(loglik) +} + +# E-step for computing the latent strucutre +# e.z is the prob to be in the consistent group +# e.step for estimating posterior prob +# z.1 and z.2 can be vectors or scalars +e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){ + + e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p) + + invisible(e.z) +} + +# M-step for computing the latent structure +# m.step for estimating proportion, mean, sd and correlation coefficient +m.step.2normal <- function(z.1, z.2, e.z){ + + p <- mean(e.z) + mu <- sum((z.1+z.2)*e.z)/2/sum(e.z) + sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z)) + rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))) + + return(list(p=p, mu=mu, sigma=sigma, rho=rho)) +} + + +# assume top p percent of observations are true +# x and y are ranks, estimate +init <- function(x, y, x.label){ + + x.o <- order(x) + + x.ordered <- x[x.o] + y.ordered <- y[x.o] + x.label.ordered <- x.label[x.o] + + n <- length(x) + p <- sum(x.label)/n + + rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)]) + + temp <- find.mu.sigma(x.ordered, x.label.ordered) + mu <- temp$mu + sigma <- temp$sigma + + invisible(list(mu=mu, sigma=sigma, rho=rho, p=p)) + +} + +# find mu and sigma if the distributions of marginal ranks are known +# take the medians of the two dist and map back to the original +init.dist <- function(f0, f1){ + + # take the median in f0 + index.median.0 <- which(f0$cdf>0.5)[1] + q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins + q.1.small <- f1$cdf[index.median.0] + + # take the median in f1 + index.median.1 <- which(f1$cdf>0.5)[1] + q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins + q.1.big <- f1$cdf[index.median.1] + + # find pseudo value for x.middle[1] on normal(0,1) + pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1) + pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1) + + # find pseudo value for x.middle[2] on normal(0,1) + pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1) + pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1) + + mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) + + sigma <- (pseudo.small.0-mu)/pseudo.small.1 + + return(list(mu=mu, sigma=sigma)) +} + +# generate labels + +# find the part of data with overlap + +# find the percentile on noise and signal + +# Suppose there are signal and noise components, with mean=0 and sd=1 for noise +# x and x.label are the rank of the observations and their labels, +# find the mean and sd of the other component +# x.label takes values of 0 and 1 +find.mu.sigma <- function(x, x.label){ + + x.0 <- x[x.label==0] + x.1 <- x[x.label==1] + + n.x0 <- length(x.0) + n.x1 <- length(x.1) + + x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1)) + o <- order(x.end) + x.middle <- x.end[o][c(2,3)] + + # the smaller end of the overlap + q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1) + q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1) + + # the bigger end of the overlap + q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1) + q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1) + + # find pseudo value for x.middle[1] on normal(0,1) + pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1) + pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1) + + # find pseudo value for x.middle[2] on normal(0,1) + pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1) + pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1) + + mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) + + sigma <- (pseudo.small.0-mu)/pseudo.small.1 + + return(list(mu=mu, sigma=sigma)) +}
