# revised on 2-20-10
# - fix error in pass.structure: reverse rank.combined, so that large sig.values
#   are ranked with small numbers (1, 2, ...)
# - fix error in get.ez.tt.all: get ez.cutoff from sorted e.z

#
# modified EM procedure to compute the empirical CDF more precisely - 09/2009



# this file contains the functions for
# 1. computing the correspondence profile (upper rank intersection and derivatives)
# 2. inference of the copula mixture model
#
# It also has functions for
# 1. reading peak caller results
# 2. processing and matching called peaks
# 3. plotting results

################ read peak caller results

# process narrowPeak format
# some peak callers may not report q-values, p-values or fold enrichment;
# these need further processing before comparison
#
# stop.exclusive: Is the basepair of peak.list$stop exclusive? In narrowPeak and broadPeak format it is exclusive.
# If it is exclusive, we need to subtract 1 from peak.list$stop to avoid the same basepair being both the start and the stop of two
# adjacent peaks, which creates trouble for finding the correct intersection
process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){

  aa <- read.table(narrow.file)

  if(broadpeak){
    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9)
  }else{
    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10)
  }

  if(summit=="summit"){
    bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to an offset to avoid errors when concatenating chromosomes
  }

  bb <- concatenate.chr(bb.ori, chr.size)

  #bb <- bb.ori

  # remove the peaks that have the same start and stop value
  bb <- bb[bb$start != bb$stop,]

  if(stop.exclusive==T){
    bb$stop <- bb$stop-1
  }

  if(!is.null(half.width)){
    bb$start.ori <- bb$start
    bb$stop.ori <- bb$stop

    # if a peak is narrower than the specified window, keep its width
    # otherwise chop wider peaks to the specified width
    width <- bb$stop-bb$start+1
    is.wider <- width > 2*half.width

    if(summit=="offset" | summit=="summit"){ # if summit is an offset from the start
      bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width
      bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width
    } else {
      if(summit=="unknown"){
        bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width
        bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width
      }
    }
  }

  bb <- clean.data(bb)
  invisible(list(data.ori=bb.ori, data.cleaned=bb))
}
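
# A minimal usage sketch (hypothetical file name and a toy chr.size table;
# not part of the original pipeline):
#   chr.size <- data.frame(V1=c("chr1", "chr2"), V2=c(249250621, 243199373))
#   rep1 <- process.narrowpeak("rep1.narrowPeak", chr.size, half.width=NULL,
#                              summit="offset", stop.exclusive=T, broadpeak=F)
#   head(rep1$data.cleaned)   # peaks on the concatenated coordinate system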

# clean data
# and concatenate chromosomes if needed
clean.data <- function(adata){

  # remove the peaks that have the same start and stop value
  adata <- adata[adata$start != adata$stop,]

  # if some stops and starts are the same, we need to fix them
  stop.in.start <- is.element(adata$stop, adata$start)
  n.fix <- sum(stop.in.start)
  if(n.fix > 0){
    print(paste("Fix", n.fix, "stops"))
    adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1
  }

  return(adata)
}

# concatenate peaks
# peaks: the data frame holding all the peaks
# chr.file: the file keeping the length of each chromosome
# chr files should come from the species that the data is from
#concatenate.chr <- function(peaks, chr.size){
#
#  chr.size <- read.table(chr.file)
#  chr.o <- order(chr.size[,1])
#  chr.size <- chr.size[chr.o,]
#
#  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
#  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)
#
#  for(i in 1:nrow(chr.size)){
#    is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
#    if(sum(is.in)>0){
#      peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
#      peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
#    }
#  }
#
#  invisible(peaks)
#}




# concatenate peaks
# peaks: the data frame holding all the peaks
# chr.size: a table of chromosome names and lengths (e.g. read from a chromosome size file)
# the chromosome sizes should come from the species that the data is from
concatenate.chr <- function(peaks, chr.size){

  # chr.size <- read.table(chr.file)
  chr.o <- order(chr.size[,1])
  chr.size <- chr.size[chr.o,]

  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)

  peaks$start.ori <- peaks$start
  peaks$stop.ori <- peaks$stop

  for(i in 1:nrow(chr.size)){
    is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
    if(sum(is.in)>0){
      peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
      peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
    }
  }

  invisible(peaks)
}


deconcatenate.chr <- function(peaks, chr.size){

  chr.o <- order(chr.size[,1])
  chr.size <- chr.size[chr.o,]

  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)

  peaks$chr <- rep(NA, nrow(peaks))

  for(i in 1:(nrow(chr.size.cum)-1)){
    is.in <- peaks$start > chr.size.cum[i,2] & peaks$start <= chr.size.cum[i+1, 2]
    if(sum(is.in)>0){
      peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
      peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1
      peaks[is.in,]$chr <- chr.size[i,1]
    }
  }

  # handle the last chromosome: peaks located beyond the last shift
  i <- nrow(chr.size.cum)
  is.in <- peaks$start > chr.size.cum[i, 2]
  if(sum(is.in)>0){
    peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
    peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1
    peaks[is.in,]$chr <- chr.size[i,1]
  }

  invisible(peaks)
}
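
# A small round-trip sketch (toy chromosome table with the name/length column
# layout assumed by concatenate.chr; values are made up):
#   chr.size <- data.frame(V1=c("chr1", "chr2"), V2=c(1000, 800))
#   peaks <- data.frame(chr=c("chr1", "chr2"), start=c(100, 50), stop=c(200, 120))
#   cat.peaks <- concatenate.chr(peaks, chr.size)        # chr2 peak shifted by 1000
#   decat.peaks <- deconcatenate.chr(cat.peaks, chr.size) # back to per-chromosome coordinates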

################ preprocessing peak calling output


#
# read two peak calling results, sort them by peak start location,
# then find the overlap between peaks
# INPUT:
#   rep1: the 1st replicate
#   rep2: the 2nd replicate
# OUTPUT:
#   id1, id2: the labels for the identified peaks on the replicates;
#   peaks sharing the same label overlap each other
find.overlap <- function(rep1, rep2){

  o1 <- order(rep1$start)
  rep1 <- rep1[o1,]

  o2 <- order(rep2$start)
  rep2 <- rep2[o2,]

  n1 <- length(o1)
  n2 <- length(o2)

  # assign common IDs to peaks
  id1 <- rep(0, n1) # ID assigned on rep1
  id2 <- rep(0, n2) # ID assigned on rep2
  id <- 1 # keep track of the common IDs

  # check if the two replicates overlap with each other
  i <- 1
  j <- 1

  while(i <= n1 || j <= n2){

    # && (id1[n1] ==0 || id2[n2] ==0)

    # if one list runs out
    if(i > n1 && j < n2){

      j <- j+1
      id2[j] <- id
      id <- id +1
      next
    } else{
      if(j > n2 && i < n1){
        i <- i+1
        id1[i] <- id
        id <- id +1
        next
      } else {
        if(i >= n1 && j >= n2)
          break
      }
    }

    # if the two current peaks do not overlap

    if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){

      # at the start of the loop, when neither peak has been assigned an ID,
      # the one located in front is assigned first
      if(id1[i] ==0 && id2[j]==0){
        if(rep1$stop[i] < rep2$stop[j]){
          id1[i] <- id
        } else {
          id2[j] <- id
        }
      } else { # in the middle of the loop, when one is already assigned
        # the one that has not been assigned gets assigned
        # if(id1[i] ==0){ # id1[i] is not assigned
        #   id1[i] <- id
        # } else { # id2[j] is not assigned
        #   id2[j] <- id
        # }

        # order the IDs according to location
        if(rep1$stop[i] <= rep2$stop[j]){
          id1[i] <- max(id2[j], id1[i])
          id2[j] <- id
        } else {
          if(rep1$stop[i] > rep2$stop[j]){
            id2[j] <- max(id1[i], id2[j])
            id1[i] <- id
          }
        }

      }

      id <- id +1

    } else { # if they overlap

      if(id1[i] == 0 && id2[j] == 0){ # no label assigned yet
        id1[i] <- id
        id2[j] <- id
        id <- id +1
      } else { # one peak is already assigned a label, the other is 0

        id1[i] <- max(id1[i], id2[j]) # this copies the label of the assigned peak without knowing which one is already assigned
        id2[j] <- id1[i] # synchronize the labels
      }

    }

    if(rep1$stop[i] < rep2$stop[j]){
      i <- i+1
    } else {
      j <- j+1
    }

  }

  invisible(list(id1=id1, id2=id2))

}
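
# A toy illustration (made-up coordinates, not from the original code):
#   rep1 <- data.frame(start=c(100, 500), stop=c(200, 600))
#   rep2 <- data.frame(start=c(150, 800), stop=c(260, 900))
#   ov <- find.overlap(rep1, rep2)
#   # peaks sharing an ID in ov$id1/ov$id2 overlap; non-overlapping peaks get their own IDs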

# Impute the missing significance value for peaks called on only one replicate.
# INPUT:
#   rep1, rep2: the two peak calling outputs
#   id1, id2: the IDs assigned by the function find.overlap, vectors
#     If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2
#   p.value.impute: the significance value to impute for the missing peaks
# OUTPUT:
#   rep1, rep2: peaks ordered by their start locations, with imputed peaks
#   id1, id2: the IDs with imputed peaks
fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){

  # rep1 <- data.frame(chr=rep1$chr, start=rep1$start, stop=rep1$stop, sig.value=rep1$sig.value)
  # rep2 <- data.frame(chr=rep2$chr, start=rep2$start, stop=rep2$stop, sig.value=rep2$sig.value)

  o1 <- order(rep1$start)
  rep1 <- rep1[o1,]

  o2 <- order(rep2$start)
  rep2 <- rep2[o2,]

  entry.in1.not2 <- !is.element(id1, id2)
  entry.in2.not1 <- !is.element(id2, id1)

  if(sum(entry.in1.not2) > 0){

    temp1 <- rep1[entry.in1.not2, ]

    # impute sig.value
    temp1$sig.value <- p.value.impute
    temp1$signal.value <- p.value.impute
    temp1$p.value <- p.value.impute
    temp1$q.value <- p.value.impute

    rep2.filled <- rbind(rep2, temp1)
    id2.filled <- c(id2, id1[entry.in1.not2])
  } else {
    id2.filled <- id2
    rep2.filled <- rep2
  }

  if(sum(entry.in2.not1) > 0){

    temp2 <- rep2[entry.in2.not1, ]

    # impute sig.value
    temp2$sig.value <- p.value.impute
    temp2$signal.value <- p.value.impute
    temp2$p.value <- p.value.impute
    temp2$q.value <- p.value.impute


    # append to the end
    rep1.filled <- rbind(rep1, temp2)

    id1.filled <- c(id1, id2[entry.in2.not1])
  } else {
    id1.filled <- id1
    rep1.filled <- rep1
  }

  # sort rep1 and rep2 by the same id
  o1 <- order(id1.filled)
  rep1.ordered <- rep1.filled[o1, ]

  o2 <- order(id2.filled)
  rep2.ordered <- rep2.filled[o2, ]

  invisible(list(rep1=rep1.ordered, rep2=rep2.ordered,
                 id1=id1.filled[o1], id2=id2.filled[o2]))
}

# Merge peaks with the same ID on the same replicate
# (they are generated when two peaks on rep1 map to the same peak on rep2)
# peak.list needs the columns: start, stop and sig.value
merge.peaks <- function(peak.list, id){

  i <- 1
  j <- 1
  dup.index <- c()
  sig.value <- c()
  start.new <- c()
  stop.new <- c()
  id.new <- c()

  # original data
  chr <- c()
  start.ori <- c()
  stop.ori <- c()

  signal.value <- c()
  p.value <- c()
  q.value <- c()

  while(i < length(id)){

    if(id[i] == id[i+1]){
      dup.index <- c(dup.index, i, i+1) # push on dup.index
    } else {
      if(length(dup.index)>0){ # pop from dup.index
        sig.value[j] <- mean(peak.list$sig.value[unique(dup.index)]) # mean of the significance values
        start.new[j] <- peak.list$start[min(dup.index)]
        stop.new[j] <- peak.list$stop[max(dup.index)]
        id.new[j] <- id[max(dup.index)]

        signal.value[j] <- mean(peak.list$signal.value[unique(dup.index)])
        p.value[j] <- mean(peak.list$p.value[unique(dup.index)])
        q.value[j] <- mean(peak.list$q.value[unique(dup.index)])

        chr[j] <- as.character(peak.list$chr[min(dup.index)])
        start.ori[j] <- peak.list$start.ori[min(dup.index)]
        stop.ori[j] <- peak.list$stop.ori[max(dup.index)]

        dup.index <- c()
      } else { # nothing to pop
        sig.value[j] <- peak.list$sig.value[i]
        start.new[j] <- peak.list$start[i]
        stop.new[j] <- peak.list$stop[i]
        id.new[j] <- id[i]

        signal.value[j] <- peak.list$signal.value[i]
        p.value[j] <- peak.list$p.value[i]
        q.value[j] <- peak.list$q.value[i]

        chr[j] <- as.character(peak.list$chr[i])
        start.ori[j] <- peak.list$start.ori[i]
        stop.ori[j] <- peak.list$stop.ori[i]

      }
      j <- j+1
    }
    i <- i+1
  }

  data.new <- data.frame(id=id.new, sig.value=sig.value, start=start.new, stop=stop.new, signal.value=signal.value, p.value=p.value, q.value=q.value, chr=chr, start.ori=start.ori, stop.ori=stop.ori)
  invisible(data.new)
}




# a wrapper function to fill in missing peaks, merge peaks and impute significance values
# out1 and out2 are two peak calling outputs
pair.peaks <- function(out1, out2, p.value.impute=0){

  aa <- find.overlap(out1, out2)
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)

  cc1 <- merge.peaks(bb$rep1, bb$id1)
  cc2 <- merge.peaks(bb$rep2, bb$id2)

  invisible(list(merge1=cc1, merge2=cc2))
}



# overlap.ratio is a parameter defining the required proportion of overlap
# if overlap.ratio = 0, a 1-basepair overlap is counted as an overlap
# if overlap.ratio is between 0 and 1, it is the minimum proportion of
# overlap required for two peaks to be called a match
# it is computed as the overlapping part/min(peak1.length, peak2.length)
pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){

  aa <- find.overlap(out1, out2)
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)

  cc1 <- merge.peaks(bb$rep1, bb$id1)
  cc2 <- merge.peaks(bb$rep2, bb$id2)

  frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop)

  frag.ratio <- apply(frag12, 1, overlap.middle)

  frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0

  cc1$frag.ratio <- frag.ratio
  cc2$frag.ratio <- frag.ratio

  merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,]
  merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,]

  invisible(list(merge1=merge1, merge2=merge2))
}

# x[1], x[2] are the start and stop of the first fragment
# and x[3], x[4] are the start and stop of the 2nd fragment
# For two overlapping fragments, the overlap can be found by ordering the
# four endpoints and taking the difference between the middle two
overlap.middle <- function(x){

  x.o <- x[order(x)]
  f1 <- x[2]-x[1]
  f2 <- x[4]-x[3]

  f.overlap <- abs(x.o[3]-x.o[2])
  f.overlap.ratio <- f.overlap/min(f1, f2)

  return(f.overlap.ratio)
}
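
# A quick worked example (made-up coordinates): fragment 1 spans 100-200,
# fragment 2 spans 150-300; the middle two of the sorted endpoints are 150 and 200,
# so the overlap is 50 bp and the ratio is 50/min(100, 150) = 0.5:
#   overlap.middle(c(100, 200, 150, 300))   # 0.5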



#######
####### compute correspondence profile
#######

# compute the upper rank intersection for one t
# tv: c(t, v), the pair of upper percentages
# x is sorted by the order of the paired variable
comp.uri <- function(tv, x){
  n <- length(x)
  qt <- quantile(x, prob=1-tv[1]) # tv[1] is t
  # sum(x[1:ceiling(n*tv[2])] >= qt)/n/tv[2]- tv[1]*tv[2] #tv[2] is v
  sum(x[1:ceiling(n*tv[2])] >= qt)/n

}
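
# A small worked example (toy vector, assuming x is already sorted by the
# partner variable as in get.uri.2d): with t = v = 0.5, the top half of x
# (first 5 entries) all exceed the 50% quantile (5.5), so the URI is 5/10 = 0.5:
#   comp.uri(c(0.5, 0.5), c(10, 9, 7, 8, 6, 5, 4, 3, 2, 1))   # 0.5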

# compute the correspondence profile
# tt, vv: vectors of percentages in (0, 1)
get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){

  o <- order(x1, x2, decreasing=T)

  # sort x2 by the order of x1
  x2.ordered <- x2[o]

  tv <- cbind(tt, vv)
  ntotal <- length(x1) # number of peaks

  uri <- apply(tv, 1, comp.uri, x=x2.ordered)

  # compute the derivative of URI vs t using small bins
  uri.binned <- uri[seq(1, length(uri), by=4)]
  tt.binned <- tt[seq(1, length(uri), by=4)]
  uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)])

  # smooth uri using a spline
  # first find where the jump is and don't fit the jump
  # this is the index on the left
  # jump.left.old <- which.max(uri[-1]-uri[-length(uri)])
  short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2))

  if(short.list.length < max(tt)){
    jump.left <- which(tt>short.list.length)[1]-1
  } else {
    jump.left <- which.max(tt)
  }

  # reversed.index <- seq(length(tt), 1, by=-1)
  # nequal <- sum(uri[reversed.index]== tt[reversed.index])
  # temp <- which(uri[reversed.index]== tt[reversed.index])[nequal]
  # jump.left <- length(tt)-temp

  if(jump.left < 6){
    jump.left <- length(tt)
  }


  if(is.null(spline.df))
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4)
  else{
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df)
  }
  # predict the first derivative
  uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1)

  invisible(list(tv=tv, uri=uri,
                 uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)],
                 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left,
                 ntotal=ntotal))
}
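
# A usage sketch on simulated scores (purely illustrative; the real input is
# matched peak significance values):
#   set.seed(1)
#   x1 <- rnorm(1000)
#   x2 <- 0.8*x1 + sqrt(1-0.8^2)*rnorm(1000)   # correlated partner scores
#   tt <- seq(0.01, 1, by=0.01)
#   uri.out <- get.uri.2d(x1, x2, tt, tt, spline.df=6.4)
#   plot(uri.out$tv[,1], uri.out$uri)   # correspondence profile: URI vs t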


# change the scale of uri from t (percentage) to n (number of peaks or basepairs)
# this is for plotting multiple pairwise URIs on the same plot
scale.t2n <- function(uri){

  ntotal <- uri$ntotal
  tv <- uri$tv*uri$ntotal
  uri.uri <- uri$uri*uri$ntotal
  jump.left <- uri$jump.left
  uri.spl <- uri$uri.spl
  uri.spl$x <- uri$uri.spl$x*uri$ntotal
  uri.spl$y <- uri$uri.spl$y*uri$ntotal

  t.binned <- uri$t.binned*uri$ntotal
  uri.slope <- uri$uri.slope
  uri.der <- uri$uri.der
  uri.der$x <- uri$uri.der$x*uri$ntotal
  uri.der$y <- uri$uri.der$y

  uri.n <- list(tv=tv, uri=uri.uri, t.binned=t.binned, uri.slope=uri.slope, uri.spl=uri.spl, uri.der=uri.der, ntotal=ntotal, jump.left=jump.left)
  return(uri.n)
}




# a wrapper for running URI on peaks from peak calling results
# both data.1 and data.2 are calling results in narrowPeak format
compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){

  tt <- seq(0.01, 1, by=0.01)
  vv <- tt

  if(sig.value1=="signal.value"){
    data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$signal.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
  } else {
    if(sig.value1=="p.value"){
      data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$p.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
    } else {
      if(sig.value1=="q.value"){
        data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$q.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
      }
    }
  }

  if(sig.value2=="signal.value"){
    data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$signal.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
  } else {
    if(sig.value2=="p.value"){
      data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$p.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
    } else {
      if(sig.value2=="q.value"){
        data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$q.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
      }
    }
  }

  ### by peaks
  # data12.enrich <- pair.peaks(data.1.enrich, data.2.enrich)
  data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio)
  uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df)
  uri.n <- scale.t2n(uri)

  return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2))


}
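
# A usage sketch chaining the reading and URI steps (hypothetical file names):
#   chr.size <- read.table("genome_table.txt")
#   rep1 <- process.narrowpeak("rep1.narrowPeak", chr.size)$data.cleaned
#   rep2 <- process.narrowpeak("rep2.narrowPeak", chr.size)$data.cleaned
#   uri12 <- compute.pair.uri(rep1, rep2, sig.value1="signal.value",
#                             sig.value2="signal.value", overlap.ratio=0)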



# compute uri for a matched sample
get.uri.matched <- function(data12, df=10){

  tt <- seq(0.01, 1, by=0.01)
  vv <- tt
  uri <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, tt, vv, spline.df=df)

  # change scale from t to n
  uri.n <- scale.t2n(uri)

  return(list(uri=uri, uri.n=uri.n))

}

# map.uv is a pair of significance values corresponding to a specified consistency FDR
# assuming the values in map.uv and the nominal values are linearly related
# data.set is the original data set
# sig.value is the name of the significance value in map.uv, e.g. enrichment
# nominal.value is the one we want to map to, e.g. q-value
#
map.sig.value <- function(data.set, map.uv, nominal.value){

  index.nominal <- which(names(data.set$merge1)==nominal.value)
  nentry <- nrow(map.uv)
  map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")])

  for(i in 1:nentry){

    map.nominal[i, "sig.value1"] <- data.set$merge1[unique(which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))), index.nominal]
    map.nominal[i, "sig.value2"] <- data.set$merge2[unique(which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))), index.nominal]
  }

  invisible(map.nominal)
}


############### plot correspondence profile

# plot multiple comparisons wrt one template
# uri.n.list contains the URIs and the total number of peaks for each comparison
# plot.missing=F: do not plot the missing points on the right
plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){

  if(is.null(col.txt))
    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")

  n <- length(uri.n.list)

  ntotal <- c()
  for(i in 1:n)
    ntotal[i] <- uri.n.list[[i]]$ntotal

  jump.left <- c()
  jump.left.der <- c()
  ncommon <- c()
  for(i in 1:n){
    # jump.left[i] <- which.max(uri.n.list[[i]]$uri[-1]-uri.n.list[[i]]$uri[-length(uri.n.list[[i]]$uri)])
    # if(jump.left[i] < 6)
    #   jump.left[i] <- length(uri.n.list[[i]]$uri)

    ## reversed.index <- seq(length(uri.n.list[[i]]$tv[,1]), 1, by=-1)
    ## nequal <- sum(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])
    ## temp <- which(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])[nequal]
    ## jump.left[i] <- length(uri.n.list[[i]]$tv[,1])-temp
    ## print(uri.n.list[[i]]$uri)
    ## print(uri.n.list[[i]]$tv[,1])
    ## jump.left[i] <- uri.n.list[[i]]$jump.left

    # jump.left.der[i] <- sum(uri.n.list[[i]]$t.binned < uri.n.list[[i]]$uri.der$x[length(uri.n.list[[i]]$uri.der$x)])

    jump.left[i] <- uri.n.list[[i]]$jump.left
    jump.left.der[i] <- jump.left[i]
    ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1]
  }


  if(plot.missing){
    max.peak <- max(ntotal)
  } else {
    max.peak <- max(ncommon)*1.05
  }

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "uri.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }

  plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2)

  for(i in 1:n){

    if(plot.missing){
      points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5)
    } else {
      points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5)
    }
    lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4)
  }
  abline(coef=c(0,1), lty=3)
  legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
  if(!is.null(title.txt))
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "duri.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }
  plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2)

  for(i in 1:n){
    # if(plot.missing){
    #   points(uri.n.list[[i]]$t.binned, uri.n.list[[i]]$uri.slope, col=col.txt[i+col.start], cex=0.5)
    # } else {
    #   points(uri.n.list[[i]]$t.binned[1:jump.left.der[i]], uri.n.list[[i]]$uri.slope[1:jump.left.der[i]], col=col.txt[i+col.start], cex=0.5)
    # }
    lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4)
  }
  abline(h=1, lty=3)
  legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)

  if(!is.null(title.txt))
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

}



#######################
####################### copula fitting for matched peaks
#######################

# estimation for the mixed copula model

# 4-5-09
# A nonparametric estimation of the mixed copula model


# updated

# c1, c2, f1, f2, g1, g2 are vectors
# c1*f1*g1 and c2*f2*g2 are the copula densities of the two components
# xd1 and yd1 are the values of the marginals for the first component
# xd2 and yd2 are the values of the marginals for the 2nd component
#
# e.z is the probability of being in the consistent group
get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){

  return(p*c1*xd1*yd1/(p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2))
}

# checked

# this is C_12, not the full copula density c = C_12 * f1 * f2
# since nonparametric estimation is used here for f1 and f2, which
# are constant throughout the iterations, we don't need them for optimization
#
# bivariate Gaussian copula density
# t and s are vectors of the same length, both are percentiles
# returns a vector
gaussian.cop.den <- function(t, s, rho){

  A <- qnorm(t)^2 + qnorm(s)^2
  B <- qnorm(t)*qnorm(s)

  loglik <- -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A-2*B)

  return(exp(loglik))
}
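
# A quick sanity check (follows directly from the formula above): at rho = 0
# the log-density reduces to 0, so the copula density is 1 everywhere, i.e. independence:
#   gaussian.cop.den(0.3, 0.8, 0)    # 1
#   gaussian.cop.den(0.9, 0.9, 0.7)  # > 1: a concordant pair under positive dependence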

clayton.cop.den <- function(t, s, rho){

  if(rho > 0)
    return(exp(log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1)))

  if(rho==0)
    return(1)

  if(rho<0)
    stop("Incorrect Clayton copula coefficient")

}


# checked
# estimate rho for the Gaussian copula by maximum likelihood
mle.gaussian.copula <- function(t, s, e.z){

  # keep rho bounded away from +-1
  l.c <- function(rho, t, s, e.z){
    # cat("rho=", rho, "\n")
    sum(e.z*log(gaussian.cop.den(t, s, rho)))}

  rho.max <- optimize(f=l.c, c(-0.998, 0.998), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)

  #print(rho.max$m)

  #cat("cor=", cor(qnorm(t)*e.z, qnorm(s)*e.z), "\t", "rho.max=", rho.max$m, "\n")
  # return(sign(rho.max$m)/(1+rho.max$m))
  return(rho.max$m)
}
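
# A small simulation sketch (illustrative only): draw ranks from a bivariate
# normal with correlation 0.7 and check that the weighted MLE recovers it when
# all weights are 1:
#   set.seed(2)
#   z1 <- rnorm(2000); z2 <- 0.7*z1 + sqrt(1-0.7^2)*rnorm(2000)
#   t <- rank(z1)/(length(z1)+1); s <- rank(z2)/(length(z2)+1)
#   mle.gaussian.copula(t, s, rep(1, length(t)))   # close to 0.7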


# estimate the mle for the Clayton copula
mle.clayton.copula <- function(t, s, e.z){

  l.c <- function(rho, t, s, e.z){
    lc <- sum(e.z*log(clayton.cop.den(t, s, rho)))
    # cat("rho=", rho, "\t", "l.c=", lc, "\n")
    return(lc)
  }

  rho.max <- optimize(f=l.c, c(0.1, 20), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)

  return(rho.max$m)
}



# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank transformed
loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}

loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){

  # get the pdf and cdf of each component from the estimated marginals
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }
  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}


# estimate the marginals of each component using a histogram estimator in the EM
# return the density, breaks, and cdf of the histogram estimator
est.mar.hist <- function(x, e.z, breaks){

  binwidth <- c()
  nbin <- length(breaks)-1
  nx <- length(x)

  # the histogram
  x1.pdf <- c()
  x2.pdf <- c()
  x1.cdf <- c()
  x2.cdf <- c()

  # the pdf for each point
  x1.pdf.value <- rep(NA, nx)
  x2.pdf.value <- rep(NA, nx)

  x1.cdf.value <- rep(NA, nx)
  x2.cdf.value <- rep(NA, nx)

  for(i in 1:nbin){

    binwidth[i] <- breaks[i+1] - breaks[i]
    if(i < nbin)
      in.bin <- x >= breaks[i] & x < breaks[i+1]
    else # last bin
      in.bin <- x >= breaks[i] & x <= breaks[i+1]

    # add one observation to each bin to avoid empty bins
    # multiply by (nx+nbin)/(nx+nbin+1) to avoid blowup when looking up
    # quantiles
    x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)
    x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)


    # x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1)
    # x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1)

    # treat each bin as a value for a discrete variable
    # x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i])
    # x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i])


    # cumulative density before reaching bin i
    if(i>1){
      x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)])
      x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)])
    } else{
      x1.cdf[i] <- 0
      x2.cdf[i] <- 0
    }

    # store the values of the pdf and cdf for each x in vectors of length nx
    # this speeds up the computation dramatically
    x1.pdf.value[in.bin] <- x1.pdf[i]
    x2.pdf.value[in.bin] <- x2.pdf[i]

    x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i])
    x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i])
  }

  # x1.cdf <- cumsum(x1.pdf*binwidth)
  # x2.cdf <- cumsum(x2.pdf*binwidth)

  f1 <- list(breaks=breaks, density=x1.pdf, cdf=x1.cdf)
  f2 <- list(breaks=breaks, density=x2.pdf, cdf=x2.cdf)

  f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value)
  f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value)

  return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value))
}
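
# A small sketch of how the weighted histogram marginals behave (illustrative
# values; e.z weights of 1 put all mass in the first component):
#   x <- rank(rnorm(200), ties.method="random")
#   breaks <- seq(min(x)-0.5, max(x)+0.5, length.out=21)
#   mar <- est.mar.hist(x, e.z=rep(1, length(x)), breaks)
#   sum(mar$f1$density * diff(breaks))   # close to 1 (smoothed histogram pdf)
#   head(mar$f1.value$cdf)               # per-observation cdf used by the copula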

# estimate the marginal cdf from ranks
est.cdf.rank <- function(x, conf.z){

  # add 1 to prevent blowup
  x1.cdf <- rank(x[conf.z==1])/(length(x[conf.z==1])+1)

  x2.cdf <- rank(x[conf.z==0])/(length(x[conf.z==0])+1)

  return(list(cdf1=x1.cdf, cdf2=x2.cdf))
}

# df is a density function with fields: density, cdf and breaks; x is a scalar
get.pdf <- function(x, df){

  if(x < df$breaks[1])
    cat("x is out of the range of df\n")

  index <- which(df$breaks >= x)[1]

  if(index==1)
    index <- index +1
  return(df$density[index-1])
}

# get the cdf from the histogram estimator for a single value
get.cdf <- function(x, df){

  index <- which(df$breaks >= x)[1]
  if(index==1)
    index <- index +1
  return(df$cdf[index-1])
}

# df is a density function with fields: density, cdf and breaks
get.pdf.cdf <- function(x.vec, df){

  x.pdf <- sapply(x.vec, get.pdf, df=df)
  x.cdf <- sapply(x.vec, get.cdf, df=df)
  return(list(cdf=x.cdf, pdf=x.pdf))
}

# E-step
# x and y are the original observations or ranks
# rho1 and rho2 are the parameters of each copula
# f1, f2, g1, g2 are functions, each is a histogram
e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){

  # get the pdf and cdf of each component from the corresponding marginals
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
}

# E-step
# rho1 and rho2 are the parameters of each copula
e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){

  # get the pdf and cdf of each component from the corresponding marginals
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }
  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
}




# M-step
m.step.2gaussian <- function(x, y, e.z, breaks){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)
  rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)

  p <- sum(e.z)/length(e.z)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
}

m.step.2copula <- function(x, y, e.z, breaks, copula.txt){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  } else {
    if(copula.txt=="clayton"){
      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)
      rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z)
    }
  }

  p <- sum(e.z)/length(e.z)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
}



# E-step: pass values
# x and y are the original observations or ranks
# rho1 and rho2 are the parameters of each copula
# f1, f2, g1, g2 are functions, each is a histogram
e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){

  c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
  c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)

  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf,
                pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
  return(e.z)
}


e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){

  if(copula.txt =="gaussian"){
    c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
    c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
  } else {
    if(copula.txt =="clayton"){
      c1 <- clayton.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
      c2 <- clayton.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
    }
  }

  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf,
                pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
  return(e.z)
}


# M-step: pass values
m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

  if(!fix.rho2)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  else
    rho2 <- 0

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}

m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){

  # compute f1, f2, g1 and g2
  # x.mar <- est.mar.hist(x, e.z, breaks)
  # y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

  if(!fix.rho2)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  else
    rho2 <- 0

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}



m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  if(copula.txt=="gaussian"){
    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

    if(!fix.rho2)
      rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
    else
      rho2 <- 0
  } else {

    if(copula.txt=="clayton"){
      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)

      if(!fix.rho2)
        rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z)
      else
        rho2 <- 0
    }
  }

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}




# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank transformed
loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){

  px.1 <- pdf.cdf$px.1
  px.2 <- pdf.cdf$px.2
  py.1 <- pdf.cdf$py.1
  py.2 <- pdf.cdf$py.2

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}



# updated
# mixture likelihood of two copulas (Gaussian or Clayton)
# nonparametric and rank transformed
loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){

  px.1 <- pdf.cdf$px.1
  px.2 <- pdf.cdf$px.2
  py.1 <- pdf.cdf$py.1
  py.2 <- pdf.cdf$py.2

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}



# EM for 2 Gaussian copulas, sped-up computation (unfinished)

em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){

  x <- rank(x, tie="random")
  y <- rank(y, tie="random")

  # x <- rank(x, tie="average")
  # y <- rank(y, tie="average")

  # nbin=20
  xy.min <- min(x, y)
  xy.max <- max(x, y)
  binwidth <- (xy.max-xy.min)/50
  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50)
  # breaks <- seq(xy.min, xy.max, by=binwidth)


  # initiate marginals
  # initialization: the first p0 fraction of the data starts in the consistent component
  # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes the two components overlap

  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))

  if(!stoc)
    para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
  else
    para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)


  if(fix.p){
    p <- p0
  } else {
    p <- para$p
  }

  if(fix.rho2){
    rho2 <- rho2.0
  } else {
    rho2 <- para$rho2
  }

  # rho1 <- 0.8
  rho1 <- para$rho1

  l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)

  loglik.trace <- c()
  loglik.trace[1] <- l0
  # loglik.trace[2] <- l1
  to.run <- T

  i <- 2

  # these two lines to remove
  # x.mar <- est.mar.hist(x, e.z, breaks)
  # y.mar <- est.mar.hist(y, e.z, breaks)

  while(to.run){

    e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf)
    if(!stoc)
      para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
    else
      para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)

    # fix x.mar and y.mar : to remove
    # if(!stoc)
    #   para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar)
    # else
    #   para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)


    if(fix.p){
      p <- p0
    } else {
      p <- para$p
    }

    if(fix.rho2){
      rho2 <- rho2.0
    } else {
      rho2 <- para$rho2
    }

    # rho1 <- 0.8
    rho1 <- para$rho1

    # l0 <- l1
    l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
    loglik.trace[i] <- l1

    #cat("l1=", l1, "\n")

    # Aitken acceleration criterion: stop when the projected limit of the
    # log-likelihood is within eps of the current value
    if(i > 2){
      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
      to.run <- abs(l.inf - loglik.trace[i]) > eps
      #cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n")
      #cat("l.inf=", l.inf, "\n")
      #cat(l.inf-loglik.trace[i], "\n")
    }

    i <- i+1
  }

  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # number of parameters
  return(list(para=list(p=para$p, rho1=rho1, rho2=rho2),
              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
              breaks=breaks))
}
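
# A usage sketch on simulated data (illustrative; the real inputs are the matched,
# pruned significance values from a pair of replicates). With stoc=F, only
# functions defined in this file are needed:
#   set.seed(3)
#   n <- 1000; lab <- rbinom(n, 1, 0.6)
#   x <- ifelse(lab==1, rnorm(n, 3), rnorm(n))
#   y <- ifelse(lab==1, 0.8*x + 0.6*rnorm(n), rnorm(n))
#   fit <- em.2gaussian.quick(x, y, p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01,
#                             fix.p=F, stoc=F, fix.rho2=T)
#   fit$para$p; fit$para$rho1   # estimated mixing proportion and correlation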



em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){

  x <- rank(x, tie="random")
  y <- rank(y, tie="random")

  # x <- rank(x, tie="first")
  # y <- rank(y, tie="first")

  # nbin=50
  xy.min <- min(x, y)
  xy.max <- max(x, y)
  binwidth <- (xy.max-xy.min)/50
  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin)
  # breaks <- seq(xy.min, xy.max, by=binwidth)

  # initiate marginals
  # initialization: the first p0 fraction of the data starts in the consistent component
  # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes the two components overlap

  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))


  if(!stoc)
    para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
  else
    para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)

  if(fix.p){
    p <- p0
  } else {
    p <- para$p
  }

  if(fix.rho2){
    rho2 <- rho2.0
  } else {
    rho2 <- para$rho2
  }

  l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)

  loglik.trace <- c()
  loglik.trace[1] <- l0
  # loglik.trace[2] <- l1
  to.run <- T

  i <- 2

  while(to.run){

    e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
    if(!stoc)
      para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
    else
      para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)

    if(fix.p){
      p <- p0
    } else {
      p <- para$p
    }

    if(fix.rho2){
      rho2 <- rho2.0
    } else {
      rho2 <- para$rho2
    }


    # l0 <- l1
    l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
    loglik.trace[i] <- l1

    cat("l1=", l1, "\n")

    # Aitken acceleration criterion
    if(i > 2){
      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
      to.run <- abs(l.inf - loglik.trace[i]) > eps
      cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n")
      #cat("l.inf=", l.inf, "\n")
      #cat(l.inf-loglik.trace[i], "\n")
    }

    i <- i+1
  }

  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # number of parameters
  return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2),
              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
              breaks=breaks))
}


#######################
####################### fit EM procedure for the matched peaks
#######################

# remove the unmatched ones
#rm.unmatch <- function(sample1, sample2, p.value.impute=0){
#
#  sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
#  sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
#
#  invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value))
#}


# fit 2-component model
#fit.em <- function(sample12, fix.rho2=T){
#
#  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
#
#  em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2,
#                               p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)
#
#  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
#}


rm.unmatch <- function(sample1, sample2, p.value.impute=0){

  sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
  sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]

  invisible(list(sample1=sample1.prune, sample2=sample2.prune))
}


# fit 2-component model
fit.em <- function(sample12, fix.rho2=T){

  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)

  em.fit <- em.2gaussian.quick(-prune.sample$sample1$sig.value, -prune.sample$sample2$sig.value,
                               p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)

  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
}



fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){

  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)

  # o <- order(prune.sample$sample1)
  # n <- length(prune.sample$sample1)

  # para <- init(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, c(rep(0, round(n/3)), rep(c(0,1), round(n/6)), rep(1, n-round(n/3)-round(n/6))))

  # temp <- init.dist(f0, f1)
  para <- list()
  para$rho <- 0.6
  para$p <- 0.3
  para$mu <- 2.5
  para$sigma <- 1
  ## para$mu <- -temp$mu
  ## para$sigma <- temp$sigma
  #cat("mu=", para$mu, "sigma=", para$sigma, "\n")

  # em.fit <- em.transform.1loop(-prune.sample$sample1, -prune.sample$sample2,
  cat("EM is running\n")
  em.fit <- em.transform(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, para$mu, para$sigma, para$rho, para$p, eps=0.01)

  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
}




# fit 1-component model
fit.1.component <- function(data.pruned, breaks){

  # gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
  # clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)

  gaussian.1 <- fit.gaussian.1(-data.pruned$sample1, -data.pruned$sample2, breaks)
  clayton.1 <- fit.clayton.1(-data.pruned$sample1, -data.pruned$sample2, breaks)

  return(list(gaussian.1=gaussian.1, clayton.1=clayton.1))
}



#################
# Fit a single component
#################

# a single Gaussian copula
# if breaks=NULL, use the empirical pdf, otherwise use the histogram estimate
fit.gaussian.1 <- function(x, y, breaks=NULL){

  # rank transform and compute the empirical cdf
  t <- emp.mar.cdf.rank(x)
  s <- emp.mar.cdf.rank(y)

  mle.rho <- mle.gaussian.copula(t, s, rep(1, length(t)))

  c1 <- gaussian.cop.den(t, s, mle.rho)
  cat("c1", sum(log(c1)), "\n")

  if(is.null(breaks)){
    f1 <- emp.mar.pdf.rank(t)
    f2 <- emp.mar.pdf.rank(s)
  } else {
    x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
    y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)

    f1 <- x.mar$f1.value$pdf # only one component
    f2 <- y.mar$f1.value$pdf
  }


  cat("f1", sum(log(f1)), "\n")
  cat("f2", sum(log(f2)), "\n")

  loglik <- sum(log(c1)+log(f1)+log(f2))

  bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)

  return(list(rho=mle.rho, loglik=loglik, bic=bic))
}
+ − 1636
+ − 1637
+ − 1638 # a single Clayton copula
+ − 1639 fit.clayton.1 <- function(x, y, breaks=NULL){
+ − 1640
# rank-transform the data and compute the empirical marginal cdf
+ − 1642 t <- emp.mar.cdf.rank(x)
+ − 1643 s <- emp.mar.cdf.rank(y)
+ − 1644
+ − 1645 mle.rho <- mle.clayton.copula(t, s, rep(1, length(t)))
+ − 1646
+ − 1647 c1 <- clayton.cop.den(t, s, mle.rho)
+ − 1648
+ − 1649 if(is.null(breaks)){
+ − 1650 f1 <- emp.mar.pdf.rank(t)
+ − 1651 f2 <- emp.mar.pdf.rank(s)
+ − 1652 } else {
+ − 1653 x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
+ − 1654 y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
+ − 1655
+ − 1656 f1 <- x.mar$f1.value$pdf # only one component
+ − 1657 f2 <- y.mar$f1.value$pdf
+ − 1658 }
+ − 1659
+ − 1660 loglik <- sum(log(c1)+log(f1)+log(f2))
+ − 1661
+ − 1662 bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)
+ − 1663
return(list(rho=mle.rho, tau=mle.rho/(mle.rho+2), loglik=loglik, bic=bic)) # Kendall's tau for a Clayton copula is rho/(rho+2); 'rho' alone was undefined here
+ − 1665 }
+ − 1666
+ − 1667 ## obsolete function (01-06-2010)
## compute the average posterior probability of belonging to the random component
+ − 1669 ## for peaks selected at different cutoffs
+ − 1670 comp.uri.ez <- function(tt, u, v, e.z){
+ − 1671
+ − 1672 u.t <- quantile(u, prob=(1-tt))
+ − 1673 v.t <- quantile(v, prob=(1-tt))
+ − 1674
+ − 1675 # ez <- mean(e.z[u >= u.t & v >=u.t]) Is this wrong?
+ − 1676 ez <- mean(e.z[u >= u.t & v >=v.t])
+ − 1677
+ − 1678 return(ez)
+ − 1679 }
+ − 1680
+ − 1681 ## obsolete function (01-06-2010)
+ − 1682 # compute the largest posterior error probability corresponding to
# the square region spanning the top tt% on both coordinates
+ − 1684 # so the consistent low rank ones are excluded
+ − 1685 # boundary.txt: either "max" or "min", if it is error prob, use "max"
+ − 1686 comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){
+ − 1687
+ − 1688 u.t <- quantile(u, prob=(1-tt))
+ − 1689 v.t <- quantile(v, prob=(1-tt))
+ − 1690
+ − 1691 if(boundary.txt == "max"){
+ − 1692 # ez.bound <- max(e.z[u >= u.t & v >=u.t])
+ − 1693 ez.bound <- max(e.z[u >= u.t & v >=v.t])
+ − 1694 } else {
+ − 1695 # ez.bound <- min(e.z[u >= u.t & v >=u.t])
+ − 1696 ez.bound <- min(e.z[u >= u.t & v >=v.t])
+ − 1697 }
+ − 1698
+ − 1699 return(ez.bound)
+ − 1700
+ − 1701 }
+ − 1702
+ − 1703 # obsolete functions: 01-06-2010
+ − 1704 # compute the error rate
+ − 1705 # u.t and v.t are the quantiles
+ − 1706 # this one is used for the plots generated initially in the brief writeup
+ − 1707 # and it was used for processing merged data in July before the IDR definition
+ − 1708 # is formalized
+ − 1709 # It does not implement the current definition of IDR
+ − 1710 get.ez.tt.old <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){
+ − 1711
+ − 1712 u <- em.fit$data.pruned$sample1
+ − 1713 v <- em.fit$data.pruned$sample2
+ − 1714
+ − 1715 tt <- seq(0.01, 0.99, by=0.01)
+ − 1716 if(reverse){
+ − 1717 e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+ − 1718 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+ − 1719 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max")
+ − 1720 } else {
+ − 1721 e.z <- em.fit$em.fit$e.z
+ − 1722 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+ − 1723 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min")
+ − 1724 }
+ − 1725
+ − 1726 u.t <- quantile(u, prob=(1-tt))
+ − 1727 v.t <- quantile(v, prob=(1-tt))
+ − 1728
+ − 1729 # find the levels on the two replicates
+ − 1730 sig.value1 <- c()
+ − 1731 sig.value2 <- c()
+ − 1732 error.prob.cutoff <- c()
+ − 1733 n.selected.match <- c()
+ − 1734
+ − 1735 for(i in 1:length(fdr.level)){
+ − 1736
# find which uri.ez is closest to fdr.level
+ − 1738 index <- which.min(abs(uri.ez - fdr.level[i]))
+ − 1739 sig.value1[i] <- u.t[index]
+ − 1740 sig.value2[i] <- v.t[index]
+ − 1741 error.prob.cutoff[i] <- ez.bound[index]
+ − 1742 if(reverse){
+ − 1743 n.selected.match[i] <- sum(e.z<=ez.bound[index])
+ − 1744 } else {
+ − 1745 n.selected.match[i] <- sum(e.z>=ez.bound[index])
+ − 1746 }
+ − 1747 }
+ − 1748
+ − 1749 # output the cutoff of posterior probability, signal values on two replicates
+ − 1750 map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match)
+ − 1751
+ − 1752 return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+ − 1753 }
+ − 1754
+ − 1755 # created: 01-06-2010
# Output IDR at various numbers of selected peaks
+ − 1757 # Find cutoff (idr cutoff, sig.value cutoff on each replicate) for specified IDR level
+ − 1758 # IDR definition is similar to FDR
+ − 1759 get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){
+ − 1760
+ − 1761 # u <- em.fit$data.pruned$sample1$sig.value
+ − 1762 # v <- em.fit$data.pruned$sample2$sig.value
+ − 1763 u <- em.fit$data.pruned$sample1
+ − 1764 v <- em.fit$data.pruned$sample2
+ − 1765
+ − 1766 e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+ − 1767
+ − 1768 o <- order(e.z)
+ − 1769 e.z.ordered <- e.z[o]
+ − 1770 n.select <- c(1:length(e.z))
+ − 1771 IDR <- cumsum(e.z.ordered)/n.select
+ − 1772
+ − 1773 u.o <- u[o]
+ − 1774 v.o <- v[o]
+ − 1775
+ − 1776 n.level <- length(idr.level)
+ − 1777 # sig.value1 <- rep(NA, n.level)
+ − 1778 # sig.value2 <- rep(NA, n.level)
+ − 1779 ez.cutoff <- rep(NA, n.level)
+ − 1780 n.selected <- rep(NA, n.level)
+ − 1781
+ − 1782 for(i in 1:length(idr.level)){
+ − 1783
# find which IDR is closest to idr.level
+ − 1785 index <- which.min(abs(IDR - idr.level[i]))
+ − 1786 # sig.value1[i] <- min(u.o[1:index])
+ − 1787 # sig.value2[i] <- min(v.o[1:index])
+ − 1788 ez.cutoff[i] <- e.z[index]
+ − 1789 n.selected[i] <- sum(e.z<=ez.cutoff[i])
+ − 1790 }
+ − 1791
+ − 1792 # output the cutoff of posterior probability, number of selected overlapped peaks
+ − 1793 # map.uv <- cbind(ez.cutoff, sig.value1, sig.value2, n.selected)
+ − 1794
+ − 1795 map.uv <- cbind(ez.cutoff, n.selected)
+ − 1796
+ − 1797 return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+ − 1798 }
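
## Illustrative sketch of the IDR computation used in get.ez.tt(), on made-up posterior
## probabilities: sort the local error probabilities (1 - e.z), take their running mean,
## and read off how many peaks can be selected at a target IDR level. The demo helper,
## the numbers, and the "largest n with IDR <= level" rule are illustrative
## (get.ez.tt() above matches the closest IDR value instead).
demo.idr.from.ez <- function(idr.level=0.05){
  e.z <- c(0.99, 0.95, 0.90, 0.97, 0.40, 0.85, 0.98, 0.60)  # P(consistent) for 8 matched peaks
  err <- 1 - e.z                        # local error probability of each peak
  o <- order(err)                       # most reproducible peaks first
  IDR <- cumsum(err[o])/seq_along(err)  # expected rate of irreproducible calls among the top n
  n.selected <- sum(IDR <= idr.level)   # largest list whose IDR stays below the target
  list(IDR=IDR, n.selected=n.selected)
}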
+ − 1799
+ − 1800 # return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+ − 1801
+ − 1802
+ − 1803
+ − 1804
+ − 1805
+ − 1806 ### compute the mean of the marginals
+ − 1807 get.mar.mean <- function(em.out){
+ − 1808
+ − 1809 x.f1 <- em.out$x.mar$f1
+ − 1810 x.f2 <- em.out$x.mar$f2
+ − 1811
+ − 1812 y.f1 <- em.out$y.mar$f1
+ − 1813 y.f2 <- em.out$y.mar$f2
+ − 1814
+ − 1815 x.stat1 <- get.hist.mean(x.f1)
+ − 1816 x.stat2 <- get.hist.mean(x.f2)
+ − 1817 y.stat1 <- get.hist.mean(y.f1)
+ − 1818 y.stat2 <- get.hist.mean(y.f2)
+ − 1819
+ − 1820 return(list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean,
+ − 1821 y.mean1=y.stat1$mean, y.mean2=y.stat2$mean,
+ − 1822 x.sd1=x.stat1$sd, x.sd2=x.stat2$sd,
+ − 1823 y.sd1=y.stat1$sd, y.sd2=y.stat2$sd
+ − 1824 ))
+ − 1825
+ − 1826 }
+ − 1827
+ − 1828
+ − 1829 # compute the mean of marginals
+ − 1830 get.hist.mean <- function(x.f){
+ − 1831
+ − 1832 nbreaks <- length(x.f$breaks)
+ − 1833 x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]
+ − 1834
+ − 1835 x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
+ − 1836 x.mean <- sum(x.mid*x.f$density*x.bin)
+ − 1837 x.sd <- sqrt(sum(x.mid*x.mid*x.f$density*x.bin)-x.mean^2)
+ − 1838
+ − 1839 return(list(mean=x.mean, sd=x.sd))
+ − 1840 }
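
## Quick check of get.hist.mean(): for a histogram of standard normal draws, the
## bin-weighted mean and sd computed above should come out near 0 and 1.
## The demo helper is hypothetical; hist() supplies the $breaks/$density fields used above.
demo.get.hist.mean <- function(n=10000){
  x <- rnorm(n)
  h <- hist(x, breaks=50, plot=FALSE)
  get.hist.mean(h)   # mean close to 0, sd close to 1
}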
+ − 1841
+ − 1842 get.hist.var <- function(x.f){
+ − 1843
+ − 1844 nbreaks <- length(x.f$breaks)
+ − 1845 x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]
+ − 1846
+ − 1847 x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
+ − 1848 x.mean <- sum(x.mid*x.f$density*x.bin)
+ − 1849
return(x.mean)  # note: returns the histogram mean; the variance is not computed here
+ − 1851 }
+ − 1852
+ − 1853 # obsolete function (01-06-2010)
+ − 1854 # plot
+ − 1855 plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){
+ − 1856
+ − 1857 if(is.null(col.txt))
+ − 1858 col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+ − 1859
+ − 1860 x <- c()
+ − 1861 y <- c()
+ − 1862
+ − 1863 for(i in 1:length(ez.list)){
+ − 1864 x <- c(x, ez.list[[i]]$n)
+ − 1865
+ − 1866 y <- c(y, ez.list[[i]]$uri.ez)
+ − 1867 }
+ − 1868
+ − 1869 if(is.null(y.lim))
+ − 1870 y.lim <- c(0, max(y))
+ − 1871
+ − 1872 if(!is.null(file.name)){
+ − 1873 postscript(paste(plot.dir, "ez.", file.name, sep=""))
+ − 1874 par(mfrow=c(1,1), mar=c(5,5,4,2))
+ − 1875 }
+ − 1876
+ − 1877 plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+ − 1878
+ − 1879 for(i in 1:length(ez.list)){
+ − 1880 lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5)
+ − 1881 }
+ − 1882
+ − 1883 # plot(ez.list[[1]]$u.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+ − 1884 # plot(ez.list[[1]]$v.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+ − 1885
+ − 1886
+ − 1887 legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+ − 1888
if(!is.null(title.txt))
+ − 1890 title(title.txt)
+ − 1891
+ − 1892 if(!is.null(file.name)){
+ − 1893 dev.off()
+ − 1894 }
+ − 1895
+ − 1896 }
+ − 1897
+ − 1898
+ − 1899 plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="IDR", col.txt=NULL, title.txt=NULL){
+ − 1900
+ − 1901 if(is.null(col.txt))
+ − 1902 col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")
+ − 1903
+ − 1904 n.entry <- length(ez.list)
+ − 1905 x <- rep(NA, n.entry)
+ − 1906 y.max <- rep(NA, n.entry)
+ − 1907
+ − 1908 for(i in 1:n.entry){
+ − 1909 x[i] <- max(ez.list[[i]]$n)
+ − 1910
+ − 1911 y.max[i] <- max(ez.list[[i]]$IDR)
+ − 1912
+ − 1913 }
+ − 1914
+ − 1915 if(is.null(y.lim))
+ − 1916 y.lim <- c(0, max(y.max))
+ − 1917
+ − 1918 if(!is.null(file.name)){
+ − 1919 postscript(paste(plot.dir, "ez.", file.name, sep=""))
+ − 1920 par(mfrow=c(1,1), mar=c(5,5,4,2))
+ − 1921 }
+ − 1922
+ − 1923
+ − 1924
+ − 1925 plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
+ − 1926
+ − 1927 q <- seq(0.01, 0.99, by=0.01)
+ − 1928
+ − 1929 for(i in 1:length(ez.list)){
+ − 1930
+ − 1931 n.plot <- round(quantile(ez.list[[i]]$n, prob=q))
+ − 1932 IDR.plot <- ez.list[[i]]$IDR[n.plot]
+ − 1933 lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5)
+ − 1934 }
+ − 1935
+ − 1936
+ − 1937 legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)
+ − 1938
if(!is.null(title.txt))
+ − 1940 title(title.txt)
+ − 1941
+ − 1942 if(!is.null(file.name)){
+ − 1943 dev.off()
+ − 1944 }
+ − 1945
+ − 1946 }
+ − 1947
+ − 1948
+ − 1949
+ − 1950 #############################################################################
+ − 1951 #############################################################################
+ − 1952 # statistics about peaks selected on the individual replicates
+ − 1953 #
+ − 1954 # idr.level: the consistency cutoff, say 0.05
+ − 1955 # uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r
+ − 1956 # ez.list : a list of IDRs computed from get.ez.tt using the same idr.level
+ − 1957 #
+ − 1958 ##################
+ − 1959
+ − 1960
+ − 1961 # obsolete?
+ − 1962 # compute the error rate
+ − 1963 # u.t and v.t are the quantiles
+ − 1964 #
+ − 1965 # map back to all peaks and report the number of peaks selected
+ − 1966 get.ez.tt.all.old <- function(em.fit, all.data1, all.data2, idr.level){
+ − 1967
+ − 1968 u <- em.fit$data.pruned$sample1
+ − 1969 v <- em.fit$data.pruned$sample2
+ − 1970
+ − 1971 tt <- seq(0.01, 0.99, by=0.01)
+ − 1972 # if(reverse){
+ − 1973 e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+ − 1974 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+ − 1975 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max")
+ − 1976 # } else {
+ − 1977 # e.z <- em.fit$em.fit$e.z
+ − 1978 # uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
+ − 1979 # ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min")
+ − 1980 # }
+ − 1981
+ − 1982 u.t <- quantile(u, prob=(1-tt))
+ − 1983 v.t <- quantile(v, prob=(1-tt))
+ − 1984
+ − 1985 # find the levels on the two replicates
+ − 1986 sig.value1 <- c()
+ − 1987 sig.value2 <- c()
+ − 1988 error.prob.cutoff <- c()
+ − 1989 n.selected.match <- c()
+ − 1990 npeak.rep1 <- c()
+ − 1991 npeak.rep2 <- c()
+ − 1992
+ − 1993 for(i in 1:length(idr.level)){
+ − 1994
# find which uri.ez is closest to idr.level
+ − 1996 index <- which.min(abs(uri.ez - as.numeric(idr.level[i])))
+ − 1997
+ − 1998 sig.value1[i] <- u.t[index]
+ − 1999 sig.value2[i] <- v.t[index]
+ − 2000 error.prob.cutoff[i] <- ez.bound[index]
+ − 2001 n.selected.match[i] <- sum(u>= u.t[index] & v>=v.t[index])
+ − 2002
+ − 2003 npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
+ − 2004 npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])
+ − 2005 }
+ − 2006
+ − 2007
+ − 2008 # output the cutoff of posterior probability, signal values on two replicates
+ − 2009 map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2)
+ − 2010
+ − 2011 return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+ − 2012 }
+ − 2013
+ − 2014
+ − 2015 get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){
+ − 2016
+ − 2017 u <- em.fit$data.pruned$sample1$sig.value
+ − 2018 v <- em.fit$data.pruned$sample2$sig.value
+ − 2019 # u <- em.fit$data.pruned$sample1
+ − 2020 # v <- em.fit$data.pruned$sample2
+ − 2021
+ − 2022 e.z <- 1-em.fit$em.fit$e.z # this is the error prob
+ − 2023
+ − 2024 o <- order(e.z)
+ − 2025 e.z.ordered <- e.z[o]
+ − 2026 n.select <- c(1:length(e.z))
+ − 2027 IDR <- cumsum(e.z.ordered)/n.select
+ − 2028
+ − 2029 u.o <- u[o]
+ − 2030 v.o <- v[o]
+ − 2031
+ − 2032 n.level <- length(idr.level)
+ − 2033 # sig.value1 <- rep(NA, n.level)
+ − 2034 # sig.value2 <- rep(NA, n.level)
+ − 2035 ez.cutoff <- rep(NA, n.level)
+ − 2036 n.selected <- rep(NA, n.level)
+ − 2037 npeak.rep1 <- rep(NA, n.level)
+ − 2038 npeak.rep2 <- rep(NA, n.level)
+ − 2039
+ − 2040 for(i in 1:length(idr.level)){
+ − 2041
# find which IDR is closest to idr.level
+ − 2043 index <- which.min(abs(IDR - idr.level[i]))
+ − 2044 # sig.value1[i] <- min(u.o[1:index])
+ − 2045 # sig.value2[i] <- min(v.o[1:index])
+ − 2046 ez.cutoff[i] <- e.z.ordered[index] # fixed on 02/20/10
+ − 2047 n.selected[i] <- sum(e.z<=ez.cutoff[i])
+ − 2048 # npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
+ − 2049 # npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])
+ − 2050 }
+ − 2051
+ − 2052 # output the cutoff of posterior probability, number of selected overlapped peaks
+ − 2053 map.uv <- cbind(ez.cutoff, n.selected)
+ − 2054
+ − 2055 return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
+ − 2056 }
+ − 2057
+ − 2058 # return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
+ − 2059
+ − 2060
+ − 2061
+ − 2062
+ − 2063
+ − 2064
+ − 2065 ####### the following is for determining thresholds for merged dataset
+ − 2066
+ − 2067 ############# select peaks above a given threshold
+ − 2068 #
# pass.threshold: a simple method that passes the threshold determined on the individual replicates to the pooled sample
+ − 2070 #
# sig.map.list: a list of matrices containing all the cutoff values; each row corresponds to a cutoff. The first column is idr.level,
# the 2nd column is the cutoff of ez, and the remaining columns are from the consistency analysis of the other replicates
+ − 2073 # sig.value.name: the name of the sig.value column
+ − 2074 # combined: combined dataset
+ − 2075 # nrep: number of pairs of comparisons
+ − 2076 #
+ − 2077 # Procedure:
+ − 2078 # 1. Find the significant threshold corresponding to the idr cutoff on the matched peaks.
+ − 2079 # 2. Each time we will get two or more (if >2 replicates) cutoffs and will report the most stringent and the least stringent
+ − 2080 # cutoff and the number of peaks selected at those two cutoffs
+ − 2081 #############
+ − 2082
+ − 2083 pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){
+ − 2084
+ − 2085 sig.map <- c()
+ − 2086
+ − 2087 # choose idr.level
+ − 2088 idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
if(length(idr.index) == 0){
+ − 2090 print("no level matches specified idr.level")
+ − 2091 return(-1)
+ − 2092 }
+ − 2093
+ − 2094 for(i in 1:length(sig.map.list))
+ − 2095 sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")])
+ − 2096
+ − 2097
+ − 2098 npeak.tight <- c()
+ − 2099 npeak.loose <- c()
+ − 2100
+ − 2101
+ − 2102 max.sig <- max(sig.map)
+ − 2103 min.sig <- min(sig.map)
+ − 2104 selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ]
+ − 2105 selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ]
+ − 2106
+ − 2107 selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+ − 2108 selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+ − 2109
+ − 2110 npeak.tight <- nrow(selected.sig.tight)
+ − 2111 npeak.loose <- nrow(selected.sig.loose)
+ − 2112
+ − 2113
+ − 2114 npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose)
+ − 2115
+ − 2116 invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose))
+ − 2117 }
+ − 2118
+ − 2119 #################
+ − 2120 # pass the regions selected from consistency analysis to combined data
+ − 2121 # Threshold is determined on the replicates, the regions above the threshold are selected
+ − 2122 # then peaks on the combined data are selected from the selected regions
+ − 2123 #
+ − 2124 # To avoid being too stringent, regions satisfying the following conditions are selected
# 1. regions above the significance threshold determined by the consistency analysis on either replicate
# 2. regions that contain consistent but low peaks, i.e. posterior prob > threshold but not passing the significance threshold
+ − 2127 #
+ − 2128 # This method doesn't make a difference when using different thresholds
+ − 2129 #################
+ − 2130
+ − 2131 pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){
+ − 2132
+ − 2133 combined <- combined[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2134 npair <- length(uri.output) # number of pairs of consistency analysis
+ − 2135 combined.region <- c()
+ − 2136
+ − 2137 # choose idr.level
+ − 2138 idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
+ − 2139 if(length(idr.index) ==0){
+ − 2140 print("no level matches specified idr.level")
+ − 2141 return(-1)
+ − 2142 }
+ − 2143
+ − 2144 for(j in 1:npair){
+ − 2145 # select peaks from individual replicates using individual cutoff
+ − 2146 above.1 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"]
above.2 <- uri.output[[j]]$data12.enrich$merge2["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"] # threshold replicate 2 by its own sig.values (was merge1)
+ − 2148 selected.sig.rep1 <- uri.output[[j]]$data12.enrich$merge1[above.1, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2149 selected.sig.rep2 <- uri.output[[j]]$data12.enrich$merge2[above.2, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2150
+ − 2151 # find the peaks that are overlapped with reliable peaks in the individual replicates
+ − 2152 overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2
+ − 2153 overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2
+ − 2154
+ − 2155 # choose the ones with significant value > 0, which are the overlapped ones
+ − 2156
+ − 2157 combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2158 combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2159
+ − 2160 ## consistent low significant ones
## first find the consistent ones, i.e. high posterior prob
+ − 2162 # is.consistent <- ez.list[[j]]$e.z < ez.list[[j]]$ez.cutoff
+ − 2163
+ − 2164 # data.matched <- keep.match(uri.output[[j]]$data12.enrich$merge1[!above.1, ], uri.output[[j]]$data12.enrich$merge2[!above.2, ], sig.value.impute=0)
+ − 2165 # data.matched$sample1 <- data.matched$sample1[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2166 # data.matched$sample2 <- data.matched$sample2[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2167
+ − 2168 # consistent.in1 <- data.matched$sample1[is.consistent, ]
+ − 2169 # consistent.in2 <- data.matched$sample2[is.consistent, ]
+ − 2170
+ − 2171 # overlap.consistent.1 <- pair.peaks(consistent.in1, combined)$merge2
+ − 2172 # overlap.consistent.2 <- pair.peaks(consistent.in2, combined)$merge2
+ − 2173
+ − 2174 ## choose the ones with significant value > 0, which are the overlapped ones
+ − 2175
+ − 2176 # combined.consistent.in1 <- overlap.consistent.1[overlap.consistent.1$sig.value > sig.value.impute, ]
+ − 2177 # combined.consistent.in2 <- overlap.consistent.2[overlap.consistent.2$sig.value > sig.value.impute, ]
+ − 2178
+ − 2179 # combined.region <- rbind(combined.region, combined.in1, combined.in2, combined.consistent.in1, combined.consistent.in2)
+ − 2180
+ − 2181 combined.region <- rbind(combined.region, combined.in1, combined.in2)
+ − 2182
+ − 2183 is.repeated <- duplicated(combined.region$start)
+ − 2184 combined.region <- combined.region[!is.repeated, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
+ − 2185
+ − 2186 }
+ − 2187 npeak <- nrow(combined.region)
+ − 2188
+ − 2189 sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T))
+ − 2190
+ − 2191 # idr.combined <- c(min(combined.region[,"q.value"], na.rm=T), max(combined.region[,"q.value"], na.rm=T))
+ − 2192
+ − 2193 npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+ − 2194
+ − 2195 combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+ − 2196
+ − 2197 invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined))
+ − 2198 }
+ − 2199
+ − 2200 ################
+ − 2201 # pass structure: this method does another round of inference on the combined data
+ − 2202 #
# To make the mixture structure comparable between the replicates and the combined data, the 2nd inference is done on the peaks
# in the reliable regions of the combined data, using rank-transformed significance values. The mixture structure is estimated by the
# consistency analysis, which estimates the marginal distributions of the ranks nonparametrically; the significance values are then derived from it.
# There are several advantages to doing it this way:
# 1. The premise of passing the structure is that the two replicates have the same distribution (i.e. the same means and variances).
#    The significance values on the two replicates clearly have different distributions, but the structure estimated from the consistency
#    analysis yields similar rank distributions on the two replicates by construction (i.e. the same number of peaks are paired up).
# 2. Because the pooled sample is a black box, the structure is more likely to hold in the matched regions than elsewhere;
#    after all, we don't know what the other regions contain. If the structure does not even hold in the matched regions,
#    which is possible, it is unlikely to hold elsewhere. Focusing on the reliable regions helps to exclude that unknown noise.
+ − 2213 #
+ − 2214 #
+ − 2215 # modified on 2-20-10: reverse rank.combined, make big sig.value with small
+ − 2216 # ranks, to be consistent with f1 and f2
+ − 2217 ################
+ − 2218
+ − 2219 pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){
+ − 2220
+ − 2221 columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori")
+ − 2222 combined <- combined[, columns.keep]
+ − 2223 combined.selected.all <- c()
+ − 2224
for(j in 1:length(uri.output)){ # one iteration per pairwise consistency analysis ('npair' was not defined in this function)
+ − 2226
+ − 2227 sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep]
+ − 2228 sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep]
+ − 2229
+ − 2230 # find peaks on the matched region on the combined one
+ − 2231 data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute)
+ − 2232
+ − 2233 data.matched$sample1 <- data.matched$sample1[, columns.keep]
+ − 2234 data.matched$sample2 <- data.matched$sample2[, columns.keep]
+ − 2235
+ − 2236 overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+ − 2237 overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
+ − 2238
+ − 2239 # choose the ones with significant value > sig.value.impute, which are the overlapped ones
+ − 2240
+ − 2241 combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ]
+ − 2242 combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ]
+ − 2243
+ − 2244 combined.region <- rbind(combined.in1, combined.in2)
+ − 2245
+ − 2246 is.repeated <- duplicated(combined.region$start)
+ − 2247 combined.region <- combined.region[!is.repeated,]
+ − 2248
+ − 2249 # now rank the peaks in matched region
+ − 2250 rank.combined <- rank(-combined.region$sig.value)
+ − 2251
+ − 2252 # now transform the parameters estimated into the new scale
+ − 2253 npeaks.overlap <- nrow(combined.region)
+ − 2254 npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1))
+ − 2255
+ − 2256
+ − 2257 # the breaks are the same for x and y
+ − 2258 f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2)
# the first break boundary goes up when changing scale; need to set it back to be a bit smaller than 1
+ − 2260 f1$breaks[1] <- min(f1$breaks[1], 0.95)
+ − 2261
+ − 2262 f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2)
# the first break boundary goes up when changing scale; need to set it back to be a bit smaller than 1
+ − 2264 f2$breaks[1] <- min(f2$breaks[1], 0.95)
+ − 2265
+ − 2266 p <- em.output[[j]]$em.fit$para$p
+ − 2267
+ − 2268 # find the posterior probability
+ − 2269 errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2)
+ − 2270
+ − 2271 # compute the FDR and find cutoff of posterior prob and the sig value
+ − 2272 o <- order(errorprob.combined)
+ − 2273 idr <- cumsum(errorprob.combined[o])/c(1:length(o))
+ − 2274 idr.index <- which(idr > idr.level)[1]
+ − 2275 errorprob.cutoff <- errorprob.combined[o][idr.index]
+ − 2276
+ − 2277 # find the minimum significant measure among selected peaks
+ − 2278 sig.value <- min(combined.region$sig.value[o][1:idr.index])
+ − 2279 # sig.value <- quantile(combined.region$sig.value[o][1:idr.index], prob=0.05)
+ − 2280 #sig.value <- quantile(combined.region$sig.value[errorprob.combined<=errorprob.cutoff], prob=0.05)
+ − 2281
+ − 2282 # apply the significant value on the whole pooled list
+ − 2283 combined.selected <- combined[combined$sig.value >= sig.value,]
+ − 2284
+ − 2285 combined.selected.all <- rbind(combined.selected.all, combined.selected)
+ − 2286 }
+ − 2287
+ − 2288 is.repeated <- duplicated(combined.selected.all$start)
+ − 2289 combined.selected.all <- combined.selected.all[!is.repeated,]
+ − 2290
+ − 2291 npeak <- nrow(combined.selected.all)
+ − 2292
+ − 2293 npeak.stat <- list(idr.level=idr.level, npeak=npeak)
+ − 2294
+ − 2295 sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T))
+ − 2296
+ − 2297 # idr.combined <- c(min(combined.selected.all[,"q.value"], na.rm=T), max(combined.selected.all[,"q.value"], na.rm=T))
+ − 2298 # combined.selected.all <- deconcatenate.chr(combined.selected.all, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
+ − 2299
+ − 2300 combined.selected.all <- combined.selected.all[, c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")]
+ − 2301 colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value")
+ − 2302
+ − 2303 invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined))
+ − 2304 }
+ − 2305
+ − 2306
+ − 2307
+ − 2308 # get the posterior probability of the 2nd component
+ − 2309 get.comp2.prob <- function(x, p, f1, f2){
+ − 2310
+ − 2311 # get pdf and cdf of each component from functions in the corresponding component
+ − 2312 px.1 <- sapply(x, get.pdf, df=f1)
+ − 2313 px.2 <- sapply(x, get.pdf, df=f2)
+ − 2314
+ − 2315 comp2prob <- 1 - p*px.1/(p*px.1+(1-p)*px.2)
+ − 2316
+ − 2317 return(comp2prob)
+ − 2318 }
+ − 2319
+ − 2320 keep.match <- function(sample1, sample2, sig.value.impute=0){
+ − 2321
+ − 2322 sample1.prune <- sample1[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]
+ − 2323 sample2.prune <- sample2[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]
+ − 2324
+ − 2325 invisible(list(sample1=sample1.prune, sample2=sample2.prune))
+ − 2326 }
+ − 2327
+ − 2328
+ − 2329 ##############################################
+ − 2330 #
+ − 2331 # The following is for simulation
+ − 2332 #
+ − 2333 ##############################################
+ − 2334
+ − 2335
+ − 2336 # simulate gaussian copula
+ − 2337 # u is the uniform random variable and rho is correlation coefficient
+ − 2338 simu.gaussian.copula <- function(u, rho){
+ − 2339
+ − 2340 n <- length(u)
+ − 2341
+ − 2342 # simulate y given x=qnorm(u)
+ − 2343 y <- qnorm(u)*rho + rnorm(n)*sqrt(1-rho^2)
+ − 2344
+ − 2345 v <- pnorm(y)
+ − 2346
+ − 2347 invisible(v)
+ − 2348 }
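
## Illustrative usage of simu.gaussian.copula() (demo helper is hypothetical): the generated
## (u, v) pairs are uniform marginally, and their normal scores should recover roughly the
## requested correlation rho.
demo.simu.gaussian.copula <- function(n=5000, rho=0.8){
  u <- runif(n)
  v <- simu.gaussian.copula(u, rho)
  c(cor.normal.scores=cor(qnorm(u), qnorm(v)),   # close to rho
    cor.spearman=cor(u, v, method="spearman"))
}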
+ − 2349
+ − 2350 ## simulate Clayton copula from its generating function
+ − 2351 ## Genest and MacKay (1986)
+ − 2352
+ − 2353 phi.ori <- function(t, s){
+ − 2354
+ − 2355 (t^(-s) -1)/s
+ − 2356 }
+ − 2357
+ − 2358
+ − 2359 phi.inv <- function(y, s){
+ − 2360
+ − 2361 exp(-log(s*y+1)/s)
+ − 2362 }
+ − 2363
+ − 2364 phi.der <- function(t, s){
+ − 2365
+ − 2366 -t^(-s-1)
+ − 2367 }
+ − 2368
+ − 2369 phi.der.inv <- function(y, s){
+ − 2370
+ − 2371 exp(log(-y)/(-s-1))
+ − 2372 }
+ − 2373
+ − 2374 get.w <- function(u, t, s){
+ − 2375
+ − 2376 phi.der.inv(phi.der(u, s)/t, s)
+ − 2377 }
+ − 2378
+ − 2379 get.v <- function(w, u, s){
+ − 2380
+ − 2381 phi.inv(phi.ori(w, s) - phi.ori(u, s), s)
+ − 2382 }
+ − 2383
+ − 2384 # u is a uniform random variable, s is the association parameter
+ − 2385 simu.clayton.copula <- function(u, s){
+ − 2386
+ − 2387 t <- runif(length(u))
+ − 2388
+ − 2389 if(s>0){
+ − 2390 w <- get.w(u, t, s)
+ − 2391 v <- get.v(w, u, s)
+ − 2392 return(v)
+ − 2393 }
+ − 2394
+ − 2395 if(s==0){
+ − 2396 return(t)
+ − 2397 }
+ − 2398
+ − 2399 if(s <0){
+ − 2400 print("Invalid association parameters for clayton copula")
+ − 2401 }
+ − 2402
+ − 2403 }
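
## Similar sketch for the Clayton copula (demo helper is hypothetical): Kendall's tau of the
## simulated pairs should be near s/(s+2), the same relationship used in fit.clayton.1().
demo.simu.clayton.copula <- function(n=2000, s=2){
  u <- runif(n)
  v <- simu.clayton.copula(u, s)
  c(tau.empirical=cor(u, v, method="kendall"), tau.theory=s/(s+2))
}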
+ − 2404
+ − 2405
+ − 2406
+ − 2407 ###### 09-09-09
+ − 2408
+ − 2409 # simulate a two-component copula mixture:
+ − 2410 # - marginal distributions for the two variables in each component are both
+ − 2411 # normal and with the same parameters
+ − 2412 # p is the mixing proportion of component 1
+ − 2413 # n is the total sample size
+ − 2414 simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){
+ − 2415
+ − 2416 n1 <- round(n*p)
+ − 2417 n2 <- n-n1
+ − 2418
+ − 2419 u1 <- runif(n1)
+ − 2420
+ − 2421 if(copula.txt =="clayton")
+ − 2422 v1 <- simu.clayton.copula(u1, s1)
+ − 2423 else{
+ − 2424 if(copula.txt =="gaussian")
+ − 2425 v1 <- simu.gaussian.copula(u1, s1)
+ − 2426 }
+ − 2427
+ − 2428 u2 <- runif(n2)
+ − 2429
+ − 2430 if(copula.txt =="clayton")
+ − 2431 v2 <- simu.clayton.copula(u2, s2)
+ − 2432 else{
+ − 2433 if(copula.txt =="gaussian")
+ − 2434 v2 <- simu.gaussian.copula(u2, s2)
+ − 2435 }
+ − 2436
+ − 2437 # generate test statistics
+ − 2438 sample1.1 <- qnorm(u1, mu1, sd1)
+ − 2439 sample1.2 <- qnorm(v1, mu1, sd1)
+ − 2440
+ − 2441 sample2.1 <- qnorm(u2, mu2, sd2)
+ − 2442 sample2.2 <- qnorm(v2, mu2, sd2)
+ − 2443
+ − 2444 return(list(u=c(u1, u2), v=c(v1, v2),
+ − 2445 u.inv=c(sample1.1, sample2.1), v.inv=c(sample1.2, sample2.2),
+ − 2446 label=c(rep(1, n1), rep(2, n2))))
+ − 2447 }
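
## Illustrative usage of simu.copula.2mix() (demo helper and parameter values are made up):
## draw a mixture of a dependent component and an independent component and check that the
## returned labels match the mixing proportion.
demo.simu.copula.2mix <- function(){
  sim <- simu.copula.2mix(s1=0.8, s2=0, p=0.65, n=2000,
                          mu1=3, mu2=0, sd1=1, sd2=1, copula.txt="gaussian")
  c(prop.component1=mean(sim$label==1),   # about 0.65
    cor.within.component1=cor(sim$u[sim$label==1], sim$v[sim$label==1]))
}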
+ − 2448
+ − 2449 # using inverse of the cdf to generate original observations
+ − 2450
+ − 2451 simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){
+ − 2452
+ − 2453 n1 <- round(n*p)
+ − 2454 n2 <- n-n1
+ − 2455
+ − 2456 u1 <- runif(n1)
+ − 2457
+ − 2458 if(copula.txt =="clayton")
+ − 2459 v1 <- simu.clayton.copula(u1, s1)
+ − 2460 else{
+ − 2461 if(copula.txt =="gaussian")
+ − 2462 v1 <- simu.gaussian.copula(u1, s1)
+ − 2463 }
+ − 2464
+ − 2465 u2 <- runif(n2)
+ − 2466
+ − 2467 if(copula.txt =="clayton")
+ − 2468 v2 <- simu.clayton.copula(u2, s2)
+ − 2469 else{
+ − 2470 if(copula.txt =="gaussian")
+ − 2471 v2 <- simu.gaussian.copula(u2, s2)
+ − 2472 }
+ − 2473
+ − 2474 # generate test statistics
+ − 2475 # sample1.1 <- qnorm(u1, mu1, sd1)
+ − 2476 # sample1.2 <- qnorm(v1, mu1, sd1)
+ − 2477
+ − 2478 # sample2.1 <- qnorm(u2, mu2, sd2)
+ − 2479 # sample2.2 <- qnorm(v2, mu2, sd2)
+ − 2480
+ − 2481 sample1.x <- inv.cdf.vec(u1, cdf1.x)
+ − 2482 sample1.y <- inv.cdf.vec(v1, cdf1.y)
+ − 2483
+ − 2484 sample2.x <- inv.cdf.vec(u2, cdf2.x)
+ − 2485 sample2.y <- inv.cdf.vec(v2, cdf2.y)
+ − 2486
+ − 2487
+ − 2488 return(list(u=c(u1, u2), v=c(v1, v2),
+ − 2489 u.inv=c(sample1.x, sample2.x), v.inv=c(sample1.y, sample2.y),
+ − 2490 label=c(rep(1, n1), rep(2, n2))))
+ − 2491 }
+ − 2492
+ − 2493 # obtain original observation by converting cdf into quantiles
+ − 2494 # u is one cdf
# u.cdf is a cdf (assumed to come from a histogram) with break points (u.cdf$cdf and u.cdf$breaks)
+ − 2496 # the smallest value of cdf=0 and the largest =1
+ − 2497 inv.cdf <- function(u, u.cdf){
+ − 2498
+ − 2499 # which bin it falls into
+ − 2500 i <- which(u.cdf$cdf> u)[1]
+ − 2501 q.u <- (u - u.cdf$cdf[i-1])/(u.cdf$cdf[i] - u.cdf$cdf[i-1])* (u.cdf$breaks[i]-u.cdf$breaks[i-1]) + u.cdf$breaks[i-1]
+ − 2502
+ − 2503 return(q.u)
+ − 2504 }
+ − 2505
+ − 2506 inv.cdf.vec <- function(u, u.cdf){
+ − 2507
+ − 2508 # check if cdf has the right range (0, 1)
+ − 2509 ncdf <- length(u.cdf$cdf)
+ − 2510 nbreaks <- length(u.cdf$breaks)
+ − 2511
+ − 2512 if(ncdf == nbreaks-1 & u.cdf$cdf[ncdf]< 1)
u.cdf$cdf[ncdf] <- 1  # fix the last cdf value, not the list element
+ − 2514
+ − 2515 q.u <- sapply(u, inv.cdf, u.cdf)
+ − 2516
+ − 2517 return(q.u)
+ − 2518 }
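
## Illustrative sketch of the histogram-cdf inversion above (demo helper is hypothetical):
## build a cdf list from a histogram and map uniform draws back to the data scale by linear
## interpolation. Here the cdf vector is given the same length as breaks, with a leading 0,
## so that cdf[i] is the cdf at breaks[i]; this matches the interpolation in inv.cdf().
demo.inv.cdf.vec <- function(n=2000){
  x <- rnorm(n, mean=3, sd=2)
  h <- hist(x, breaks=30, plot=FALSE)
  u.cdf <- list(breaks=h$breaks, cdf=c(0, cumsum(h$counts)/n))
  q <- inv.cdf.vec(runif(500), u.cdf)
  summary(q)   # should roughly reproduce the distribution of x
}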
+ − 2519
# here we simulate a realistic situation:
# the test statistics are drawn from two normal distributions
# according to their labels, and then converted into p-values w.r.t. H0 using a
# one-sided test.
+ − 2524 # The test statistics are correlated for the signal component and independent
+ − 2525 # for the noise component
+ − 2526 # For the signal component, Y = X + eps, where eps ~ N(0, sigma^2)
+ − 2527 simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){
+ − 2528
+ − 2529 # first component - signal
+ − 2530 n.signal <- round(n*p)
+ − 2531 n.noise <- n - n.signal
+ − 2532
+ − 2533 # labels
+ − 2534 labels <- c(rep(1, n.signal), rep(0, n.noise))
+ − 2535
+ − 2536 # test statistics for signal and noise
+ − 2537 mu.signal <- rnorm(n.signal, mu1, sd1)
+ − 2538 x.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
+ − 2539 x.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)
+ − 2540
+ − 2541 y.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
+ − 2542 # sd.e can be dependent on signal
+ − 2543 y.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)
+ − 2544
+ − 2545 # concatenate
+ − 2546 x <- c(x.signal, x.noise)
+ − 2547 y <- c(y.signal, y.noise)
+ − 2548
+ − 2549 # convert to p-values based on H0
+ − 2550 p.x <- 1-pnorm(x, mu0, sqrt(sd0^2+sd.e^2))
+ − 2551 p.y <- 1-pnorm(y, mu0, sqrt(sd0^2+sd.e^2))
+ − 2552
+ − 2553 return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels))
+ − 2554
+ − 2555 }
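
## End-to-end sketch tying the simulation to the copula fit (demo helper and parameter values
## are hypothetical): generate correlated test statistics with simu.test.stat(), then feed the
## negated p-values to em.transform(), as done in get.emp.jack() below.
demo.simu.and.fit <- function(){
  a <- simu.test.stat(p=0.6, n=2000, mu1=3, sd1=1, mu0=0, sd0=1, sd.e=1)
  fit <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
  fit$para   # estimated mixing proportion, correlation, mean and sd of the reproducible component
}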
+ − 2556
+ − 2557 # compute the tradeoff and calibration
+ − 2558 forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){
+ − 2559
+ − 2560 xx <- round(xx, 5)
+ − 2561 o <- order(xx, decreasing=T)
+ − 2562
+ − 2563 rand <- 1-labels # if rand==0, consistent
+ − 2564 # order the random indicator in the same order
+ − 2565 rand.o <- rand[o]
+ − 2566
+ − 2567 if(sum(rand.o) > ndecoy){
+ − 2568 index.decoy <- which(cumsum(rand.o)==ndecoy)
+ − 2569 } else {
+ − 2570 index.decoy <- which(cumsum(rand.o)==sum(rand.o))
+ − 2571 }
+ − 2572
+ − 2573 cutoff.decoy <- xx[o][index.decoy]
+ − 2574
+ − 2575 # only consider the unique ones
+ − 2576 cutoff.unique <- unique(xx[o])
+ − 2577
+ − 2578 cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]]
+ − 2579
+ − 2580 get.decoy.count <- function(cut.off){
+ − 2581 above <- rep(0, length(xx))
+ − 2582 above[xx >= cut.off] <- 1
+ − 2583 decoy.count <- sum(above==1 & rand==1)
+ − 2584 return(decoy.count)
+ − 2585 }
+ − 2586
+ − 2587 get.forward.count <- function(cut.off){
+ − 2588 above <- rep(0, length(xx))
+ − 2589 above[xx >= cut.off] <- 1
+ − 2590 forward.count <- sum(above==1 & rand==0)
+ − 2591 return(forward.count)
+ − 2592 }
+ − 2593
+ − 2594 get.est.fdr <- function(cut.off){
+ − 2595 above <- rep(0, length(xx))
+ − 2596 above[xx >= cut.off] <- 1
+ − 2597 est.fdr <- 1-mean(xx[above==1])
+ − 2598 return(est.fdr)
+ − 2599 }
+ − 2600
+ − 2601 # assuming rand=0 is right
+ − 2602 get.false.neg.count <- function(cut.off){
+ − 2603 below <- rep(0, length(xx))
+ − 2604 below[xx < cut.off] <- 1
+ − 2605 false.neg.count <- sum(below==1 & rand==0)
+ − 2606 return(false.neg.count)
+ − 2607 }
+ − 2608
+ − 2609 get.false.pos.count <- function(cut.off){
+ − 2610 above <- rep(0, length(xx))
+ − 2611 above[xx >= cut.off] <- 1
+ − 2612 false.pos.count <- sum(above==1 & rand==1)
+ − 2613 return(false.pos.count)
+ − 2614 }
+ − 2615
+ − 2616 decoy <- sapply(cutoff, get.decoy.count)
+ − 2617 forward <- sapply(cutoff, get.forward.count)
+ − 2618
+ − 2619 est.fdr <- sapply(cutoff, get.est.fdr)
+ − 2620 emp.fdr <- decoy/(decoy+forward)
+ − 2621
+ − 2622 # compute specificity and sensitivity
+ − 2623 # assuming rand=1 is wrong and rand=0 is right
+ − 2624 false.neg <- sapply(cutoff, get.false.neg.count)
+ − 2625 false.pos <- sapply(cutoff, get.false.pos.count)
+ − 2626
+ − 2627 true.pos <- sum(rand==0)-false.neg
+ − 2628 true.neg <- sum(rand==1)-false.pos
+ − 2629
+ − 2630 sensitivity <- true.pos/(true.pos+false.neg)
+ − 2631 specificity <- true.neg/(true.neg+false.pos)
+ − 2632
+ − 2633 return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity))
+ − 2634 }
+ − 2635
+ − 2636
+ − 2637 # compute the em for jackknife and all data, and find FDR
+ − 2638 get.emp.jack <- function(a, p0){
+ − 2639
+ − 2640 nobs <- length(a$labels)
+ − 2641 est <- list()
+ − 2642 est.all <- list()
+ − 2643
+ − 2644 temp.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
+ − 2645 # temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
+ − 2646 # rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+ − 2647
+ − 2648 est.all$p <- temp.all$para$p
+ − 2649 est.all$rho1 <- temp.all$para$rho1
+ − 2650 est.all$FDR <- get.FDR(temp.all$e.z)
+ − 2651
+ − 2652 FDR <- list()
+ − 2653 p <- c()
+ − 2654 rho1 <- c()
+ − 2655
+ − 2656
+ − 2657 for(i in 1:nobs){
+ − 2658
+ − 2659 temp <- em.transform(-a$p.x[-i], -a$p.y[-i], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
+ − 2660 # temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
+ − 2661 # rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+ − 2662
+ − 2663 est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))
+ − 2664
+ − 2665 FDR[[i]] <- est[[i]]$FDR # this is the FDR for top n peaks
+ − 2666 p[i] <- est[[i]]$p
+ − 2667 rho1[i] <- est[[i]]$rho1
+ − 2668 }
+ − 2669
+ − 2670 est.jack <- list(FDR=FDR, p=p, rho1=rho1)
+ − 2671 return(list(est.jack=est.jack, est.all=est.all))
+ − 2672 }
+ − 2673
+ − 2674
+ − 2675 # get the npeaks corresponding to the nominal FDR estimated from the sample
+ − 2676 # and find the corresponding FDR from the entire data
+ − 2677 get.FDR.jack <- function(est, FDR.nominal){
+ − 2678
+ − 2679 nobs <- length(est$est.jack$FDR)
+ − 2680 FDR.all <- c()
+ − 2681 top.n <- c()
+ − 2682
+ − 2683 for(i in 1:nobs){
+ − 2684 top.n[i] <- max(which(est$est.jack$FDR[[i]] <= FDR.nominal))
+ − 2685 FDR.all[i] <- est$est.all$FDR[top.n[i]]
+ − 2686 }
+ − 2687
+ − 2688 invisible(list(FDR.all=FDR.all, top.n=top.n))
+ − 2689 }
+ − 2690
# compute jackknife pseudo-values
+ − 2692 # a is the dataset
+ − 2693 get.emp.IF <- function(a, p0){
+ − 2694
+ − 2695 nobs <- length(a$labels)
+ − 2696 est <- list()
+ − 2697 est.all <- list()
+ − 2698
+ − 2699 temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
+ − 2700 rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+ − 2701
+ − 2702 est.all$p <- temp.all$para$p
+ − 2703 est.all$rho1 <- temp.all$para$rho1
+ − 2704 est.all$FDR <- get.FDR(temp.all$e.z)
+ − 2705
+ − 2706 IF.FDR <- list()
+ − 2707 IF.p <- c()
+ − 2708 IF.rho1 <- c()
+ − 2709
+ − 2710 for(i in 1:nobs){
+ − 2711
+ − 2712 temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
+ − 2713 rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")
+ − 2714
+ − 2715 est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))
+ − 2716
+ − 2717 IF.FDR[[i]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[i]]$FDR) # this is the FDR for top n peaks
+ − 2718 IF.p[i] <- (nobs-1)*(est.all$p - est[[i]]$p)
+ − 2719 IF.rho1[i] <- (nobs-1)*(est.all$rho1 - est[[i]]$rho1)
+ − 2720 }
+ − 2721
+ − 2722 emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1)
+ − 2723
+ − 2724 invisible(list(emp.IF=emp.IF, est.all=est.all, est=est))
+ − 2725 }
+ − 2726
+ − 2727 # e.z is the posterior probability of being in signal component
+ − 2728 get.FDR <- function(e.z){
+ − 2729
+ − 2730 e.z.o <- order(1-e.z)
+ − 2731 FDR <- cumsum(1-e.z[e.z.o])/c(1:length(e.z.o))
+ − 2732
+ − 2733 invisible(FDR)
+ − 2734 }
+ − 2735
+ − 2736 # get the FDR of selecting the top n peaks
+ − 2737 # IF.est is the sample influence function
+ − 2738 # top.n
+ − 2739 get.IF.FDR <- function(IF.est, top.n){
+ − 2740
+ − 2741 nobs <- length(IF.est$emp.IF$FDR)
+ − 2742 FDR <- c()
+ − 2743
+ − 2744 # influence function of p
+ − 2745 for(i in 1:nobs)
+ − 2746 FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+ − 2747
+ − 2748 invisible(FDR)
+ − 2749 }
+ − 2750
+ − 2751 # get the sample influence function for FDR at a given FDR size
+ − 2752 # 1. find the number of peaks selected at a given FDR computed from all obs
+ − 2753 # 2. use the number to find the sample influence function for FDR
+ − 2754 # IF.est$est.all is the FDR with all peaks
+ − 2755 get.IF.FDR.all <- function(IF.est, FDR.size){
+ − 2756
+ − 2757 top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size))
+ − 2758 nobs <- length(IF.est$est.all$FDR)
+ − 2759 FDR <- c()
+ − 2760
+ − 2761 # influence function of p
+ − 2762 for(i in 1:nobs)
+ − 2763 FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]
+ − 2764
+ − 2765 invisible(list(FDR=FDR, top.n=top.n))
+ − 2766 }
+ − 2767
+ − 2768 plot.simu.uri <- function(x, y){
+ − 2769
+ − 2770 tt <- seq(0.01, 0.99, by=0.01)
+ − 2771 uri <- sapply(tt, comp.uri.prob, u=x, v=y)
+ − 2772 uri.thin <- uri[seq(1, length(tt), by=3)]
+ − 2773 tt.thin <- tt[seq(1, length(tt), by=3)]
+ − 2774 duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)])
+ − 2775 uri.spl <- smooth.spline(tt, uri, df=6.4)
+ − 2776 uri.der <- predict(uri.spl, tt, deriv=1)
+ − 2777
+ − 2778 par(mfrow=c(2,2))
+ − 2779 plot(x[1:n0], y[1:n0])
+ − 2780 points(x[(n0+1):n], y[(n0+1):n], col=2)
+ − 2781 plot(rank(-x)[1:n0], rank(-y)[1:n0])
+ − 2782 points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n])
+ − 2783 plot(tt, uri)
+ − 2784 lines(c(0,1), c(0,1), lty=2)
+ − 2785 title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep=""))
+ − 2786 plot(tt.thin[-1], duri)
+ − 2787 lines(uri.der)
+ − 2788 abline(h=1)
+ − 2789 invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der))
+ − 2790
+ − 2791 }
+ − 2792
+ − 2793
+ − 2794 ###### new fitting procedure
+ − 2795
+ − 2796
+ − 2797
+ − 2798
+ − 2799 # 1. rank pairs
+ − 2800
+ − 2801 # 2. initialization
+ − 2802 # 3. convert to pseudo-number
+ − 2803
+ − 2804 # 4. EM
+ − 2805
+ − 2806 # need plugin and test
+ − 2807 # find the middle point between the bins
+ − 2808 get.pseudo.mix <- function(x, mu, sigma, rho, p){
+ − 2809
+ − 2810
+ − 2811 # first compute cdf for points on the grid
# generate nw grid points between min(-3, mu-3*sigma) and max(mu+3*sigma, 3)
+ − 2813 nw <- 1000
+ − 2814 w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw)
+ − 2815 w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1)
+ − 2816
+ − 2817 i <- 1
+ − 2818
+ − 2819 quan.x <- rep(NA, length(x))
+ − 2820
for(i in c(1:(nw-1))){  # interpolate within each grid interval; values outside the grid are handled below
+ − 2822 index <- which(x >= w.cdf[i] & x < w.cdf[i+1])
+ − 2823 quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i]
+ − 2824 }
+ − 2825
+ − 2826 index <- which(x < w.cdf[1])
+ − 2827 if(length(index)>0)
+ − 2828 quan.x[index] <- w[1]
+ − 2829
+ − 2830 index <- which(x > w.cdf[nw])
+ − 2831 if(length(index)>0)
+ − 2832 quan.x[index] <- w[nw]
+ − 2833
+ − 2834 # linear.ext <- function(x, w, w.cdf){
+ − 2835 # linear interpolation
+ − 2836 # index.up <- which(w.cdf>= x)[1]
+ − 2837 # left.index <- which(w.cdf <=x)
+ − 2838 # index.down <- left.index[length(left.index)]
+ − 2839 # quan.x <- (w[index.up] + w[index.down])/2
+ − 2840 # }
+ − 2841
+ − 2842 # x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf)
+ − 2843
+ − 2844 # invisible(x.pseudo)
+ − 2845 invisible(quan.x)
+ − 2846 }
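
## Small sketch of what get.pseudo.mix() does (demo helper and values are made up): it maps
## cdf values to quantiles of the two-component normal mixture by linear interpolation on a
## grid; compare with the plain standard normal quantiles.
demo.get.pseudo.mix <- function(){
  u <- c(0.05, 0.25, 0.5, 0.75, 0.95)
  rbind(z.mix=get.pseudo.mix(u, mu=2.5, sigma=1, rho=0.8, p=0.3),
        z.norm=qnorm(u))
}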
+ − 2847
+ − 2848
+ − 2849 # EM to compute the latent structure
+ − 2850 # steps:
+ − 2851 # 1. raw values are first transformed into pseudovalues
+ − 2852 # 2. EM is used to compute the underlining structure, which is a mixture
+ − 2853 # of two normals
+ − 2854 em.transform <- function(x, y, mu, sigma, rho, p, eps){
+ − 2855
+ − 2856 x.cdf.func <- ecdf(x)
+ − 2857 y.cdf.func <- ecdf(y)
+ − 2858 afactor <- length(x)/(length(x)+1)
+ − 2859 x.cdf <- x.cdf.func(x)*afactor
+ − 2860 y.cdf <- y.cdf.func(y)*afactor
+ − 2861
+ − 2862 # initialization
+ − 2863 para <- list()
+ − 2864 para$mu <- mu
+ − 2865 para$sigma <- sigma
+ − 2866 para$rho <- rho
+ − 2867 para$p <- p
+ − 2868
+ − 2869 j <- 1
+ − 2870 to.run <- T
+ − 2871 loglik.trace <- c()
+ − 2872 loglik.inner.trace <- c()
+ − 2873
+ − 2874 #to.run.inner <- T
+ − 2875 z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2876 z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2877
+ − 2878 # cat("length(z1)", length(z.1), "\n")
+ − 2879 while(to.run){
+ − 2880
+ − 2881 # get pseudo value in each cycle
+ − 2882 # z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2883 # z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2884
+ − 2885 i <- 1
+ − 2886 while(to.run){
+ − 2887
+ − 2888 # EM for latent structure
+ − 2889 e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+ − 2890 para <- m.step.2normal(z.1, z.2, e.z)
+ − 2891 #para$rho <- rho
+ − 2892 #para$p <- p
+ − 2893 #para$mu <- mu
+ − 2894 #para$sigma <- sigma
+ − 2895 if(i > 1)
+ − 2896 l.old <- l.new
+ − 2897
+ − 2898 # this is just the mixture likelihood of two-component Gaussian
+ − 2899 l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+ − 2900
+ − 2901 loglik.inner.trace[i] <- l.new
+ − 2902
+ − 2903 if(i > 1){
+ − 2904 to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps
+ − 2905 }
+ − 2906
+ − 2907
+ − 2908 # if(i > 2){
+ − 2909 # l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2]))
+ − 2910
+ − 2911 # if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2])
+ − 2912 # to.run <- abs(l.inf - loglik.inner.trace[i]) > eps
+ − 2913 # else
+ − 2914 # to.run <- F
+ − 2915
+ − 2916 # }
+ − 2917
+ − 2918 cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n")
+ − 2919 cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n")
+ − 2920
+ − 2921 i <- i+1
+ − 2922 }
+ − 2923
+ − 2924
+ − 2925 # get pseudo value in each cycle
+ − 2926 z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2927 z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)
+ − 2928
+ − 2929 if(j > 1)
+ − 2930 l.old.outer <- l.new.outer
+ − 2931
+ − 2932 l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
+ − 2933
+ − 2934 loglik.trace[j] <- l.new.outer
+ − 2935
+ − 2936 if(j == 1)
+ − 2937 to.run <- T
+ − 2938 else{ # stop when iteration>100
+ − 2939 if(j > 100)
+ − 2940 to.run <- F
+ − 2941 else
+ − 2942 to.run <- l.new.outer - l.old.outer > eps
+ − 2943 }
+ − 2944
+ − 2945 # if(j %% 10==0)
+ − 2946 cat("loglik.trace[", j, "]=", loglik.trace[j], "\n")
+ − 2947 cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n")
+ − 2948
+ − 2949 j <- j+1
+ − 2950 }
+ − 2951
+ − 2952 bic <- -2*l.new + 4*log(length(z.1))
+ − 2953
+ − 2954 return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma),
+ − 2955 loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace))
+ − 2956 }
+ − 2957
+ − 2958
+ − 2959
+ − 2960
+ − 2961 # compute log-likelihood for mixture of two bivariate normals
+ − 2962 loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){
+ − 2963
+ − 2964 l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p)))
+ − 2965
+ − 2966 # l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0)))
+ − 2967 return(l.m)
+ − 2968 }
+ − 2969
+ − 2970 # check this when rho=1
+ − 2971
# log density of a bivariate normal distribution with equal mean and sigma on both dimensions
+ − 2973 d.binormal <- function(z.1, z.2, mu, sigma, rho){
+ − 2974
+ − 2975 loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))
+ − 2976
+ − 2977 return(loglik)
+ − 2978 }
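
## Quick sanity check of d.binormal() (demo helper is hypothetical): it returns the log density
## of a bivariate normal with common mean and sd on both coordinates, so with rho=0 it should
## equal the sum of the two univariate normal log densities.
demo.d.binormal <- function(){
  z1 <- c(-1, 0, 1.5)
  z2 <- c(0.5, 0, -2)
  independent <- dnorm(z1, mean=1, sd=2, log=TRUE) + dnorm(z2, mean=1, sd=2, log=TRUE)
  rbind(d.binormal=d.binormal(z1, z2, 1, 2, 0), independent=independent)
}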
+ − 2979
# E-step for computing the latent structure
+ − 2981 # e.z is the prob to be in the consistent group
+ − 2982 # e.step for estimating posterior prob
+ − 2983 # z.1 and z.2 can be vectors or scalars
+ − 2984 e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){
+ − 2985
+ − 2986 e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)
+ − 2987
+ − 2988 invisible(e.z)
+ − 2989 }
+ − 2990
+ − 2991 # M-step for computing the latent structure
+ − 2992 # m.step for estimating proportion, mean, sd and correlation coefficient
+ − 2993 m.step.2normal <- function(z.1, z.2, e.z){
+ − 2994
+ − 2995 p <- mean(e.z)
+ − 2996 mu <- sum((z.1+z.2)*e.z)/2/sum(e.z)
+ − 2997 sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z))
+ − 2998 rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2)))
+ − 2999
+ − 3000 return(list(p=p, mu=mu, sigma=sigma, rho=rho))
+ − 3001 }
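
## Minimal sketch of one EM iteration with the E- and M-steps above, on pseudo-data drawn
## directly from the latent two-component model (so no rank transform is needed here).
## The demo helper, starting values and data are made up.
demo.em.one.iteration <- function(n=1000){
  lab <- rbinom(n, 1, 0.6)          # 1 = reproducible component
  mu.true <- 2.5
  z1 <- ifelse(lab==1, rnorm(n, mu.true), rnorm(n))
  z2 <- ifelse(lab==1, mu.true + 0.5*(z1-mu.true) + rnorm(n, 0, sqrt(0.75)), rnorm(n))
  e.z <- e.step.2normal(z1, z2, mu=2, sigma=1, rho=0.5, p=0.5)   # posterior P(reproducible)
  m.step.2normal(z1, z2, e.z)       # updated p, mu, sigma and rho
}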
+ − 3002
+ − 3003
# assume the top p percent of observations are true signal
# x and y are ranks, x.label gives the labels; estimate initial values of the parameters
+ − 3006 init <- function(x, y, x.label){
+ − 3007
+ − 3008 x.o <- order(x)
+ − 3009
+ − 3010 x.ordered <- x[x.o]
+ − 3011 y.ordered <- y[x.o]
+ − 3012 x.label.ordered <- x.label[x.o]
+ − 3013
+ − 3014 n <- length(x)
+ − 3015 p <- sum(x.label)/n
+ − 3016
+ − 3017 rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)])
+ − 3018
+ − 3019 temp <- find.mu.sigma(x.ordered, x.label.ordered)
+ − 3020 mu <- temp$mu
+ − 3021 sigma <- temp$sigma
+ − 3022
+ − 3023 invisible(list(mu=mu, sigma=sigma, rho=rho, p=p))
+ − 3024
+ − 3025 }
+ − 3026
+ − 3027 # find mu and sigma if the distributions of marginal ranks are known
+ − 3028 # take the medians of the two dist and map back to the original
+ − 3029 init.dist <- function(f0, f1){
+ − 3030
+ − 3031 # take the median in f0
+ − 3032 index.median.0 <- which(f0$cdf>0.5)[1]
+ − 3033 q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins
+ − 3034 q.1.small <- f1$cdf[index.median.0]
+ − 3035
+ − 3036 # take the median in f1
+ − 3037 index.median.1 <- which(f1$cdf>0.5)[1]
+ − 3038 q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins
+ − 3039 q.1.big <- f1$cdf[index.median.1]
+ − 3040
+ − 3041 # find pseudo value for x.middle[1] on normal(0,1)
+ − 3042 pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+ − 3043 pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+ − 3044
+ − 3045 # find pseudo value for x.middle[2] on normal(0,1)
+ − 3046 pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+ − 3047 pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+ − 3048
+ − 3049 mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)
+ − 3050
+ − 3051 sigma <- (pseudo.small.0-mu)/pseudo.small.1
+ − 3052
+ − 3053 return(list(mu=mu, sigma=sigma))
+ − 3054 }
+ − 3055
+ − 3056 # generate labels
+ − 3057
+ − 3058 # find the part of data with overlap
+ − 3059
+ − 3060 # find the percentile on noise and signal
+ − 3061
+ − 3062 # Suppose there are signal and noise components, with mean=0 and sd=1 for noise
+ − 3063 # x and x.label are the rank of the observations and their labels,
+ − 3064 # find the mean and sd of the other component
+ − 3065 # x.label takes values of 0 and 1
+ − 3066 find.mu.sigma <- function(x, x.label){
+ − 3067
+ − 3068 x.0 <- x[x.label==0]
+ − 3069 x.1 <- x[x.label==1]
+ − 3070
+ − 3071 n.x0 <- length(x.0)
+ − 3072 n.x1 <- length(x.1)
+ − 3073
+ − 3074 x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1))
+ − 3075 o <- order(x.end)
+ − 3076 x.middle <- x.end[o][c(2,3)]
+ − 3077
+ − 3078 # the smaller end of the overlap
+ − 3079 q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1)
+ − 3080 q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1)
+ − 3081
+ − 3082 # the bigger end of the overlap
+ − 3083 q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1)
+ − 3084 q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1)
+ − 3085
+ − 3086 # find pseudo value for x.middle[1] on normal(0,1)
+ − 3087 pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
+ − 3088 pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)
+ − 3089
+ − 3090 # find pseudo value for x.middle[2] on normal(0,1)
+ − 3091 pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
+ − 3092 pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)
+ − 3093
+ − 3094 mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)
+ − 3095
+ − 3096 sigma <- (pseudo.small.0-mu)/pseudo.small.1
+ − 3097
+ − 3098 return(list(mu=mu, sigma=sigma))
+ − 3099 }
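
## The two-point construction in init.dist()/find.mu.sigma() solves a small linear system:
## at each overlap endpoint, qnorm(q0) = mu + sigma*qnorm(q1), where q0 and q1 are the noise
## and signal cdf values at that point. A minimal check under made-up quantiles (the demo
## helper is hypothetical); the closed-form solution matches the expressions used above.
demo.find.mu.sigma.idea <- function(){
  mu <- 2; sigma <- 1.5
  w <- c(0.8, 2.4)                          # two latent overlap endpoints
  q0 <- pnorm(w, 0, 1)                      # noise cdf at the endpoints
  q1 <- pnorm(w, mu, sigma)                 # signal cdf at the endpoints
  a0 <- qnorm(q0[1]); a1 <- qnorm(q1[1])
  b0 <- qnorm(q0[2]); b1 <- qnorm(q1[2])
  sigma.hat <- (a0 - b0)/(a1 - b1)
  mu.hat <- a0 - sigma.hat*a1
  c(mu.hat=mu.hat, sigma.hat=sigma.hat)     # recovers mu=2, sigma=1.5 exactly
}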