# revised on 2-20-10
# - fix error in pass.structure: reverse rank.combined, so that big sig.values
#   are ranked with small numbers (1, 2, ...)
# - fix error in get.ez.tt.all: get ez.cutoff from the sorted e.z

#
# modified EM procedure to compute the empirical CDF more precisely - 09/2009


# this file contains the functions for
# 1. computing the correspondence profile (upper rank intersection and its derivative)
# 2. inference of the copula mixture model
#
# It also has functions for
# 1. reading peak caller results
# 2. processing and matching called peaks
# 3. plotting results


################ read peak caller results

# process narrowPeak format
# some peak callers may not report q-values, p-values or fold enrichment,
# so their output needs further processing before comparison
#
# stop.exclusive: is the basepair of peak.list$stop exclusive? In narrowPeak and broadPeak format it is exclusive.
# If it is exclusive, we need to subtract 1 from peak.list$stop to avoid the same basepair being both a start and a stop of two
# adjacent peaks, which causes trouble when finding the correct intersection
process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){

  aa <- read.table(narrow.file)

  if(broadpeak){
    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9)
  }else{
    bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10)
  }

  if(summit=="summit"){
    bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes
  }

  bb <- concatenate.chr(bb.ori, chr.size)

  # bb <- bb.ori

  # remove the peaks that have the same start and stop value
  bb <- bb[bb$start != bb$stop,]

  if(stop.exclusive==T){
    bb$stop <- bb$stop-1
  }

  if(!is.null(half.width)){
    bb$start.ori <- bb$start
    bb$stop.ori <- bb$stop

    # if a peak is narrower than the specified window, keep its width;
    # otherwise chop wider peaks to the specified width
    width <- bb$stop-bb$start+1
    is.wider <- width > 2*half.width

    if(summit=="offset" | summit=="summit"){ # if summit is an offset from start
      bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width
      bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width
    } else {
      if(summit=="unknown"){
        bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width
        bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width
      }
    }
  }

  bb <- clean.data(bb)
  invisible(list(data.ori=bb.ori, data.cleaned=bb))
}

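# Illustrative usage sketch (not part of the original code): reading a
# narrowPeak file and a chromosome-size table, then preprocessing the peaks.
# The file names here are hypothetical placeholders.
if(FALSE){
  chr.size <- read.table("genome.chrom.sizes")      # two columns: chromosome name, length
  rep1 <- process.narrowpeak("rep1.narrowPeak", chr.size,
                             half.width=NULL, summit="offset", stop.exclusive=T)
  head(rep1$data.cleaned)   # peaks on concatenated coordinates, ready for matching
}
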
# clean data
# and concatenate chromosomes if needed
clean.data <- function(adata){

  # remove the peaks that have the same start and stop value
  adata <- adata[adata$start != adata$stop,]

  # if some stops and starts are the same, we need to fix them
  stop.in.start <- is.element(adata$stop, adata$start)
  n.fix <- sum(stop.in.start)
  if(n.fix > 0){
    print(paste("Fix", n.fix, "stops\n"))
    adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1
  }

  return(adata)
}

# concatenate peaks
# peaks: the data frame holding all the peaks
# chr.size: table with the length of each chromosome
# chr files should come from the species that the data is from
concatenate.chr <- function(peaks, chr.size){

  # chr.size <- read.table(chr.file)
  chr.o <- order(chr.size[,1])
  chr.size <- chr.size[chr.o,]

  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)

  peaks$start.ori <- peaks$start
  peaks$stop.ori <- peaks$stop

  for(i in 1:nrow(chr.size)){
    is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i])
    if(sum(is.in)>0){
      peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i]
      peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i]
    }
  }

  invisible(peaks)
}

deconcatenate.chr <- function(peaks, chr.size){

  chr.o <- order(chr.size[,1])
  chr.size <- chr.size[chr.o,]

  chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2]))
  chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift)

  peaks$chr <- rep(NA, nrow(peaks))

  for(i in 1:(nrow(chr.size.cum)-1)){
    is.in <- peaks$start > chr.size.cum[i,2] & peaks$start <= chr.size.cum[i+1, 2]
    if(sum(is.in)>0){
      peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
      peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1
      peaks[is.in,]$chr <- chr.size[i,1]
    }
  }

  # handle the last chromosome, which is not covered by the loop above
  i <- nrow(chr.size.cum)
  is.in <- peaks$start > chr.size.cum[i, 2]
  if(sum(is.in)>0){
    peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
    peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1
    peaks[is.in,]$chr <- chr.size[i,1]
  }

  invisible(peaks)
}

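# Illustrative sketch (not part of the original pipeline): concatenate.chr()
# expects chr.size as a two-column table (chromosome name, chromosome length),
# for example the result of read.table() on a chromosome-size file. The toy
# peaks and sizes below are made up for demonstration only.
if(FALSE){
  chr.size.toy <- data.frame(V1=c("chr1", "chr2"), V2=c(1000, 800))
  peaks.toy <- data.frame(chr=c("chr1", "chr2"), start=c(100, 50), stop=c(150, 90))
  peaks.cat <- concatenate.chr(peaks.toy, chr.size.toy)     # the chr2 peak is shifted by 1000
  peaks.back <- deconcatenate.chr(peaks.cat, chr.size.toy)  # maps concatenated coordinates back per chromosome
}
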
################ preprocessing peak calling output


#
# read two calling results and sort by peak starting locations,
# then find the overlap between peaks
# INPUT:
#   rep1: the 1st replicate
#   rep2: the 2nd replicate
# OUTPUT:
#   id1, id2: the labels for the identified peaks on the replicates
find.overlap <- function(rep1, rep2){

  o1 <- order(rep1$start)
  rep1 <- rep1[o1,]

  o2 <- order(rep2$start)
  rep2 <- rep2[o2,]

  n1 <- length(o1)
  n2 <- length(o2)

  # assign common IDs to peaks
  id1 <- rep(0, n1) # ID assigned on rep1
  id2 <- rep(0, n2) # ID assigned on rep2
  id <- 1           # keeps track of the common IDs

  # check if the two replicates overlap with each other
  i <- 1
  j <- 1

  while(i <= n1 || j <= n2){

    # && (id1[n1] ==0 || id2[n2] ==0)

    # if one list runs out
    if(i > n1 && j < n2){

      j <- j+1
      id2[j] <- id
      id <- id +1
      next
    } else{
      if(j > n2 && i < n1){
        i <- i+1
        id1[i] <- id
        id <- id +1
        next
      } else {
        if(i >= n1 && j >= n2)
          break
      }
    }

    # if no overlap

    if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){

      # at the start of the loop, when neither peak has been assigned an ID,
      # the one located in front is assigned first
      if(id1[i] ==0 && id2[j]==0){
        if(rep1$stop[i] < rep2$stop[j]){
          id1[i] <- id
        } else {
          id2[j] <- id
        }
      } else { # in the middle of the loop, when one is already assigned
        # the one that has not been assigned gets assigned
        # if(id1[i] ==0){ # id1[i] is not assigned
        #   id1[i] <- id
        # } else { # id2[j] is not assigned
        #   id2[j] <- id
        # }

        # order the IDs according to location
        if(rep1$stop[i] <= rep2$stop[j]){
          id1[i] <- max(id2[j], id1[i])
          id2[j] <- id
        } else {
          if(rep1$stop[i] > rep2$stop[j]){
            id2[j] <- max(id1[i], id2[j])
            id1[i] <- id
          }
        }

      }

      id <- id +1

    } else { # if overlap

      if(id1[i] == 0 && id2[j] == 0){ # no label assigned yet
        id1[i] <- id
        id2[j] <- id
        id <- id +1
      } else { # one peak already has a label, the other is 0

        id1[i] <- max(id1[i], id2[j]) # copies the label of the assigned peak without knowing which one is already assigned
        id2[j] <- id1[i]              # synchronize the labels
      }

    }

    if(rep1$stop[i] < rep2$stop[j]){
      i <- i+1
    } else {
      j <- j+1
    }

  }

  invisible(list(id1=id1, id2=id2))

}

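# Illustrative toy example (not part of the original code): two small peak
# lists. Overlapping peaks receive the same common ID; only the start/stop
# columns are required for this step.
if(FALSE){
  rep1 <- data.frame(start=c(100, 400), stop=c(200, 500))
  rep2 <- data.frame(start=c(150, 600), stop=c(250, 700))
  ids <- find.overlap(rep1, rep2)
  ids$id1
  ids$id2   # the first peaks of rep1 and rep2 share an ID because they overlap
}
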
# Impute the missing significance value for the peaks called only on one replicate.
# INPUT:
#   rep1, rep2: the two peak calling outputs
#   id1, id2: the IDs assigned by function find.overlap, vectors
#     If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2
#   p.value.impute: the significance value to impute for the missing peaks
# OUTPUT:
#   rep1, rep2: peaks ordered by the start locations, with imputed peaks
#   id1, id2: the IDs with imputed peaks
fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){

  # rep1 <- data.frame(chr=rep1$chr, start=rep1$start, stop=rep1$stop, sig.value=rep1$sig.value)
  # rep2 <- data.frame(chr=rep2$chr, start=rep2$start, stop=rep2$stop, sig.value=rep2$sig.value)

  o1 <- order(rep1$start)
  rep1 <- rep1[o1,]

  o2 <- order(rep2$start)
  rep2 <- rep2[o2,]

  entry.in1.not2 <- !is.element(id1, id2)
  entry.in2.not1 <- !is.element(id2, id1)

  if(sum(entry.in1.not2) > 0){

    temp1 <- rep1[entry.in1.not2, ]

    # impute the significance values
    temp1$sig.value <- p.value.impute
    temp1$signal.value <- p.value.impute
    temp1$p.value <- p.value.impute
    temp1$q.value <- p.value.impute

    rep2.filled <- rbind(rep2, temp1)
    id2.filled <- c(id2, id1[entry.in1.not2])
  } else {
    id2.filled <- id2
    rep2.filled <- rep2
  }

  if(sum(entry.in2.not1) > 0){

    temp2 <- rep2[entry.in2.not1, ]

    # impute the significance values
    temp2$sig.value <- p.value.impute
    temp2$signal.value <- p.value.impute
    temp2$p.value <- p.value.impute
    temp2$q.value <- p.value.impute

    # append to the end
    rep1.filled <- rbind(rep1, temp2)

    id1.filled <- c(id1, id2[entry.in2.not1])
  } else {
    id1.filled <- id1
    rep1.filled <- rep1
  }

  # sort rep1 and rep2 by the same id
  o1 <- order(id1.filled)
  rep1.ordered <- rep1.filled[o1, ]

  o2 <- order(id2.filled)
  rep2.ordered <- rep2.filled[o2, ]

  invisible(list(rep1=rep1.ordered, rep2=rep2.ordered,
                 id1=id1.filled[o1], id2=id2.filled[o2]))
}

# Merge peaks with the same ID on the same replicate
# (they are generated when two peaks on rep1 map to the same peak on rep2)
# peak.list needs the columns: start, stop and sig.value
merge.peaks <- function(peak.list, id){

  i <- 1
  j <- 1
  dup.index <- c()
  sig.value <- c()
  start.new <- c()
  stop.new <- c()
  id.new <- c()

  # original data
  chr <- c()
  start.ori <- c()
  stop.ori <- c()

  signal.value <- c()
  p.value <- c()
  q.value <- c()

  while(i < length(id)){

    if(id[i] == id[i+1]){
      dup.index <- c(dup.index, i, i+1) # push on dup.index
    } else {
      if(length(dup.index)>0){ # pop from dup.index
        sig.value[j] <- mean(peak.list$sig.value[unique(dup.index)]) # mean of -log(pvalue)
        start.new[j] <- peak.list$start[min(dup.index)]
        stop.new[j] <- peak.list$stop[max(dup.index)]
        id.new[j] <- id[max(dup.index)]

        signal.value[j] <- mean(peak.list$signal.value[unique(dup.index)])
        p.value[j] <- mean(peak.list$p.value[unique(dup.index)])
        q.value[j] <- mean(peak.list$q.value[unique(dup.index)])

        chr[j] <- as.character(peak.list$chr[min(dup.index)])
        start.ori[j] <- peak.list$start.ori[min(dup.index)]
        stop.ori[j] <- peak.list$stop.ori[max(dup.index)]

        dup.index <- c()
      } else { # nothing to pop
        sig.value[j] <- peak.list$sig.value[i]
        start.new[j] <- peak.list$start[i]
        stop.new[j] <- peak.list$stop[i]
        id.new[j] <- id[i]

        signal.value[j] <- peak.list$signal.value[i]
        p.value[j] <- peak.list$p.value[i]
        q.value[j] <- peak.list$q.value[i]

        chr[j] <- as.character(peak.list$chr[i])
        start.ori[j] <- peak.list$start.ori[i]
        stop.ori[j] <- peak.list$stop.ori[i]

      }
      j <- j+1
    }
    i <- i+1
  }

  data.new <- data.frame(id=id.new, sig.value=sig.value, start=start.new, stop=stop.new, signal.value=signal.value, p.value=p.value, q.value=q.value, chr=chr, start.ori=start.ori, stop.ori=stop.ori)
  invisible(data.new)
}

# a wrapper function to fill in missing peaks, merge peaks and impute significance values
# out1 and out2 are two peak calling outputs
pair.peaks <- function(out1, out2, p.value.impute=0){

  aa <- find.overlap(out1, out2)
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)

  cc1 <- merge.peaks(bb$rep1, bb$id1)
  cc2 <- merge.peaks(bb$rep2, bb$id2)

  invisible(list(merge1=cc1, merge2=cc2))
}

# overlap.ratio is a parameter that defines the required fraction of overlap:
# if overlap.ratio = 0, a 1-basepair overlap is counted as an overlap;
# if overlap.ratio is between 0 and 1, it is the minimum proportion of
# overlap required to be called a match;
# it is computed as the overlapping part/min(peak1.length, peak2.length)
pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){

  aa <- find.overlap(out1, out2)
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)

  cc1 <- merge.peaks(bb$rep1, bb$id1)
  cc2 <- merge.peaks(bb$rep2, bb$id2)

  frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop)

  frag.ratio <- apply(frag12, 1, overlap.middle)

  frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0

  cc1$frag.ratio <- frag.ratio
  cc2$frag.ratio <- frag.ratio

  merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,]
  merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,]

  invisible(list(merge1=merge1, merge2=merge2))
}

# x[1], x[2] are the start and end of the first fragment,
# and x[3], x[4] are the start and end of the 2nd fragment.
# When the two fragments overlap, the overlap can be found by sorting the
# four endpoints and taking the difference between the middle two
overlap.middle <- function(x){

  x.o <- x[order(x)]
  f1 <- x[2]-x[1]
  f2 <- x[4]-x[3]

  f.overlap <- abs(x.o[3]-x.o[2])
  f.overlap.ratio <- f.overlap/min(f1, f2)

  return(f.overlap.ratio)
}

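# Illustrative check (not part of the original code): for fragments (100, 200)
# and (150, 400) the sorted endpoints are 100 150 200 400, so the middle two
# differ by 50; the shorter fragment has length 100, giving an overlap ratio of 0.5.
if(FALSE){
  overlap.middle(c(100, 200, 150, 400))  # 0.5
}
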
#######
####### compute correspondence profile
#######

# compute the upper rank intersection for one t
# tv: the upper percentile
# x is sorted by the order of the paired variable
comp.uri <- function(tv, x){
  n <- length(x)
  qt <- quantile(x, prob=1-tv[1]) # tv[1] is t
  # sum(x[1:ceiling(n*tv[2])] >= qt)/n/tv[2]- tv[1]*tv[2] # tv[2] is v
  sum(x[1:ceiling(n*tv[2])] >= qt)/n

}

# compute the correspondence profile
# tt, vv: vectors of percentages in (0, 1)
get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){

  o <- order(x1, x2, decreasing=T)

  # sort x2 by the order of x1
  x2.ordered <- x2[o]

  tv <- cbind(tt, vv)
  ntotal <- length(x1) # number of peaks

  uri <- apply(tv, 1, comp.uri, x=x2.ordered)

  # compute the derivative of URI vs t using small bins
  uri.binned <- uri[seq(1, length(uri), by=4)]
  tt.binned <- tt[seq(1, length(uri), by=4)]
  uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)])

  # smooth uri using a spline
  # first find where the jump is and don't fit the jump
  # this is the index on the left
  # jump.left.old <- which.max(uri[-1]-uri[-length(uri)])
  short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2))

  if(short.list.length < max(tt)){
    jump.left <- which(tt>short.list.length)[1]-1
  } else {
    jump.left <- which.max(tt)
  }

  # reversed.index <- seq(length(tt), 1, by=-1)
  # nequal <- sum(uri[reversed.index]== tt[reversed.index])
  # temp <- which(uri[reversed.index]== tt[reversed.index])[nequal]
  # jump.left <- length(tt)-temp

  if(jump.left < 6){
    jump.left <- length(tt)
  }


  if(is.null(spline.df))
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4)
  else{
    uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df)
  }
  # predict the first derivative
  uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1)

  invisible(list(tv=tv, uri=uri,
                 uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)],
                 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left,
                 ntotal=ntotal))
}

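# Illustrative sketch (not part of the original code): computing a
# correspondence profile for two simulated, positively correlated score
# vectors. The percentage grid mirrors the one used in compute.pair.uri().
if(FALSE){
  set.seed(1)
  z <- rnorm(1000)
  x1 <- exp(z + rnorm(1000, sd=0.3))   # toy replicate-1 significance scores
  x2 <- exp(z + rnorm(1000, sd=0.3))   # toy replicate-2 significance scores
  tt <- seq(0.01, 1, by=0.01)
  uri.toy <- get.uri.2d(x1, x2, tt, tt, spline.df=6.4)
  plot(tt, uri.toy$uri, xlab="t", ylab="URI(t)")  # correspondence profile
}
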
# change the scale of uri from t (percentage) to n (number of peaks or basepairs)
# this is for plotting multiple pairwise URIs on the same plot
scale.t2n <- function(uri){

  ntotal <- uri$ntotal
  tv <- uri$tv*uri$ntotal
  uri.uri <- uri$uri*uri$ntotal
  jump.left <- uri$jump.left
  uri.spl <- uri$uri.spl
  uri.spl$x <- uri$uri.spl$x*uri$ntotal
  uri.spl$y <- uri$uri.spl$y*uri$ntotal

  t.binned <- uri$t.binned*uri$ntotal
  uri.slope <- uri$uri.slope
  uri.der <- uri$uri.der
  uri.der$x <- uri$uri.der$x*uri$ntotal
  uri.der$y <- uri$uri.der$y

  uri.n <- list(tv=tv, uri=uri.uri, t.binned=t.binned, uri.slope=uri.slope, uri.spl=uri.spl, uri.der=uri.der, ntotal=ntotal, jump.left=jump.left)
  return(uri.n)
}

# a wrapper for running URI on peaks from peak calling results
# both data.1 and data.2 are calling results in narrowPeak format
compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){

  tt <- seq(0.01, 1, by=0.01)
  vv <- tt

  if(sig.value1=="signal.value"){
    data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$signal.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
  } else {
    if(sig.value1=="p.value"){
      data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$p.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
    } else {
      if(sig.value1=="q.value"){
        data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$q.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value)
      }
    }
  }

  if(sig.value2=="signal.value"){
    data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$signal.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
  } else {
    if(sig.value2=="p.value"){
      data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$p.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
    } else {
      if(sig.value2=="q.value"){
        data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$q.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value)
      }
    }
  }

  ### by peaks
  # data12.enrich <- pair.peaks(data.1.enrich, data.2.enrich)
  data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio)
  uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df)
  uri.n <- scale.t2n(uri)

  return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2))

}

# compute URI for a matched sample
get.uri.matched <- function(data12, df=10){

  tt <- seq(0.01, 1, by=0.01)
  vv <- tt
  uri <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, tt, vv, spline.df=df)

  # change scale from t to n
  uri.n <- scale.t2n(uri)

  return(list(uri=uri, uri.n=uri.n))

}

# map.uv is a pair of significance values corresponding to the specified consistency FDR,
# assuming the values in map.uv and the nominal values are linearly related
# data.set is the original data set
# sig.value is the name of the significance value in map.uv, say enrichment
# nominal.value is the one we want to map to, say q-value
#
map.sig.value <- function(data.set, map.uv, nominal.value){

  index.nominal <- which(names(data.set$merge1)==nominal.value)
  nentry <- nrow(map.uv)
  map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")])

  for(i in 1:nentry){

    map.nominal[i, "sig.value1"] <- data.set$merge1[unique(which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))), index.nominal]
    map.nominal[i, "sig.value2"] <- data.set$merge2[unique(which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))), index.nominal]
  }

  invisible(map.nominal)
}

############### plot correspondence profile

# plot multiple comparisons wrt one template
# uri.list contains the total number of peaks
# plot.missing=F: do not plot the missing points on the right
plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){

  if(is.null(col.txt))
    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")

  n <- length(uri.n.list)

  ntotal <- c()
  for(i in 1:n)
    ntotal[i] <- uri.n.list[[i]]$ntotal

  jump.left <- c()
  jump.left.der <- c()
  ncommon <- c()
  for(i in 1:n){
    # jump.left[i] <- which.max(uri.n.list[[i]]$uri[-1]-uri.n.list[[i]]$uri[-length(uri.n.list[[i]]$uri)])
    # if(jump.left[i] < 6)
    #   jump.left[i] <- length(uri.n.list[[i]]$uri)

    ## reversed.index <- seq(length(uri.n.list[[i]]$tv[,1]), 1, by=-1)
    ## nequal <- sum(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])
    ## temp <- which(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])[nequal]
    ## jump.left[i] <- length(uri.n.list[[i]]$tv[,1])-temp

    # jump.left.der[i] <- sum(uri.n.list[[i]]$t.binned < uri.n.list[[i]]$uri.der$x[length(uri.n.list[[i]]$uri.der$x)])

    jump.left[i] <- uri.n.list[[i]]$jump.left
    jump.left.der[i] <- jump.left[i]
    ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1]
  }


  if(plot.missing){
    max.peak <- max(ntotal)
  } else {
    max.peak <- max(ncommon)*1.05
  }

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "uri.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }

  plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2)

  for(i in 1:n){

    if(plot.missing){
      points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5)
    } else {
      points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5)
    }
    lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4)
  }
  abline(coef=c(0,1), lty=3)
  legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)
  if(!is.null(title.txt))
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "duri.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }
  plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2)

  for(i in 1:n){
    # if(plot.missing){
    #   points(uri.n.list[[i]]$t.binned, uri.n.list[[i]]$uri.slope, col=col.txt[i+col.start], cex=0.5)
    # } else {
    #   points(uri.n.list[[i]]$t.binned[1:jump.left.der[i]], uri.n.list[[i]]$uri.slope[1:jump.left.der[i]], col=col.txt[i+col.start], cex=0.5)
    # }
    lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4)
  }
  abline(h=1, lty=3)
  legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2)

  if(!is.null(title.txt))
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

}

#######################
####################### copula fitting for matched peaks
#######################

# estimation of the mixed copula model

# 4-5-09
# A nonparametric estimation of the mixed copula model


# updated

# c1, c2, f1, f2, g1, g2 are vectors
# c1*f1*g1 and c2*f2*g2 are the joint densities of the two components
# xd1 and yd1 are the values of the marginals for the first component
# xd2 and yd2 are the values of the marginals for the 2nd component
#
# ez is the probability of being in the consistent group
get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){

  return(p*c1*xd1*yd1/(p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2))
}

# checked

# this returns the copula density c(t, s), not the full joint density c(t, s)*f1*f2;
# since nonparametric estimates are used for f1 and f2, and they are held constant
# while the copula parameter is optimized, they are not needed in the optimization
#
# bivariate Gaussian copula density
# t and s are vectors of the same length, both are percentiles
# returns a vector
gaussian.cop.den <- function(t, s, rho){

  A <- qnorm(t)^2 + qnorm(s)^2
  B <- qnorm(t)*qnorm(s)

  loglik <- -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A-2*B)

  return(exp(loglik))
}

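# Quick sanity check (illustrative, not part of the original code): at rho = 0
# the Gaussian copula density is identically 1 (independence), and for rho > 0
# it is larger when both percentiles are jointly high than when they disagree.
if(FALSE){
  gaussian.cop.den(c(0.2, 0.5, 0.9), c(0.3, 0.5, 0.8), 0)             # all equal to 1
  gaussian.cop.den(0.9, 0.9, 0.8) > gaussian.cop.den(0.9, 0.1, 0.8)   # TRUE
}
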
clayton.cop.den <- function(t, s, rho){

  if(rho > 0)
    return(exp(log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1)))

  if(rho==0)
    return(1)

  if(rho<0)
    stop("Incorrect Clayton copula coefficient")

}

# checked
# estimate rho of the Gaussian copula by maximum likelihood
mle.gaussian.copula <- function(t, s, e.z){

  # bound the search interval away from rho = +-1
  l.c <- function(rho, t, s, e.z){
    # cat("rho=", rho, "\n")
    sum(e.z*log(gaussian.cop.den(t, s, rho)))}

  rho.max <- optimize(f=l.c, c(-0.998, 0.998), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)

  # print(rho.max$m)

  # cat("cor=", cor(qnorm(t)*e.z, qnorm(s)*e.z), "\t", "rho.max=", rho.max$m, "\n")
  # return(sign(rho.max$m)/(1+rho.max$m))
  return(rho.max$m)
}

# estimate the MLE of rho for the Clayton copula
mle.clayton.copula <- function(t, s, e.z){

  l.c <- function(rho, t, s, e.z){
    lc <- sum(e.z*log(clayton.cop.den(t, s, rho)))
    # cat("rho=", rho, "\t", "l.c=", lc, "\n")
    return(lc)
  }

  rho.max <- optimize(f=l.c, c(0.1, 20), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z)

  return(rho.max$m)
}

# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank-transformed marginals
loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}

loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){

  # evaluate the pdf and cdf of each component from the marginal estimates
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }
  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}

# estimate the marginals of each component using a histogram estimator within the EM
# returns the density, breaks, and cdf of the histogram estimator
est.mar.hist <- function(x, e.z, breaks){

  binwidth <- c()
  nbin <- length(breaks)-1
  nx <- length(x)

  # the histogram
  x1.pdf <- c()
  x2.pdf <- c()
  x1.cdf <- c()
  x2.cdf <- c()

  # the pdf for each point
  x1.pdf.value <- rep(NA, nx)
  x2.pdf.value <- rep(NA, nx)

  x1.cdf.value <- rep(NA, nx)
  x2.cdf.value <- rep(NA, nx)

  for(i in 1:nbin){

    binwidth[i] <- breaks[i+1] - breaks[i]
    if(i < nbin)
      in.bin <- x>= breaks[i] & x < breaks[i+1]
    else # last bin
      in.bin <- x>= breaks[i] & x <=breaks[i+1]

    # add one observation to each bin to avoid empty bins
    # multiply by (nx+nbin)/(nx+nbin+1) to avoid blowing up when looking up
    # quantiles
    x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)
    x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1)


    # x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1)
    # x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1)

    # treat each bin as a value for a discrete variable
    # x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i])
    # x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i])


    # cumulative density before reaching bin i
    if(i>1){
      x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)])
      x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)])
    } else{
      x1.cdf[i] <- 0
      x2.cdf[i] <- 0
    }

    # make a vector of length nx to store the values of the pdf and cdf for each x:
    # this speeds up the computation dramatically
    x1.pdf.value[in.bin] <- x1.pdf[i]
    x2.pdf.value[in.bin] <- x2.pdf[i]

    x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i])
    x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i])
  }

  # x1.cdf <- cumsum(x1.pdf*binwidth)
  # x2.cdf <- cumsum(x2.pdf*binwidth)

  f1 <- list(breaks=breaks, density=x1.pdf, cdf=x1.cdf)
  f2 <- list(breaks=breaks, density=x2.pdf, cdf=x2.cdf)

  f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value)
  f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value)

  return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value))
}

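# Illustrative sketch (not part of the original code): estimating the two
# component marginals from ranked data with equal-width breaks, the same way
# the EM routines below use est.mar.hist(). The posterior weights e.z here
# are made-up toy values.
if(FALSE){
  set.seed(2)
  x <- rank(rnorm(200), ties.method="random")
  e.z <- rep(c(0.9, 0.1), each=100)                  # toy posterior weights
  breaks <- seq(min(x)-0.5, max(x)+0.5, length.out=51)
  mar <- est.mar.hist(x, e.z, breaks)
  str(mar$f1)              # histogram (breaks, density, cdf) of the first component
  head(mar$f1.value$cdf)   # per-observation cdf values used by the copula
}
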
# estimate the marginal cdf from ranks
est.cdf.rank <- function(x, conf.z){

  # add 1 to the denominator to keep the cdf away from 1 (prevents blow-up)
  x1.cdf <- rank(x[conf.z==1])/(length(x[conf.z==1])+1)

  x2.cdf <- rank(x[conf.z==0])/(length(x[conf.z==0])+1)

  return(list(cdf1=x1.cdf, cdf2=x2.cdf))
}

# df is a density function with fields: density, cdf and breaks; x is a scalar
get.pdf <- function(x, df){

  if(x < df$breaks[1])
    cat("x is out of the range of df\n")

  index <- which(df$breaks >= x)[1]

  if(index==1)
    index <- index +1
  return(df$density[index-1])
}

# get the cdf from the histogram estimator for a single value
get.cdf <- function(x, df){

  index <- which(df$breaks >= x)[1]
  if(index==1)
    index <- index +1
  return(df$cdf[index-1])
}

# df is a density function with fields: density, cdf and breaks
get.pdf.cdf <- function(x.vec, df){

  x.pdf <- sapply(x.vec, get.pdf, df=df)
  x.cdf <- sapply(x.vec, get.cdf, df=df)
  return(list(cdf=x.cdf, pdf=x.pdf))
}

# E-step
# x and y are the original observations or ranks
# rho1 and rho2 are the parameters of the two copulas
# f1, f2, g1, g2 are functions, each is a histogram
e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){

  # get the pdf and cdf of each component from the corresponding marginal estimates
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
}

# E-step
# rho1 and rho2 are the parameters of the two copulas
e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){

  # get the pdf and cdf of each component from the corresponding marginal estimates
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }
  return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf))
}

# M-step
m.step.2gaussian <- function(x, y, e.z, breaks){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)
  rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)

  p <- sum(e.z)/length(e.z)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
}

m.step.2copula <- function(x, y, e.z, breaks, copula.txt){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)

  if(copula.txt=="gaussian"){
    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  } else {
    if(copula.txt=="clayton"){
      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)
      rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z)
    }
  }

  p <- sum(e.z)/length(e.z)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar))
}

# E-step: pass values
# x and y are the original observations or ranks
# rho1 and rho2 are the parameters of the two copulas
# f1, f2, g1, g2 are functions, each is a histogram
e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){

  c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
  c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)

  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf,
                pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
  return(e.z)
}


e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){

  if(copula.txt =="gaussian"){
    c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
    c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
  } else {
    if(copula.txt =="clayton"){
      c1 <- clayton.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1)
      c2 <- clayton.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2)
    }
  }

  e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf,
                pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf)
  return(e.z)
}

# M-step: pass values
m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

  if(!fix.rho2)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  else
    rho2 <- 0

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}

m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){

  # compute f1, f2, g1 and g2
  # x.mar <- est.mar.hist(x, e.z, breaks)
  # y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

  if(!fix.rho2)
    rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
  else
    rho2 <- 0

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}



m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){

  # compute f1, f2, g1 and g2
  x.mar <- est.mar.hist(x, e.z, breaks)
  y.mar <- est.mar.hist(y, e.z, breaks)

  # px.1 <- get.pdf.cdf(x, x.mar$f1)
  # px.2 <- get.pdf.cdf(x, x.mar$f2)
  # py.1 <- get.pdf.cdf(y, y.mar$f1)
  # py.2 <- get.pdf.cdf(y, y.mar$f2)

  px.1 <- x.mar$f1.value
  px.2 <- x.mar$f2.value
  py.1 <- y.mar$f1.value
  py.2 <- y.mar$f2.value

  if(copula.txt=="gaussian"){
    rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z)

    if(!fix.rho2)
      rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z)
    else
      rho2 <- 0
  } else {

    if(copula.txt=="clayton"){
      rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z)

      if(!fix.rho2)
        rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z)
      else
        rho2 <- 0
    }
  }

  p <- sum(e.z)/length(e.z)

  pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2)

  return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar,
              pdf.cdf=pdf.cdf))
}

# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank-transformed marginals
loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){

  px.1 <- pdf.cdf$px.1
  px.2 <- pdf.cdf$px.2
  py.1 <- pdf.cdf$py.1
  py.2 <- pdf.cdf$py.2

  c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
  c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}



# updated
# mixture likelihood of two copulas (Gaussian or Clayton)
# nonparametric and rank-transformed marginals
loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){

  px.1 <- pdf.cdf$px.1
  px.2 <- pdf.cdf$px.2
  py.1 <- pdf.cdf$py.1
  py.2 <- pdf.cdf$py.2

  if(copula.txt=="gaussian"){
    c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1)
    c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2)
  } else {
    if(copula.txt=="clayton"){
      c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1)
      c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2)
    }
  }

  sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf))
}

# EM for the 2-component Gaussian copula mixture, with speeded-up computation (unfinished)

em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){

  x <- rank(x, tie="random")
  y <- rank(y, tie="random")

  # x <- rank(x, tie="average")
  # y <- rank(y, tie="average")

  # nbin=20
  xy.min <- min(x, y)
  xy.max <- max(x, y)
  binwidth <- (xy.max-xy.min)/50
  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50)
  # breaks <- seq(xy.min, xy.max, by=binwidth)


  # initialize the marginals
  # initialization: the first p0 fraction of the observations gets a high posterior probability
  # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes the two components overlap

  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))

  if(!stoc)
    para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
  else
    para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)


  if(fix.p){
    p <- p0
  } else {
    p <- para$p
  }

  if(fix.rho2){
    rho2 <- rho2.0
  } else {
    rho2 <- para$rho2
  }

  # rho1 <- 0.8
  rho1 <- para$rho1

  l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)

  loglik.trace <- c()
  loglik.trace[1] <- l0
  # loglik.trace[2] <- l1
  to.run <- T

  i <- 2

  # these two lines to remove
  # x.mar <- est.mar.hist(x, e.z, breaks)
  # y.mar <- est.mar.hist(y, e.z, breaks)

  while(to.run){

    e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf)
    if(!stoc)
      para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2)
    else
      para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)

    # fix x.mar and y.mar : to remove
    # if(!stoc)
    #   para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar)
    # else
    #   para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2)


    if(fix.p){
      p <- p0
    } else {
      p <- para$p
    }

    if(fix.rho2){
      rho2 <- rho2.0
    } else {
      rho2 <- para$rho2
    }

    # rho1 <- 0.8
    rho1 <- para$rho1

    # l0 <- l1
    l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf)
    loglik.trace[i] <- l1

    # cat("l1=", l1, "\n")

    # Aitken acceleration criterion
    if(i > 2){
      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
      to.run <- abs(l.inf - loglik.trace[i]) > eps
      # cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n")
      # cat("l.inf=", l.inf, "\n")
      # cat(l.inf-loglik.trace[i], "\n")
    }

    i <- i+1
  }

  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # number of parameters
  return(list(para=list(p=para$p, rho1=rho1, rho2=rho2),
              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
              breaks=breaks))
}

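# Illustrative sketch (not part of the original code): running the quick EM fit
# on toy correlated scores. The argument values mirror those used by fit.em()
# below; with stoc=F only functions defined above are called.
if(FALSE){
  set.seed(3)
  z <- rbinom(500, 1, 0.4)
  x <- z*rnorm(500, 3, 1) + (1-z)*rnorm(500)   # toy replicate-1 scores
  y <- z*rnorm(500, 3, 1) + (1-z)*rnorm(500)   # toy replicate-2 scores
  fit <- em.2gaussian.quick(x, y, p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01,
                            fix.p=F, stoc=F, fix.rho2=T)
  fit$para        # estimated mixing proportion and copula parameters
  head(fit$e.z)   # posterior probability of the consistent component
}
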
em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){

  x <- rank(x, tie="random")
  y <- rank(y, tie="random")

  # x <- rank(x, tie="first")
  # y <- rank(y, tie="first")

  # nbin=50
  xy.min <- min(x, y)
  xy.max <- max(x, y)
  binwidth <- (xy.max-xy.min)/50
  breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin)
  # breaks <- seq(xy.min, xy.max, by=binwidth)

  # initialize the marginals
  # initialization: the first p0 fraction of the observations gets a high posterior probability
  # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes the two components overlap

  e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0)))


  if(!stoc)
    para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
  else
    para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)

  if(fix.p){
    p <- p0
  } else {
    p <- para$p
  }

  if(fix.rho2){
    rho2 <- rho2.0
  } else {
    rho2 <- para$rho2
  }

  l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)

  loglik.trace <- c()
  loglik.trace[1] <- l0
  # loglik.trace[2] <- l1
  to.run <- T

  i <- 2

  while(to.run){

    e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
    if(!stoc)
      para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt)
    else
      para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt)

    if(fix.p){
      p <- p0
    } else {
      p <- para$p
    }

    if(fix.rho2){
      rho2 <- rho2.0
    } else {
      rho2 <- para$rho2
    }


    # l0 <- l1
    l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt)
    loglik.trace[i] <- l1

    cat("l1=", l1, "\n")

    # Aitken acceleration criterion
    if(i > 2){
      l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2]))
      to.run <- abs(l.inf - loglik.trace[i]) > eps
      cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n")
      # cat("l.inf=", l.inf, "\n")
      # cat(l.inf-loglik.trace[i], "\n")
    }

    i <- i+1
  }

  bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # number of parameters
  return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2),
              loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z,
              loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar,
              breaks=breaks))
}

#######################
####################### fit the EM procedure for the matched peaks
#######################

# remove the unmatched ones
# rm.unmatch <- function(sample1, sample2, p.value.impute=0){
#
#   sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
#   sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
#
#   invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value))
# }


# fit 2-component model
# fit.em <- function(sample12, fix.rho2=T){
#
#   prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)
#
#   em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2,
#                                p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)
#
#   invisible(list(em.fit=em.fit, data.pruned=prune.sample))
# }

rm.unmatch <- function(sample1, sample2, p.value.impute=0){

  sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]
  sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,]

  invisible(list(sample1=sample1.prune, sample2=sample2.prune))
}


# fit 2-component model
fit.em <- function(sample12, fix.rho2=T){

  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)

  em.fit <- em.2gaussian.quick(-prune.sample$sample1$sig.value, -prune.sample$sample2$sig.value,
                               p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2)

  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
}

fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){

  prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2)

  # o <- order(prune.sample$sample1)
  # n <- length(prune.sample$sample1)

  # para <- init(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, c(rep(0, round(n/3)), rep(c(0,1), round(n/6)), rep(1, n-round(n/3)-round(n/6))))

  # temp <- init.dist(f0, f1)
  para <- list()
  para$rho <- 0.6
  para$p <- 0.3
  para$mu <- 2.5
  para$sigma <- 1
  ## para$mu <- -temp$mu
  ## para$sigma <- temp$sigma
  # cat("mu=", para$mu, "sigma=", para$sigma, "\n")

  # em.fit <- em.transform.1loop(-prune.sample$sample1, -prune.sample$sample2,
  cat("EM is running")
  em.fit <- em.transform(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, para$mu, para$sigma, para$rho, para$p, eps=0.01)

  invisible(list(em.fit=em.fit, data.pruned=prune.sample))
}

# fit 1-component model
fit.1.component <- function(data.pruned, breaks){

  # gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
  # clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)

  gaussian.1 <- fit.gaussian.1(-data.pruned$sample1, -data.pruned$sample2, breaks)
  clayton.1 <- fit.clayton.1(-data.pruned$sample1, -data.pruned$sample2, breaks)

  return(list(gaussian.1=gaussian.1, clayton.1=clayton.1))
}

#################
# Fit a single component
#################

# a single Gaussian copula
# if breaks=NULL, use the empirical pdf, otherwise use the histogram estimate
fit.gaussian.1 <- function(x, y, breaks=NULL){

  # rank transform and compute the empirical cdf
  t <- emp.mar.cdf.rank(x)
  s <- emp.mar.cdf.rank(y)

  mle.rho <- mle.gaussian.copula(t, s, rep(1, length(t)))

  c1 <- gaussian.cop.den(t, s, mle.rho)
  cat("c1", sum(log(c1)), "\n")

  if(is.null(breaks)){
    f1 <- emp.mar.pdf.rank(t)
    f2 <- emp.mar.pdf.rank(s)
  } else {
    x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
    y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)

    f1 <- x.mar$f1.value$pdf # only one component
    f2 <- y.mar$f1.value$pdf
  }


  cat("f1", sum(log(f1)), "\n")
  cat("f2", sum(log(f2)), "\n")

  loglik <- sum(log(c1)+log(f1)+log(f2))

  bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)

  return(list(rho=mle.rho, loglik=loglik, bic=bic))
}

1638 # a single Clayton copula
|
|
1639 fit.clayton.1 <- function(x, y, breaks=NULL){
|
|
1640
|
|
1641 # rank transformed and compute the empirical cdf
|
|
1642 t <- emp.mar.cdf.rank(x)
|
|
1643 s <- emp.mar.cdf.rank(y)
|
|
1644
|
|
1645 mle.rho <- mle.clayton.copula(t, s, rep(1, length(t)))
|
|
1646
|
|
1647 c1 <- clayton.cop.den(t, s, mle.rho)
|
|
1648
|
|
1649 if(is.null(breaks)){
|
|
1650 f1 <- emp.mar.pdf.rank(t)
|
|
1651 f2 <- emp.mar.pdf.rank(s)
|
|
1652 } else {
|
|
1653 x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks)
|
|
1654 y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks)
|
|
1655
|
|
1656 f1 <- x.mar$f1.value$pdf # only one component
|
|
1657 f2 <- y.mar$f1.value$pdf
|
|
1658 }
|
|
1659
|
|
1660 loglik <- sum(log(c1)+log(f1)+log(f2))
|
|
1661
|
|
1662 bic <- -2*loglik + log(length(t))*(1+length(breaks)-1)
|
|
1663
|
|
1664 return(list(rho=mle.rho, tau=rho/(rho+2), loglik=loglik, bic=bic))
|
|
1665 }

## obsolete function (01-06-2010)
## compute the average posterior probability to belong to the random component
## for peaks selected at different cutoffs
comp.uri.ez <- function(tt, u, v, e.z){

  u.t <- quantile(u, prob=(1-tt))
  v.t <- quantile(v, prob=(1-tt))

#  ez <- mean(e.z[u >= u.t & v >=u.t])  # Is this wrong?
  ez <- mean(e.z[u >= u.t & v >=v.t])

  return(ez)
}

## obsolete function (01-06-2010)
# compute the largest posterior error probability corresponding to
# the square centered at the origin and spanning the top tt% on both coordinates,
# so that the consistent low-rank ones are excluded
# boundary.txt: either "max" or "min"; if it is an error prob, use "max"
comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){

  u.t <- quantile(u, prob=(1-tt))
  v.t <- quantile(v, prob=(1-tt))

  if(boundary.txt == "max"){
#    ez.bound <- max(e.z[u >= u.t & v >=u.t])
    ez.bound <- max(e.z[u >= u.t & v >=v.t])
  } else {
#    ez.bound <- min(e.z[u >= u.t & v >=u.t])
    ez.bound <- min(e.z[u >= u.t & v >=v.t])
  }

  return(ez.bound)
}

# obsolete functions: 01-06-2010
# compute the error rate
# u.t and v.t are the quantiles
# this one is used for the plots generated initially in the brief writeup,
# and it was used for processing merged data in July before the IDR definition
# was formalized
# It does not implement the current definition of IDR
get.ez.tt.old <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){

  u <- em.fit$data.pruned$sample1
  v <- em.fit$data.pruned$sample2

  tt <- seq(0.01, 0.99, by=0.01)
  if(reverse){
    e.z <- 1-em.fit$em.fit$e.z # this is the error prob
    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max")
  } else {
    e.z <- em.fit$em.fit$e.z
    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min")
  }

  u.t <- quantile(u, prob=(1-tt))
  v.t <- quantile(v, prob=(1-tt))

  # find the levels on the two replicates
  sig.value1 <- c()
  sig.value2 <- c()
  error.prob.cutoff <- c()
  n.selected.match <- c()

  for(i in 1:length(fdr.level)){

    # find which uri.ez is closest to fdr.level
    index <- which.min(abs(uri.ez - fdr.level[i]))
    sig.value1[i] <- u.t[index]
    sig.value2[i] <- v.t[index]
    error.prob.cutoff[i] <- ez.bound[index]
    if(reverse){
      n.selected.match[i] <- sum(e.z<=ez.bound[index])
    } else {
      n.selected.match[i] <- sum(e.z>=ez.bound[index])
    }
  }

  # output the cutoff of posterior probability, signal values on two replicates
  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match)

  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
}

# created: 01-06-2010
# Output IDR at various numbers of selected peaks
# Find the cutoff (idr cutoff, sig.value cutoff on each replicate) for a specified IDR level
# The IDR definition is similar to FDR
get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){

#  u <- em.fit$data.pruned$sample1$sig.value
#  v <- em.fit$data.pruned$sample2$sig.value
  u <- em.fit$data.pruned$sample1
  v <- em.fit$data.pruned$sample2

  e.z <- 1-em.fit$em.fit$e.z # this is the error prob

  o <- order(e.z)
  e.z.ordered <- e.z[o]
  n.select <- c(1:length(e.z))
  IDR <- cumsum(e.z.ordered)/n.select

  u.o <- u[o]
  v.o <- v[o]

  n.level <- length(idr.level)
#  sig.value1 <- rep(NA, n.level)
#  sig.value2 <- rep(NA, n.level)
  ez.cutoff <- rep(NA, n.level)
  n.selected <- rep(NA, n.level)

  for(i in 1:length(idr.level)){

    # find which IDR is closest to idr.level
    index <- which.min(abs(IDR - idr.level[i]))
#    sig.value1[i] <- min(u.o[1:index])
#    sig.value2[i] <- min(v.o[1:index])
    ez.cutoff[i] <- e.z.ordered[index] # use the sorted e.z, as in the 02/20/10 fix to get.ez.tt.all
    n.selected[i] <- sum(e.z<=ez.cutoff[i])
  }

  # output the cutoff of posterior probability, number of selected overlapped peaks
#  map.uv <- cbind(ez.cutoff, sig.value1, sig.value2, n.selected)

  map.uv <- cbind(ez.cutoff, n.selected)

  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
}
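
# A small worked illustration (not run) of the IDR definition used above: sort
# the error probabilities (1 - posterior prob of the reproducible component) and
# take their running mean; the value at n is the expected rate of irreproducible
# discoveries among the top n peaks.
if(FALSE){
  e.z.err <- c(0.01, 0.20, 0.02, 0.05)            # error prob of 4 matched peaks
  e.z.ordered <- sort(e.z.err)                    # 0.01 0.02 0.05 0.20
  IDR.toy <- cumsum(e.z.ordered)/seq_along(e.z.ordered)
  round(IDR.toy, 3)                               # 0.010 0.015 0.027 0.070
}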

#  return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))


### compute the mean of the marginals
get.mar.mean <- function(em.out){

  x.f1 <- em.out$x.mar$f1
  x.f2 <- em.out$x.mar$f2

  y.f1 <- em.out$y.mar$f1
  y.f2 <- em.out$y.mar$f2

  x.stat1 <- get.hist.mean(x.f1)
  x.stat2 <- get.hist.mean(x.f2)
  y.stat1 <- get.hist.mean(y.f1)
  y.stat2 <- get.hist.mean(y.f2)

  return(list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean,
              y.mean1=y.stat1$mean, y.mean2=y.stat2$mean,
              x.sd1=x.stat1$sd, x.sd2=x.stat2$sd,
              y.sd1=y.stat1$sd, y.sd2=y.stat2$sd
              ))
}


# compute the mean of marginals
get.hist.mean <- function(x.f){

  nbreaks <- length(x.f$breaks)
  x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]

  x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
  x.mean <- sum(x.mid*x.f$density*x.bin)
  x.sd <- sqrt(sum(x.mid*x.mid*x.f$density*x.bin)-x.mean^2)

  return(list(mean=x.mean, sd=x.sd))
}
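
# Illustration (not run): get.hist.mean() expects a histogram-style list with
# $breaks and $density, e.g. the output of hist(..., plot=F). For a large
# N(2, 1) sample the recovered mean and sd should be close to 2 and 1.
if(FALSE){
  set.seed(1)
  h <- hist(rnorm(100000, mean=2, sd=1), breaks=50, plot=F)
  get.hist.mean(h)   # approximately list(mean=2, sd=1)
}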

get.hist.var <- function(x.f){

  nbreaks <- length(x.f$breaks)
  x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks]

  x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2
  x.mean <- sum(x.mid*x.f$density*x.bin)

  # note: despite the name, this currently returns the histogram mean only
  return(mean=x.mean)
}

# obsolete function (01-06-2010)
# plot
plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){

  if(is.null(col.txt))
    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")

  x <- c()
  y <- c()

  for(i in 1:length(ez.list)){
    x <- c(x, ez.list[[i]]$n)

    y <- c(y, ez.list[[i]]$uri.ez)
  }

  if(is.null(y.lim))
    y.lim <- c(0, max(y))

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "ez.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }

  plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)

  for(i in 1:length(ez.list)){
    lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5)
  }

#  plot(ez.list[[1]]$u.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)
#  plot(ez.list[[1]]$v.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)

  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)

  if(!is.null(title.txt))   # was !is.null(title), which is always TRUE because title() is a function
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

}


plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="IDR", col.txt=NULL, title.txt=NULL){

  if(is.null(col.txt))
    col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey")

  n.entry <- length(ez.list)
  x <- rep(NA, n.entry)
  y.max <- rep(NA, n.entry)

  for(i in 1:n.entry){
    x[i] <- max(ez.list[[i]]$n)

    y.max[i] <- max(ez.list[[i]]$IDR)
  }

  if(is.null(y.lim))
    y.lim <- c(0, max(y.max))

  if(!is.null(file.name)){
    postscript(paste(plot.dir, "ez.", file.name, sep=""))
    par(mfrow=c(1,1), mar=c(5,5,4,2))
  }

  plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2)

  q <- seq(0.01, 0.99, by=0.01)

  for(i in 1:length(ez.list)){

    n.plot <- round(quantile(ez.list[[i]]$n, prob=q))
    IDR.plot <- ez.list[[i]]$IDR[n.plot]
    lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5)
  }

  legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2)

  if(!is.null(title.txt))   # was !is.null(title), which is always TRUE because title() is a function
    title(title.txt)

  if(!is.null(file.name)){
    dev.off()
  }

}



#############################################################################
#############################################################################
# statistics about peaks selected on the individual replicates
#
# idr.level: the consistency cutoff, say 0.05
# uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r
# ez.list : a list of IDRs computed from get.ez.tt using the same idr.level
#
##################


# obsolete?
# compute the error rate
# u.t and v.t are the quantiles
#
# map back to all peaks and report the number of peaks selected
get.ez.tt.all.old <- function(em.fit, all.data1, all.data2, idr.level){

  u <- em.fit$data.pruned$sample1
  v <- em.fit$data.pruned$sample2

  tt <- seq(0.01, 0.99, by=0.01)
#  if(reverse){
    e.z <- 1-em.fit$em.fit$e.z # this is the error prob
    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max")
#  } else {
#    e.z <- em.fit$em.fit$e.z
#    uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z)
#    ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min")
#  }

  u.t <- quantile(u, prob=(1-tt))
  v.t <- quantile(v, prob=(1-tt))

  # find the levels on the two replicates
  sig.value1 <- c()
  sig.value2 <- c()
  error.prob.cutoff <- c()
  n.selected.match <- c()
  npeak.rep1 <- c()
  npeak.rep2 <- c()

  for(i in 1:length(idr.level)){

    # find which uri.ez is closest to idr.level
    index <- which.min(abs(uri.ez - as.numeric(idr.level[i])))

    sig.value1[i] <- u.t[index]
    sig.value2[i] <- v.t[index]
    error.prob.cutoff[i] <- ez.bound[index]
    n.selected.match[i] <- sum(u>= u.t[index] & v>=v.t[index])

    npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
    npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])
  }

  # output the cutoff of posterior probability, signal values on two replicates
  map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2)

  return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))
}

get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){

  u <- em.fit$data.pruned$sample1$sig.value
  v <- em.fit$data.pruned$sample2$sig.value
#  u <- em.fit$data.pruned$sample1
#  v <- em.fit$data.pruned$sample2

  e.z <- 1-em.fit$em.fit$e.z # this is the error prob

  o <- order(e.z)
  e.z.ordered <- e.z[o]
  n.select <- c(1:length(e.z))
  IDR <- cumsum(e.z.ordered)/n.select

  u.o <- u[o]
  v.o <- v[o]

  n.level <- length(idr.level)
#  sig.value1 <- rep(NA, n.level)
#  sig.value2 <- rep(NA, n.level)
  ez.cutoff <- rep(NA, n.level)
  n.selected <- rep(NA, n.level)
  npeak.rep1 <- rep(NA, n.level)
  npeak.rep2 <- rep(NA, n.level)

  for(i in 1:length(idr.level)){

    # find which IDR is closest to idr.level
    index <- which.min(abs(IDR - idr.level[i]))
#    sig.value1[i] <- min(u.o[1:index])
#    sig.value2[i] <- min(v.o[1:index])
    ez.cutoff[i] <- e.z.ordered[index] # fixed on 02/20/10
    n.selected[i] <- sum(e.z<=ez.cutoff[i])
#    npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i])
#    npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i])
  }

  # output the cutoff of posterior probability, number of selected overlapped peaks
  map.uv <- cbind(ez.cutoff, n.selected)

  return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv))
}

# return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff))


####### the following is for determining thresholds for the merged dataset

############# select peaks above a given threshold
#
# pass.threshold: a simple method that passes the threshold found on the individual replicates to the pooled sample
#
# sig.map.list: a list of matrices holding all the cutoff values, one row per cutoff. The first column is idr.level,
#   the 2nd column is the cutoff of ez, the remaining columns are consistency analyses for the other replicates
# sig.value.name: the name of the sig.value column
# combined: combined dataset
# nrep: number of pairs of comparisons
#
# Procedure:
# 1. Find the significance threshold corresponding to the idr cutoff on the matched peaks.
# 2. Each time we will get two or more (if >2 replicates) cutoffs; report the most stringent and the least stringent
#    cutoff and the number of peaks selected at those two cutoffs
#############

pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){

  sig.map <- c()

  # choose idr.level
  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
  if(length(idr.index) ==0){   # was length(i), but i is not defined at this point
    print("no level matches specified idr.level")
    return(-1)
  }

  for(i in 1:length(sig.map.list))
    sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")])

  npeak.tight <- c()
  npeak.loose <- c()

  max.sig <- max(sig.map)
  min.sig <- min(sig.map)
  selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ]
  selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ]

  selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]
  selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]

  npeak.tight <- nrow(selected.sig.tight)
  npeak.loose <- nrow(selected.sig.loose)

  npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose)

  invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose))
}

#################
# pass the regions selected from consistency analysis to the combined data
# The threshold is determined on the replicates, the regions above the threshold are selected,
# then peaks on the combined data are selected from the selected regions
#
# To avoid being too stringent, regions satisfying the following conditions are selected
# 1. regions above the significance threshold determined by consistency analysis on either replicate
# 2. regions that have consistent low peaks, i.e. posterior prob > threshold but not passing the significance threshold
#
# This method doesn't make a difference when using different thresholds
#################

pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){

  combined <- combined[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
  npair <- length(uri.output) # number of pairs of consistency analysis
  combined.region <- c()

  # choose idr.level
  idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level)
  if(length(idr.index) ==0){
    print("no level matches specified idr.level")
    return(-1)
  }

  for(j in 1:npair){
    # select peaks from individual replicates using individual cutoffs
    above.1 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"]
    above.2 <- uri.output[[j]]$data12.enrich$merge2["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"]  # note: merge2 (was merge1, an apparent copy-paste slip)
    selected.sig.rep1 <- uri.output[[j]]$data12.enrich$merge1[above.1, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
    selected.sig.rep2 <- uri.output[[j]]$data12.enrich$merge2[above.2, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]

    # find the peaks that overlap with reliable peaks in the individual replicates
    overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2
    overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2

    # choose the ones with significant value > 0, which are the overlapped ones

    combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
    combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]

    ## consistent low significant ones
    ## first find consistent ones, i.e. high posterior prob
#    is.consistent <- ez.list[[j]]$e.z < ez.list[[j]]$ez.cutoff

#    data.matched <- keep.match(uri.output[[j]]$data12.enrich$merge1[!above.1, ], uri.output[[j]]$data12.enrich$merge2[!above.2, ], sig.value.impute=0)
#    data.matched$sample1 <- data.matched$sample1[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]
#    data.matched$sample2 <- data.matched$sample2[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]

#    consistent.in1 <- data.matched$sample1[is.consistent, ]
#    consistent.in2 <- data.matched$sample2[is.consistent, ]

#    overlap.consistent.1 <- pair.peaks(consistent.in1, combined)$merge2
#    overlap.consistent.2 <- pair.peaks(consistent.in2, combined)$merge2

    ## choose the ones with significant value > 0, which are the overlapped ones

#    combined.consistent.in1 <- overlap.consistent.1[overlap.consistent.1$sig.value > sig.value.impute, ]
#    combined.consistent.in2 <- overlap.consistent.2[overlap.consistent.2$sig.value > sig.value.impute, ]

#    combined.region <- rbind(combined.region, combined.in1, combined.in2, combined.consistent.in1, combined.consistent.in2)

    combined.region <- rbind(combined.region, combined.in1, combined.in2)

    is.repeated <- duplicated(combined.region$start)
    combined.region <- combined.region[!is.repeated, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")]

  }
  npeak <- nrow(combined.region)

  sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T))

#  idr.combined <- c(min(combined.region[,"q.value"], na.rm=T), max(combined.region[,"q.value"], na.rm=T))

  npeak.stat <- list(idr.level=idr.level, npeak=npeak)

  combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]

  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined))
}

################
# pass structure: this method does another round of inference on the combined data
#
# To make the mixture structure comparable on the replicates and on the combined data, the 2nd inference is done on the peaks
# at the reliable regions of the combined data, using rank-transformed significance values. The mixture structure is estimated using my consistency analysis, which
# estimates the marginal distributions of ranks nonparametrically. Then the significance values are found.
# There are several advantages to doing it this way:
# 1. The premise of passing structure is that the means and variances (i.e. distributions) of the two replicates should be the same.
#    The significance values on the two replicates clearly have different distributions. The structure estimated from consistency
#    analysis will generate similar rank distributions on the two replicates by its setup (i.e. the same number of peaks are paired up).
# 2. Because the pooled sample is a black box, the structure is more likely to be followed in the matched regions than at other locations;
#    after all, we don't know what the other things are. If the structure doesn't even hold on the matched regions,
#    which is possible, it is unlikely to hold on the other regions. Focusing on the reliable regions helps to get rid of those unknown noises.
#
#
# modified on 2-20-10: reverse rank.combined, make big sig.value with small
# ranks, to be consistent with f1 and f2
################

pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){

  columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori")
  combined <- combined[, columns.keep]
  combined.selected.all <- c()

  npair <- length(uri.output) # number of pairs of consistency analysis (was missing; see pass.region)

  for(j in 1:npair){

    sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep]
    sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep]

    # find peaks on the matched region on the combined one
    data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute)

    data.matched$sample1 <- data.matched$sample1[, columns.keep]
    data.matched$sample2 <- data.matched$sample2[, columns.keep]

    overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2
    overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2

    # choose the ones with significant value > sig.value.impute, which are the overlapped ones

    combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ]
    combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ]

    combined.region <- rbind(combined.in1, combined.in2)

    is.repeated <- duplicated(combined.region$start)
    combined.region <- combined.region[!is.repeated,]

    # now rank the peaks in the matched region
    rank.combined <- rank(-combined.region$sig.value)

    # now transform the parameters estimated into the new scale
    npeaks.overlap <- nrow(combined.region)
    npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1))

    # the breaks are the same for x and y
    f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2)
    # the first break boundary goes up when changing scale, need to set it back to a bit smaller than 1
    f1$breaks[1] <- min(f1$breaks[1], 0.95)

    f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2)
    # the first break boundary goes up when changing scale, need to set it back to a bit smaller than 1
    f2$breaks[1] <- min(f2$breaks[1], 0.95)

    p <- em.output[[j]]$em.fit$para$p

    # find the posterior probability
    errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2)

    # compute the FDR and find the cutoff of posterior prob and the sig value
    o <- order(errorprob.combined)
    idr <- cumsum(errorprob.combined[o])/c(1:length(o))
    idr.index <- which(idr > idr.level)[1]
    errorprob.cutoff <- errorprob.combined[o][idr.index]

    # find the minimum significance measure among selected peaks
    sig.value <- min(combined.region$sig.value[o][1:idr.index])
#    sig.value <- quantile(combined.region$sig.value[o][1:idr.index], prob=0.05)
#    sig.value <- quantile(combined.region$sig.value[errorprob.combined<=errorprob.cutoff], prob=0.05)

    # apply the significance value to the whole pooled list
    combined.selected <- combined[combined$sig.value >= sig.value,]

    combined.selected.all <- rbind(combined.selected.all, combined.selected)
  }

  is.repeated <- duplicated(combined.selected.all$start)
  combined.selected.all <- combined.selected.all[!is.repeated,]

  npeak <- nrow(combined.selected.all)

  npeak.stat <- list(idr.level=idr.level, npeak=npeak)

  sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T))

#  idr.combined <- c(min(combined.selected.all[,"q.value"], na.rm=T), max(combined.selected.all[,"q.value"], na.rm=T))
#  combined.selected.all <- deconcatenate.chr(combined.selected.all, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")]

  combined.selected.all <- combined.selected.all[, c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")]
  colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value")

  invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined))
}


# get the posterior probability of the 2nd component
get.comp2.prob <- function(x, p, f1, f2){

  # get pdf and cdf of each component from functions in the corresponding component
  px.1 <- sapply(x, get.pdf, df=f1)
  px.2 <- sapply(x, get.pdf, df=f2)

  comp2prob <- 1 - p*px.1/(p*px.1+(1-p)*px.2)

  return(comp2prob)
}

keep.match <- function(sample1, sample2, sig.value.impute=0){

  sample1.prune <- sample1[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]
  sample2.prune <- sample2[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,]

  invisible(list(sample1=sample1.prune, sample2=sample2.prune))
}


##############################################
#
# The following is for simulation
#
##############################################

# simulate gaussian copula
# u is the uniform random variable and rho is correlation coefficient
simu.gaussian.copula <- function(u, rho){

  n <- length(u)

  # simulate y given x=qnorm(u)
  y <- qnorm(u)*rho + rnorm(n)*sqrt(1-rho^2)

  v <- pnorm(y)

  invisible(v)
}
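
# Quick sanity check (not run): the conditional simulation above should return v
# with a uniform margin and a normal-scores correlation with u close to rho.
if(FALSE){
  set.seed(2)
  u <- runif(5000)
  v <- simu.gaussian.copula(u, rho=0.8)
  cor(qnorm(u), qnorm(v))   # roughly 0.8
  hist(v, breaks=20)        # roughly flat
}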

## simulate Clayton copula from its generating function
## Genest and MacKay (1986)

phi.ori <- function(t, s){

  (t^(-s) -1)/s
}

phi.inv <- function(y, s){

  exp(-log(s*y+1)/s)
}

phi.der <- function(t, s){

  -t^(-s-1)
}

phi.der.inv <- function(y, s){

  exp(log(-y)/(-s-1))
}

get.w <- function(u, t, s){

  phi.der.inv(phi.der(u, s)/t, s)
}

get.v <- function(w, u, s){

  phi.inv(phi.ori(w, s) - phi.ori(u, s), s)
}

# u is a uniform random variable, s is the association parameter
simu.clayton.copula <- function(u, s){

  t <- runif(length(u))

  if(s>0){
    w <- get.w(u, t, s)
    v <- get.v(w, u, s)
    return(v)
  }

  if(s==0){
    return(t)
  }

  if(s <0){
    print("Invalid association parameters for clayton copula")
  }

}


###### 09-09-09

# simulate a two-component copula mixture:
# - marginal distributions for the two variables in each component are both
#   normal and with the same parameters
# p is the mixing proportion of component 1
# n is the total sample size
simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){

  n1 <- round(n*p)
  n2 <- n-n1

  u1 <- runif(n1)

  if(copula.txt =="clayton")
    v1 <- simu.clayton.copula(u1, s1)
  else{
    if(copula.txt =="gaussian")
      v1 <- simu.gaussian.copula(u1, s1)
  }

  u2 <- runif(n2)

  if(copula.txt =="clayton")
    v2 <- simu.clayton.copula(u2, s2)
  else{
    if(copula.txt =="gaussian")
      v2 <- simu.gaussian.copula(u2, s2)
  }

  # generate test statistics
  sample1.1 <- qnorm(u1, mu1, sd1)
  sample1.2 <- qnorm(v1, mu1, sd1)

  sample2.1 <- qnorm(u2, mu2, sd2)
  sample2.2 <- qnorm(v2, mu2, sd2)

  return(list(u=c(u1, u2), v=c(v1, v2),
              u.inv=c(sample1.1, sample2.1), v.inv=c(sample1.2, sample2.2),
              label=c(rep(1, n1), rep(2, n2))))
}
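
# Sketch (not run): simulate a mixture in which component 1 is dependent
# (s1 is its association parameter) and component 2 is independent, then check
# the dependence within each component using the returned labels.
if(FALSE){
  set.seed(3)
  sim <- simu.copula.2mix(s1=0.8, s2=0, p=0.6, n=2000,
                          mu1=3, mu2=0, sd1=1, sd2=1, copula.txt="gaussian")
  cor(sim$u[sim$label==1], sim$v[sim$label==1])   # close to 0.8
  cor(sim$u[sim$label==2], sim$v[sim$label==2])   # close to 0
}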

# use the inverse of the cdf to generate original observations

simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){

  n1 <- round(n*p)
  n2 <- n-n1

  u1 <- runif(n1)

  if(copula.txt =="clayton")
    v1 <- simu.clayton.copula(u1, s1)
  else{
    if(copula.txt =="gaussian")
      v1 <- simu.gaussian.copula(u1, s1)
  }

  u2 <- runif(n2)

  if(copula.txt =="clayton")
    v2 <- simu.clayton.copula(u2, s2)
  else{
    if(copula.txt =="gaussian")
      v2 <- simu.gaussian.copula(u2, s2)
  }

  # generate test statistics
#  sample1.1 <- qnorm(u1, mu1, sd1)
#  sample1.2 <- qnorm(v1, mu1, sd1)

#  sample2.1 <- qnorm(u2, mu2, sd2)
#  sample2.2 <- qnorm(v2, mu2, sd2)

  sample1.x <- inv.cdf.vec(u1, cdf1.x)
  sample1.y <- inv.cdf.vec(v1, cdf1.y)

  sample2.x <- inv.cdf.vec(u2, cdf2.x)
  sample2.y <- inv.cdf.vec(v2, cdf2.y)

  return(list(u=c(u1, u2), v=c(v1, v2),
              u.inv=c(sample1.x, sample2.x), v.inv=c(sample1.y, sample2.y),
              label=c(rep(1, n1), rep(2, n2))))
}

# obtain the original observation by converting a cdf value into a quantile
# u is a single cdf value
# u.cdf is a cdf (assumed to come from a histogram) with break points (cdf$cdf and cdf$breaks)
# the smallest value of the cdf is 0 and the largest is 1
inv.cdf <- function(u, u.cdf){

  # which bin it falls into
  i <- which(u.cdf$cdf> u)[1]
  q.u <- (u - u.cdf$cdf[i-1])/(u.cdf$cdf[i] - u.cdf$cdf[i-1])* (u.cdf$breaks[i]-u.cdf$breaks[i-1]) + u.cdf$breaks[i-1]

  return(q.u)
}

inv.cdf.vec <- function(u, u.cdf){

  # check if the cdf has the right range (0, 1)
  ncdf <- length(u.cdf$cdf)
  nbreaks <- length(u.cdf$breaks)

  if(ncdf == nbreaks-1 & u.cdf$cdf[ncdf]< 1)
    u.cdf$cdf[ncdf] <- 1   # was u.cdf[ncdf], which does not modify the cdf component

  q.u <- sapply(u, inv.cdf, u.cdf)

  return(q.u)
}

# here we simulate a likely real situation:
# the test statistics come from two normal distributions
# according to their labels, and are then converted into p-values w.r.t. H0 using a
# one-sided test.
# The test statistics are correlated for the signal component and independent
# for the noise component
# For the signal component, Y = X + eps, where eps ~ N(0, sigma^2)
simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){

  # first component - signal
  n.signal <- round(n*p)
  n.noise <- n - n.signal

  # labels
  labels <- c(rep(1, n.signal), rep(0, n.noise))

  # test statistics for signal and noise
  mu.signal <- rnorm(n.signal, mu1, sd1)
  x.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
  x.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)

  y.signal <- mu.signal + rnorm(n.signal, 0, sd.e)
  # sd.e can be dependent on signal
  y.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e)

  # concatenate
  x <- c(x.signal, x.noise)
  y <- c(y.signal, y.noise)

  # convert to p-values based on H0
  p.x <- 1-pnorm(x, mu0, sqrt(sd0^2+sd.e^2))
  p.y <- 1-pnorm(y, mu0, sqrt(sd0^2+sd.e^2))

  return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels))
}
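
# Sketch (not run): generate correlated signal / independent noise statistics
# and check the expected behaviour: noise p-values roughly uniform, signal
# p-values piled up near 0, and positive correlation within the signal pairs.
if(FALSE){
  set.seed(4)
  sim <- simu.test.stat(p=0.3, n=5000, mu1=3, sd1=1, mu0=0, sd0=1, sd.e=1)
  hist(sim$p.x[sim$labels==0], breaks=20)          # noise: approximately uniform
  hist(sim$p.x[sim$labels==1], breaks=20)          # signal: skewed toward 0
  cor(sim$x[sim$labels==1], sim$y[sim$labels==1])  # positive for the signal part
}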

# compute the tradeoff and calibration
forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){

  xx <- round(xx, 5)
  o <- order(xx, decreasing=T)

  rand <- 1-labels # if rand==0, consistent
  # order the random indicator in the same order
  rand.o <- rand[o]

  if(sum(rand.o) > ndecoy){
    index.decoy <- which(cumsum(rand.o)==ndecoy)
  } else {
    index.decoy <- which(cumsum(rand.o)==sum(rand.o))
  }

  cutoff.decoy <- xx[o][index.decoy]

  # only consider the unique ones
  cutoff.unique <- unique(xx[o])

  cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]]

  get.decoy.count <- function(cut.off){
    above <- rep(0, length(xx))
    above[xx >= cut.off] <- 1
    decoy.count <- sum(above==1 & rand==1)
    return(decoy.count)
  }

  get.forward.count <- function(cut.off){
    above <- rep(0, length(xx))
    above[xx >= cut.off] <- 1
    forward.count <- sum(above==1 & rand==0)
    return(forward.count)
  }

  get.est.fdr <- function(cut.off){
    above <- rep(0, length(xx))
    above[xx >= cut.off] <- 1
    est.fdr <- 1-mean(xx[above==1])
    return(est.fdr)
  }

  # assuming rand=0 is right
  get.false.neg.count <- function(cut.off){
    below <- rep(0, length(xx))
    below[xx < cut.off] <- 1
    false.neg.count <- sum(below==1 & rand==0)
    return(false.neg.count)
  }

  get.false.pos.count <- function(cut.off){
    above <- rep(0, length(xx))
    above[xx >= cut.off] <- 1
    false.pos.count <- sum(above==1 & rand==1)
    return(false.pos.count)
  }

  decoy <- sapply(cutoff, get.decoy.count)
  forward <- sapply(cutoff, get.forward.count)

  est.fdr <- sapply(cutoff, get.est.fdr)
  emp.fdr <- decoy/(decoy+forward)

  # compute specificity and sensitivity
  # assuming rand=1 is wrong and rand=0 is right
  false.neg <- sapply(cutoff, get.false.neg.count)
  false.pos <- sapply(cutoff, get.false.pos.count)

  true.pos <- sum(rand==0)-false.neg
  true.neg <- sum(rand==1)-false.pos

  sensitivity <- true.pos/(true.pos+false.neg)
  specificity <- true.neg/(true.neg+false.pos)

  return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity))
}


# compute the EM for jackknife and all data, and find FDR
get.emp.jack <- function(a, p0){

  nobs <- length(a$labels)
  est <- list()
  est.all <- list()

  temp.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
#  temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
#              rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")

  est.all$p <- temp.all$para$p
  est.all$rho1 <- temp.all$para$rho1
  est.all$FDR <- get.FDR(temp.all$e.z)

  FDR <- list()
  p <- c()
  rho1 <- c()

  for(i in 1:nobs){

    temp <- em.transform(-a$p.x[-i], -a$p.y[-i], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
#    temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
#            rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")

    est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))

    FDR[[i]] <- est[[i]]$FDR # this is the FDR for top n peaks
    p[i] <- est[[i]]$p
    rho1[i] <- est[[i]]$rho1
  }

  est.jack <- list(FDR=FDR, p=p, rho1=rho1)
  return(list(est.jack=est.jack, est.all=est.all))
}


# get the npeaks corresponding to the nominal FDR estimated from the sample
# and find the corresponding FDR from the entire data
get.FDR.jack <- function(est, FDR.nominal){

  nobs <- length(est$est.jack$FDR)
  FDR.all <- c()
  top.n <- c()

  for(i in 1:nobs){
    top.n[i] <- max(which(est$est.jack$FDR[[i]] <= FDR.nominal))
    FDR.all[i] <- est$est.all$FDR[top.n[i]]
  }

  invisible(list(FDR.all=FDR.all, top.n=top.n))
}

# compute jackknife pseudonumbers
# a is the dataset
get.emp.IF <- function(a, p0){

  nobs <- length(a$labels)
  est <- list()
  est.all <- list()

  temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7,
              rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")

  est.all$p <- temp.all$para$p
  est.all$rho1 <- temp.all$para$rho1
  est.all$FDR <- get.FDR(temp.all$e.z)

  IF.FDR <- list()
  IF.p <- c()
  IF.rho1 <- c()

  for(i in 1:nobs){

    temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7,
            rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian")

    est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z))

    IF.FDR[[i]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[i]]$FDR) # this is the FDR for top n peaks
    IF.p[i] <- (nobs-1)*(est.all$p - est[[i]]$p)
    IF.rho1[i] <- (nobs-1)*(est.all$rho1 - est[[i]]$rho1)
  }

  emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1)

  invisible(list(emp.IF=emp.IF, est.all=est.all, est=est))
}

# e.z is the posterior probability of being in the signal component
get.FDR <- function(e.z){

  e.z.o <- order(1-e.z)
  FDR <- cumsum(1-e.z[e.z.o])/c(1:length(e.z.o))

  invisible(FDR)
}
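
# Worked illustration (not run): with posterior probabilities of being in the
# signal component, the estimated FDR of the top n peaks is the running mean of
# (1 - e.z) after sorting the most confident peaks first.
if(FALSE){
  e.z.toy <- c(0.99, 0.95, 0.80, 0.40)
  get.FDR(e.z.toy)   # 0.010 0.030 0.087 0.215 (approximately)
}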

# get the FDR of selecting the top n peaks
# IF.est is the sample influence function
# top.n
get.IF.FDR <- function(IF.est, top.n){

  nobs <- length(IF.est$emp.IF$FDR)
  FDR <- c()

  # influence function of p
  for(i in 1:nobs)
    FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]

  invisible(FDR)
}

# get the sample influence function for FDR at a given FDR size
# 1. find the number of peaks selected at a given FDR computed from all obs
# 2. use the number to find the sample influence function for FDR
# IF.est$est.all is the FDR with all peaks
get.IF.FDR.all <- function(IF.est, FDR.size){

  top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size))
  nobs <- length(IF.est$est.all$FDR)
  FDR <- c()

  # influence function of p
  for(i in 1:nobs)
    FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n]

  invisible(list(FDR=FDR, top.n=top.n))
}

# plot the simulated URI curves
# note: this plotting helper assumes n0, n, rho1, rho2 and p exist in the calling environment
plot.simu.uri <- function(x, y){

  tt <- seq(0.01, 0.99, by=0.01)
  uri <- sapply(tt, comp.uri.prob, u=x, v=y)
  uri.thin <- uri[seq(1, length(tt), by=3)]
  tt.thin <- tt[seq(1, length(tt), by=3)]
  duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)])
  uri.spl <- smooth.spline(tt, uri, df=6.4)
  uri.der <- predict(uri.spl, tt, deriv=1)

  par(mfrow=c(2,2))
  plot(x[1:n0], y[1:n0])
  points(x[(n0+1):n], y[(n0+1):n], col=2)
  plot(rank(-x)[1:n0], rank(-y)[1:n0])
  points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n])
  plot(tt, uri)
  lines(c(0,1), c(0,1), lty=2)
  title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep=""))
  plot(tt.thin[-1], duri)
  lines(uri.der)
  abline(h=1)
  invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der))
}


###### new fitting procedure

# 1. rank pairs

# 2. initialization
# 3. convert to pseudo-numbers

# 4. EM

# need plugin and test
# find the middle point between the bins
get.pseudo.mix <- function(x, mu, sigma, rho, p){

  # first compute the cdf for points on the grid
  # generate nw points between [min(-3, mu-3*sigma), max(mu+3*sigma, 3)]
  nw <- 1000
  w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw)
  w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1)

  quan.x <- rep(NA, length(x))

  # interpolate within the grid (stop at nw-1 so that w.cdf[i+1] stays in range)
  for(i in 1:(nw-1)){
    index <- which(x >= w.cdf[i] & x < w.cdf[i+1])
    quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i]
  }

  index <- which(x < w.cdf[1])
  if(length(index)>0)
    quan.x[index] <- w[1]

  index <- which(x >= w.cdf[nw])
  if(length(index)>0)
    quan.x[index] <- w[nw]

#  linear.ext <- function(x, w, w.cdf){
#    # linear interpolation
#    index.up <- which(w.cdf>= x)[1]
#    left.index <- which(w.cdf <=x)
#    index.down <- left.index[length(left.index)]
#    quan.x <- (w[index.up] + w[index.down])/2
#  }

#  x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf)

#  invisible(x.pseudo)
  invisible(quan.x)
}
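
# Illustration (not run): map empirical-CDF values through the inverse CDF of
# the p*N(mu, sigma^2) + (1-p)*N(0, 1) mixture; these pseudo-values are what the
# EM below operates on. The parameter values here are arbitrary.
if(FALSE){
  set.seed(5)
  x <- rnorm(1000)
  x.cdf <- ecdf(x)(x)*length(x)/(length(x)+1)   # keep the values strictly inside (0,1)
  z <- get.pseudo.mix(x.cdf, mu=2.5, sigma=1, rho=0.6, p=0.3)
  summary(z)
}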

# EM to compute the latent structure
# steps:
# 1. raw values are first transformed into pseudovalues
# 2. EM is used to compute the underlying structure, which is a mixture
#    of two normals
em.transform <- function(x, y, mu, sigma, rho, p, eps){

  x.cdf.func <- ecdf(x)
  y.cdf.func <- ecdf(y)
  afactor <- length(x)/(length(x)+1)
  x.cdf <- x.cdf.func(x)*afactor
  y.cdf <- y.cdf.func(y)*afactor

  # initialization
  para <- list()
  para$mu <- mu
  para$sigma <- sigma
  para$rho <- rho
  para$p <- p

  j <- 1
  to.run <- T
  loglik.trace <- c()
  loglik.inner.trace <- c()

  #to.run.inner <- T
  z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
  z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)

#  cat("length(z1)", length(z.1), "\n")
  while(to.run){

    # get pseudo values in each cycle
#    z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
#    z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)

    i <- 1
    while(to.run){

      # EM for latent structure
      e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)
      para <- m.step.2normal(z.1, z.2, e.z)
#      para$rho <- rho
#      para$p <- p
#      para$mu <- mu
#      para$sigma <- sigma
      if(i > 1)
        l.old <- l.new

      # this is just the mixture likelihood of the two-component Gaussian
      l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)

      loglik.inner.trace[i] <- l.new

      if(i > 1){
        to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps
      }

#      if(i > 2){
#        l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2]))

#        if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2])
#          to.run <- abs(l.inf - loglik.inner.trace[i]) > eps
#        else
#          to.run <- F

#      }

      cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n")
      cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n")

      i <- i+1
    }

    # get pseudo values in each cycle
    z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p)
    z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p)

    if(j > 1)
      l.old.outer <- l.new.outer

    l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p)

    loglik.trace[j] <- l.new.outer

    if(j == 1)
      to.run <- T
    else{ # stop when iteration>100
      if(j > 100)
        to.run <- F
      else
        to.run <- l.new.outer - l.old.outer > eps
    }

#    if(j %% 10==0)
    cat("loglik.trace[", j, "]=", loglik.trace[j], "\n")
    cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n")

    j <- j+1
  }

  bic <- -2*l.new + 4*log(length(z.1))

  return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma),
              loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace))
}
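
# Usage sketch (not run): fit the mixture on data simulated from the copula
# mixture above; em.transform() prints its likelihood trace while running. The
# starting values are the same rough guesses used in fit.2copula.em().
if(FALSE){
  set.seed(6)
  sim <- simu.copula.2mix(s1=0.8, s2=0, p=0.6, n=1000,
                          mu1=3, mu2=0, sd1=1, sd2=1, copula.txt="gaussian")
  fit <- em.transform(sim$u.inv, sim$v.inv, mu=2.5, sigma=1, rho=0.6, p=0.3, eps=0.01)
  fit$para        # estimated p, rho, mu, sigma
  head(fit$e.z)   # posterior prob of the reproducible component
}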


# compute the log-likelihood for a mixture of two bivariate normals
loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){

  l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p)))

#  l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0)))
  return(l.m)
}

# check this when rho=1

# log density of a bivariate normal distribution with equal mean and sigma on both dimensions
d.binormal <- function(z.1, z.2, mu, sigma, rho){

  loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2))

  return(loglik)
}

# E-step for computing the latent structure
# e.z is the prob to be in the consistent group
# e.step for estimating the posterior prob
# z.1 and z.2 can be vectors or scalars
e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){

  e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p)

  invisible(e.z)
}

# M-step for computing the latent structure
# m.step for estimating the proportion, mean, sd and correlation coefficient
m.step.2normal <- function(z.1, z.2, e.z){

  p <- mean(e.z)
  mu <- sum((z.1+z.2)*e.z)/2/sum(e.z)
  sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z))
  rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2)))

  return(list(p=p, mu=mu, sigma=sigma, rho=rho))
}


# assume the top p percent of observations are true
# x and y are ranks; estimate starting values for mu, sigma, rho and p
init <- function(x, y, x.label){

  x.o <- order(x)

  x.ordered <- x[x.o]
  y.ordered <- y[x.o]
  x.label.ordered <- x.label[x.o]

  n <- length(x)
  p <- sum(x.label)/n

  rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)])

  temp <- find.mu.sigma(x.ordered, x.label.ordered)
  mu <- temp$mu
  sigma <- temp$sigma

  invisible(list(mu=mu, sigma=sigma, rho=rho, p=p))
}

# find mu and sigma if the distributions of the marginal ranks are known
# take the medians of the two distributions and map back to the original scale
init.dist <- function(f0, f1){

  # take the median in f0
  index.median.0 <- which(f0$cdf>0.5)[1]
  q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins
  q.1.small <- f1$cdf[index.median.0]

  # take the median in f1
  index.median.1 <- which(f1$cdf>0.5)[1]
  q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins
  q.1.big <- f1$cdf[index.median.1]

  # find pseudo value for x.middle[1] on normal(0,1)
  pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
  pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)

  # find pseudo value for x.middle[2] on normal(0,1)
  pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
  pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)

  mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)

  sigma <- (pseudo.small.0-mu)/pseudo.small.1

  return(list(mu=mu, sigma=sigma))
}

# generate labels

# find the part of data with overlap

# find the percentile on noise and signal

# Suppose there are signal and noise components, with mean=0 and sd=1 for noise
# x and x.label are the rank of the observations and their labels,
# find the mean and sd of the other component
# x.label takes values of 0 and 1
find.mu.sigma <- function(x, x.label){

  x.0 <- x[x.label==0]
  x.1 <- x[x.label==1]

  n.x0 <- length(x.0)
  n.x1 <- length(x.1)

  x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1))
  o <- order(x.end)
  x.middle <- x.end[o][c(2,3)]

  # the smaller end of the overlap
  q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1)
  q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1)

  # the bigger end of the overlap
  q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1)
  q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1)

  # find pseudo value for x.middle[1] on normal(0,1)
  pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1)
  pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1)

  # find pseudo value for x.middle[2] on normal(0,1)
  pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1)
  pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1)

  mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1)

  sigma <- (pseudo.small.0-mu)/pseudo.small.1

  return(list(mu=mu, sigma=sigma))
}
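
# Sketch (not run): given ranks and rough 0/1 labels, init() uses the overlap of
# the two groups (via find.mu.sigma) to produce starting values for mu, sigma,
# rho and p. The simulated input is illustrative only.
if(FALSE){
  set.seed(7)
  sim <- simu.test.stat(p=0.3, n=1000, mu1=3, sd1=1, mu0=0, sd0=1, sd.e=1)
  init(rank(sim$x), rank(sim$y), sim$labels)
}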