comparison functions-all-clayton-12-13.r @ 18:7dd341a53e77 draft
modencode-dcc / idr_package (Mercurial)
Uploaded
author:   modencode-dcc
date:     Mon, 21 Jan 2013 13:36:04 -0500
parents:  5e6efd5f3567
children: (none)
1 # revised on 2-20-10 | |
2 # - fix error in pass.structure: reverse rank.combined, so that big sig.value | |
3 # are ranked with small numbers (1, 2, ...) | |
4 # - fix error on get.ez.tt.all: get ez.cutoff from sorted e.z | |
5 | |
6 # | |
7 # modified EM procedure to compute empirical CDF more precisely - 09/2009 | |
8 | |
9 | |
10 | |
11 # this file contains the functions for | |
12 # 1. computing the correspondence profile (upper rank intersection and derivatives) | |
13 # 2. inference of copula mixture model | |
14 # | |
15 # It also has functions for | |
16 # 1. reading peak caller results | |
17 # 2. processing and matching called peaks | |
18 # 3. plotting results | |
19 | |
20 | |
21 ################ read peak caller results | |
22 | |
23 # process narrow peak format | |
# some peak callers may not report q-values, p-values or fold of enrichment;
# those columns need further processing before the comparison
#
# stop.exclusive: is the basepair of peak.list$stop exclusive? In narrowPeak and broadPeak format it is exclusive.
# If it is exclusive, subtract 1 from peak.list$stop so that the same basepair cannot be both the start of one peak
# and the stop of an adjacent peak, which would break the intersection step
30 process.narrowpeak <- function(narrow.file, chr.size, half.width=NULL, summit="offset", stop.exclusive=T, broadpeak=F){ | |
31 | |
32 | |
33 aa <- read.table(narrow.file) | |
34 | |
35 if(broadpeak){ | |
36 bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9) | |
37 }else{ | |
38 bb.ori <- data.frame(chr=aa$V1, start=aa$V2, stop=aa$V3, signal.value=aa$V7, p.value=aa$V8, q.value=aa$V9, summit=aa$V10) | |
39 } | |
40 | |
41 if(summit=="summit"){ | |
42 bb.ori$summit <- bb.ori$summit-bb.ori$start # change summit to offset to avoid error when concatenating chromosomes | |
43 } | |
44 | |
45 bb <- concatenate.chr(bb.ori, chr.size) | |
46 | |
47 #bb <- bb.ori | |
48 | |
  # remove the peaks that have the same start and stop value
50 bb <- bb[bb$start != bb$stop,] | |
51 | |
52 if(stop.exclusive==T){ | |
53 bb$stop <- bb$stop-1 | |
54 } | |
55 | |
56 if(!is.null(half.width)){ | |
57 bb$start.ori <- bb$start | |
58 bb$stop.ori <- bb$stop | |
59 | |
    # if a peak is narrower than the specified window, keep its original width;
    # otherwise trim wider peaks to the specified width around the summit (or midpoint)
62 width <- bb$stop-bb$start +1 | |
63 is.wider <- width > 2*half.width | |
64 | |
65 if(summit=="offset" | summit=="summit"){ # if summit is offset from start | |
66 bb$start[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]-half.width | |
67 bb$stop[is.wider] <- bb$start.ori[is.wider] + bb$summit[is.wider]+half.width | |
68 } else { | |
69 if(summit=="unknown"){ | |
70 bb$start[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) - half.width | |
71 bb$stop[is.wider] <- bb$start.ori[is.wider]+round(width[is.wider]/2) + half.width | |
72 } | |
73 } | |
74 } | |
75 | |
76 bb <- clean.data(bb) | |
77 invisible(list(data.ori=bb.ori, data.cleaned=bb)) | |
78 } | |
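
# A minimal usage sketch for process.narrowpeak (not executed when this file is
# sourced; the file names and chromosome-size table below are hypothetical):
if(FALSE){
  chr.size <- read.table("genome.chrom.sizes")   # column 1: chromosome name, column 2: length
  rep1 <- process.narrowpeak("rep1.narrowPeak", chr.size,
                             half.width=NULL, summit="offset",
                             stop.exclusive=T, broadpeak=F)
  head(rep1$data.cleaned)   # peaks on the concatenated coordinate axis
}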
79 | |
80 # clean data | |
81 # and concatenate chromosomes if needed | |
82 clean.data <- function(adata){ | |
83 | |
  # remove the peaks that have the same start and stop value
85 adata <- adata[adata$start != adata$stop,] | |
86 | |
  # if some stops coincide with starts, they need to be fixed
88 stop.in.start <- is.element(adata$stop, adata$start) | |
89 n.fix <- sum(stop.in.start) | |
90 if(n.fix >0){ | |
    cat("Fix", n.fix, "stops\n")
92 adata$stop[stop.in.start] <- adata$stop[stop.in.start]-1 | |
93 } | |
94 | |
95 return(adata) | |
96 } | |
97 | |
98 # concatenate peaks | |
99 # peaks: the dataframe to have all the peaks | |
100 # chr.file: the file to keep the length of each chromosome | |
101 # chr files should come from the species that the data is from | |
102 #concatenate.chr <- function(peaks, chr.size){ | |
103 | |
104 # chr.size <- read.table(chr.file) | |
105 # chr.o <- order(chr.size[,1]) | |
106 # chr.size <- chr.size[chr.o,] | |
107 # | |
108 # chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) | |
109 # chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) | |
110 # | |
111 # for(i in 1:nrow(chr.size)){ | |
112 # is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i]) | |
113 # if(sum(is.in)>0){ | |
114 # peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i] | |
115 # peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i] | |
116 # } | |
117 # } | |
118 # | |
119 # invisible(peaks) | |
120 #} | |
121 | |
122 | |
123 | |
124 | |
# concatenate peaks from all chromosomes onto a single coordinate axis
# peaks: the data frame holding all the peaks
# chr.size: a table with the name and length of each chromosome
# the chromosome sizes should come from the species that the data is from
129 concatenate.chr <- function(peaks, chr.size){ | |
130 | |
131 # chr.size <- read.table(chr.file) | |
132 chr.o <- order(chr.size[,1]) | |
133 chr.size <- chr.size[chr.o,] | |
134 | |
135 chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) | |
136 chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) | |
137 | |
138 peaks$start.ori <- peaks$start | |
139 peaks$stop.ori <- peaks$stop | |
140 | |
141 for(i in 1:nrow(chr.size)){ | |
142 is.in <- as.character(peaks$chr) == as.character(chr.size.cum$chr[i]) | |
143 if(sum(is.in)>0){ | |
144 peaks[is.in,]$start <- peaks[is.in,]$start + chr.size.cum$shift[i] | |
145 peaks[is.in,]$stop <- peaks[is.in,]$stop + chr.size.cum$shift[i] | |
146 } | |
147 } | |
148 | |
149 invisible(peaks) | |
150 } | |
151 | |
152 | |
153 deconcatenate.chr <- function(peaks, chr.size){ | |
154 | |
155 chr.o <- order(chr.size[,1]) | |
156 chr.size <- chr.size[chr.o,] | |
157 | |
158 chr.shift <- cumsum(c(0, chr.size[-nrow(chr.size),2])) | |
159 chr.size.cum <- data.frame(chr=chr.size[,1], shift=chr.shift) | |
160 | |
161 peaks$chr <- rep(NA, nrow(peaks)) | |
162 | |
163 for(i in 1:(nrow(chr.size.cum)-1)){ | |
164 is.in <- peaks$start > chr.size.cum[i,2] & peaks$start <= chr.size.cum[i+1, 2] | |
165 if(sum(is.in)>0){ | |
166 peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2] | |
167 peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1 | |
168 peaks[is.in,]$chr <- chr.size[i,1] | |
169 } | |
170 } | |
171 | |
  # handle the last chromosome explicitly: the loop above only covers rows
  # 1..(nrow-1), so point i at the last row before applying the same shift
  i <- nrow(chr.size.cum)
  is.in <- peaks$start > chr.size.cum[i, 2]
  if(sum(is.in)>0){
    peaks[is.in,]$start <- peaks[is.in,]$start - chr.size.cum[i,2]
    peaks[is.in,]$stop <- peaks[is.in,]$stop - chr.size.cum[i,2]+1
    peaks[is.in,]$chr <- chr.size[i,1]
  }
180 | |
181 invisible(peaks) | |
182 } | |
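
# A small sketch of the concatenate/deconcatenate round trip (not executed on
# source; the toy peaks and chromosome sizes below are made up for illustration):
if(FALSE){
  chr.size <- data.frame(chr=c("chr1", "chr2"), size=c(1000, 800), stringsAsFactors=FALSE)
  peaks <- data.frame(chr=c("chr1", "chr2"), start=c(100, 50), stop=c(200, 90), stringsAsFactors=FALSE)
  cat.peaks <- concatenate.chr(peaks, chr.size)     # chr2 peaks shifted by 1000
  back <- deconcatenate.chr(cat.peaks, chr.size)    # shifts removed, chr labels restored
}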
183 | |
184 ################ preprocessing peak calling output | |
185 | |
186 | |
187 # | |
188 # read two calling results and sort by peak starting locations, | |
189 # then find overlap between peaks | |
190 # INPUT: | |
191 # rep1: the 1st replicate | |
192 # rep2: the 2nd replicate | |
193 # OUTPUT: | |
194 # id1, id2: the labels for the identified peaks on the replicates | |
195 find.overlap <- function(rep1, rep2){ | |
196 | |
197 o1 <- order(rep1$start) | |
198 rep1 <- rep1[o1,] | |
199 | |
200 o2 <- order(rep2$start) | |
201 rep2 <- rep2[o2,] | |
202 | |
203 n1 <- length(o1) | |
204 n2 <- length(o2) | |
205 | |
206 # assign common ID to peaks | |
207 id1 <- rep(0, n1) # ID assigned on rep1 | |
208 id2 <- rep(0, n2) # ID assigned on rep2 | |
209 id <- 1 # keep track common id's | |
210 | |
211 # check if two replicates overlap with each other | |
212 i <- 1 | |
213 j <- 1 | |
214 | |
215 while(i <= n1|| j <= n2){ | |
216 | |
217 # && (id1[n1] ==0 || id2[n2] ==0) | |
218 | |
219 # if one list runs out | |
220 if(i > n1 && j < n2){ | |
221 | |
222 j <- j+1 | |
223 id2[j] <- id | |
224 id <- id +1 | |
225 next | |
226 } else{ | |
227 if(j > n2 && i < n1){ | |
228 i <- i+1 | |
229 id1[i] <- id | |
230 id <- id +1 | |
231 next | |
232 } else { | |
233 if(i >= n1 && j >=n2) | |
234 break | |
235 } | |
236 } | |
237 | |
238 # if not overlap | |
239 | |
240 if(!(rep1$start[i] <= rep2$stop[j] && rep2$start[j] <= rep1$stop[i])){ | |
241 | |
      # at the start of the loop, when neither peak has an ID yet,
      # the one located in front is assigned first
244 if(id1[i] ==0 && id2[j]==0){ | |
245 if(rep1$stop[i] < rep2$stop[j]){ | |
246 id1[i] <- id | |
247 } else { | |
248 id2[j] <- id | |
249 } | |
      } else { # in the middle of the loop, when one peak is already assigned
        # the one that has not been assigned gets assigned
252 # if(id1[i] ==0){ # id1[i] is not assigned | |
253 # id1[i] <- id | |
254 # } else { # id2[i] is not assigned | |
255 # id2[j] <- id | |
256 # } | |
257 | |
258 # order the id according to location | |
259 if(rep1$stop[i] <= rep2$stop[j]){ | |
260 id1[i] <- max(id2[j], id1[i]) | |
261 id2[j] <- id | |
262 } else { | |
263 if(rep1$stop[i] > rep2$stop[j]){ | |
264 id2[j] <- max(id1[i], id2[j]) | |
265 id1[i] <- id | |
266 } | |
267 } | |
268 | |
269 } | |
270 | |
271 id <- id +1 | |
272 | |
273 } else { # if overlap | |
274 | |
275 if(id1[i] == 0 && id2[j] == 0){ # not assign label yet | |
276 id1[i] <- id | |
277 id2[j] <- id | |
278 id <- id +1 | |
      } else { # one peak has already been assigned a label, the other is still 0
280 | |
        id1[i] <- max(id1[i], id2[j]) # copy the label of the assigned peak without needing to know which one is already assigned
        id2[j] <- id1[i] # synchronize the labels
283 } | |
284 | |
285 } | |
286 | |
287 if(rep1$stop[i] < rep2$stop[j]){ | |
288 i <- i+1 | |
289 } else { | |
290 j <- j+1 | |
291 } | |
292 | |
293 } | |
294 | |
295 invisible(list(id1=id1, id2=id2)) | |
296 | |
297 } | |
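
# A toy sketch of how find.overlap assigns common IDs (not executed on source;
# only start/stop are used by this function, the coordinates are invented):
if(FALSE){
  rep1 <- data.frame(start=c(100, 500), stop=c(200, 600))
  rep2 <- data.frame(start=c(150, 800), stop=c(250, 900))
  ids <- find.overlap(rep1, rep2)
  # ids$id1 and ids$id2 share a value where peaks overlap (here the first pair)
}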
298 | |
# Impute the missing significance value for the peaks called on only one replicate.
301 # INPUT: | |
302 # rep1, rep2: the two peak calling output | |
303 # id1, id2: the IDs assigned by function find.overlap, vectors | |
304 # If id1[i]==id2[j], peak i on rep1 overlaps with peak j on rep2 | |
305 # p.value.impute: the significant value to impute for the missing peaks | |
306 # OUTPUT: | |
307 # rep1, rep2: peaks ordered by the start locations with imputed peaks | |
308 # id1, id2: the IDs with imputed peaks | |
309 fill.missing.peaks <- function(rep1, rep2, id1, id2, p.value.impute){ | |
310 | |
311 # rep1 <- data.frame(chr=rep1$chr, start=rep1$start, stop=rep1$stop, sig.value=rep1$sig.value) | |
312 # rep2 <- data.frame(chr=rep2$chr, start=rep2$start, stop=rep2$stop, sig.value=rep2$sig.value) | |
313 | |
314 o1 <- order(rep1$start) | |
315 rep1 <- rep1[o1,] | |
316 | |
317 o2 <- order(rep2$start) | |
318 rep2 <- rep2[o2,] | |
319 | |
320 entry.in1.not2 <- !is.element(id1, id2) | |
321 entry.in2.not1 <- !is.element(id2, id1) | |
322 | |
323 if(sum(entry.in1.not2) > 0){ | |
324 | |
325 temp1 <- rep1[entry.in1.not2, ] | |
326 | |
327 # impute sig.value | |
328 temp1$sig.value <- p.value.impute | |
329 temp1$signal.value <- p.value.impute | |
330 temp1$p.value <- p.value.impute | |
331 temp1$q.value <- p.value.impute | |
332 | |
333 rep2.filled <- rbind(rep2, temp1) | |
334 id2.filled <- c(id2, id1[entry.in1.not2]) | |
335 } else { | |
336 id2.filled <- id2 | |
337 rep2.filled <- rep2 | |
338 } | |
339 | |
340 if(sum(entry.in2.not1) > 0){ | |
341 | |
342 temp2 <- rep2[entry.in2.not1, ] | |
343 | |
344 # fill in p.values to 1 | |
345 temp2$sig.value <- p.value.impute | |
346 temp2$signal.value <- p.value.impute | |
347 temp2$p.value <- p.value.impute | |
348 temp2$q.value <- p.value.impute | |
349 | |
350 | |
351 # append to the end | |
352 rep1.filled <- rbind(rep1, temp2) | |
353 | |
354 id1.filled <- c(id1, id2[entry.in2.not1]) | |
355 } else { | |
356 id1.filled <- id1 | |
357 rep1.filled <- rep1 | |
358 } | |
359 | |
360 # sort rep1 and rep2 by the same id | |
361 o1 <- order(id1.filled) | |
362 rep1.ordered <- rep1.filled[o1, ] | |
363 | |
364 o2 <- order(id2.filled) | |
365 rep2.ordered <- rep2.filled[o2, ] | |
366 | |
367 invisible(list(rep1=rep1.ordered, rep2=rep2.ordered, | |
368 id1=id1.filled[o1], id2=id2.filled[o2])) | |
369 } | |
370 | |
# Merge peaks with the same ID on the same replicate
# (they are generated when two peaks on rep1 map to the same peak on rep2)
# peak.list needs to have 3 columns: start, stop and sig.value
374 merge.peaks <- function(peak.list, id){ | |
375 | |
376 i <- 1 | |
377 j <- 1 | |
378 dup.index <- c() | |
379 sig.value <- c() | |
380 start.new <- c() | |
381 stop.new <- c() | |
382 id.new <- c() | |
383 | |
384 # original data | |
385 chr <- c() | |
386 start.ori <- c() | |
387 stop.ori <- c() | |
388 | |
389 signal.value <- c() | |
390 p.value <- c() | |
391 q.value <- c() | |
392 | |
393 while(i < length(id)){ | |
394 | |
395 if(id[i] == id[i+1]){ | |
396 dup.index <- c(dup.index, i, i+1) # push on dup.index | |
397 } else { | |
398 if(length(dup.index)>0){ # pop from dup.index | |
399 sig.value[j] <- mean(peak.list$sig.value[unique(dup.index)]) # mean of -log(pvalue) | |
400 start.new[j] <- peak.list$start[min(dup.index)] | |
401 stop.new[j] <- peak.list$stop[max(dup.index)] | |
402 id.new[j] <- id[max(dup.index)] | |
403 | |
404 signal.value[j] <- mean(peak.list$signal.value[unique(dup.index)]) # mean of -log(pvalue) | |
405 p.value[j] <- mean(peak.list$p.value[unique(dup.index)]) # mean of -log(pvalue) | |
406 q.value[j] <- mean(peak.list$q.value[unique(dup.index)]) # mean of -log(pvalue) | |
407 | |
408 chr[j] <- as.character(peak.list$chr[min(dup.index)]) | |
409 start.ori[j] <- peak.list$start.ori[min(dup.index)] | |
410 stop.ori[j] <- peak.list$stop.ori[max(dup.index)] | |
411 | |
412 dup.index <- c() | |
413 } else { # nothing to pop | |
414 sig.value[j] <- peak.list$sig.value[i] | |
415 start.new[j] <- peak.list$start[i] | |
416 stop.new[j] <- peak.list$stop[i] | |
417 id.new[j] <- id[i] | |
418 | |
419 signal.value[j] <- peak.list$signal.value[i] | |
420 p.value[j] <- peak.list$p.value[i] | |
421 q.value[j] <- peak.list$q.value[i] | |
422 | |
423 chr[j] <- as.character(peak.list$chr[i]) | |
424 start.ori[j] <- peak.list$start.ori[i] | |
425 stop.ori[j] <- peak.list$stop.ori[i] | |
426 | |
427 } | |
428 j <- j+1 | |
429 } | |
430 i <- i+1 | |
431 } | |
432 | |
433 data.new <- data.frame(id=id.new, sig.value=sig.value, start=start.new, stop=stop.new, signal.value=signal.value, p.value=p.value, q.value=q.value, chr=chr, start.ori=start.ori, stop.ori=stop.ori) | |
434 invisible(data.new) | |
435 } | |
436 | |
437 | |
438 | |
439 | |
440 | |
# a wrapper function to fill in missing peaks, merge peaks and impute significance values
442 # out1 and out2 are two peak calling outputs | |
443 pair.peaks <- function(out1, out2, p.value.impute=0){ | |
444 | |
445 aa <- find.overlap(out1, out2) | |
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)
447 | |
448 cc1 <- merge.peaks(bb$rep1, bb$id1) | |
449 cc2 <- merge.peaks(bb$rep2, bb$id2) | |
450 | |
451 invisible(list(merge1=cc1, merge2=cc2)) | |
452 } | |
453 | |
454 | |
455 | |
456 # overlap.ratio is a parameter to define the percentage of overlap | |
457 # if overlap.ratio =0, 1 basepair overlap is counted as overlap | |
458 # if overlap.ratio between 0 and 1, it is the minimum proportion of | |
459 # overlap required to be called as a match | |
460 # it is computed as the overlap part/min(peak1.length, peak2.length) | |
461 pair.peaks.filter <- function(out1, out2, p.value.impute=0, overlap.ratio=0){ | |
462 | |
463 aa <- find.overlap(out1, out2) | |
  bb <- fill.missing.peaks(out1, out2, aa$id1, aa$id2, p.value.impute=p.value.impute)
465 | |
466 cc1 <- merge.peaks(bb$rep1, bb$id1) | |
467 cc2 <- merge.peaks(bb$rep2, bb$id2) | |
468 | |
469 frag12 <- cbind(cc1$start, cc1$stop, cc2$start, cc2$stop) | |
470 | |
471 frag.ratio <- apply(frag12, 1, overlap.middle) | |
472 | |
473 frag.ratio[cc1$sig.value==p.value.impute | cc2$sig.value==p.value.impute] <- 0 | |
474 | |
475 cc1$frag.ratio <- frag.ratio | |
476 cc2$frag.ratio <- frag.ratio | |
477 | |
478 merge1 <- cc1[cc1$frag.ratio >= overlap.ratio,] | |
479 merge2 <- cc2[cc2$frag.ratio >= overlap.ratio,] | |
480 | |
481 invisible(list(merge1=merge1, merge2=merge2)) | |
482 } | |
483 | |
484 # x[1], x[2] are the start and end of the first fragment | |
485 # and x[3] and x[4] are the start and end of the 2nd fragment | |
# Given two fragments, the overlap is found by ordering all four endpoints
# and taking the difference between the middle two
488 overlap.middle <- function(x){ | |
489 | |
490 x.o <- x[order(x)] | |
491 f1 <- x[2]-x[1] | |
492 f2 <- x[4]-x[3] | |
493 | |
494 f.overlap <- abs(x.o[3]-x.o[2]) | |
495 f.overlap.ratio <- f.overlap/min(f1, f2) | |
496 | |
497 return(f.overlap.ratio) | |
498 } | |
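
# A worked example of overlap.middle: fragments (100, 200) and (150, 250)
# share 50bp, and the shorter fragment is 100bp long, so the ratio is 0.5
# (sketch only, not executed when the file is sourced):
if(FALSE){
  overlap.middle(c(100, 200, 150, 250))   # returns 0.5
}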
499 | |
500 | |
501 | |
502 ####### | |
503 ####### compute correspondence profile | |
504 ####### | |
505 | |
506 # compute upper rank intersection for one t | |
507 # tv: the upper percentile | |
508 # x is sorted by the order of paired variable | |
509 comp.uri <- function(tv, x){ | |
510 n <- length(x) | |
511 qt <- quantile(x, prob=1-tv[1]) # tv[1] is t | |
512 # sum(x[1:ceiling(n*tv[2])] >= qt)/n/tv[2]- tv[1]*tv[2] #tv[2] is v | |
513 sum(x[1:ceiling(n*tv[2])] >= qt)/n | |
514 | |
515 } | |
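
# A small worked example of comp.uri: with x already sorted by the paired
# variable, t = v = 0.5 counts how many of the first half of x are above the
# overall median, divided by n. For x = 10:1 the first 5 values are 10..6,
# all above the median 5.5, so the result is 5/10 = 0.5 (sketch, not run):
if(FALSE){
  comp.uri(c(0.5, 0.5), 10:1)   # 0.5
}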
516 | |
517 # compute the correspondence profile | |
518 # tt, vv: vector between (0, 1) for percentages | |
519 get.uri.2d <- function(x1, x2, tt, vv, spline.df=NULL){ | |
520 | |
521 o <- order(x1, x2, decreasing=T) | |
522 | |
523 # sort x2 by the order of x1 | |
524 x2.ordered <- x2[o] | |
525 | |
526 tv <- cbind(tt, vv) | |
527 ntotal <- length(x1) # number of peaks | |
528 | |
529 uri <- apply(tv, 1, comp.uri, x=x2.ordered) | |
530 | |
531 # compute the derivative of URI vs t using small bins | |
532 uri.binned <- uri[seq(1, length(uri), by=4)] | |
533 tt.binned <- tt[seq(1, length(uri), by=4)] | |
534 uri.slope <- (uri.binned[2:(length(uri.binned))] - uri.binned[1:(length(uri.binned)-1)])/(tt.binned[2:(length(uri.binned))] - tt.binned[1:(length(tt.binned)-1)]) | |
535 | |
536 # smooth uri using spline | |
537 # first find where the jump is and don't fit the jump | |
538 # this is the index on the left | |
539 # jump.left.old <- which.max(uri[-1]-uri[-length(uri)]) | |
540 short.list.length <- min(sum(x1>0)/length(x1), sum(x2>0)/length(x2)) | |
541 | |
542 if(short.list.length < max(tt)){ | |
543 jump.left <- which(tt>short.list.length)[1]-1 | |
544 } else { | |
545 jump.left <- which.max(tt) | |
546 } | |
547 | |
548 # reversed.index <- seq(length(tt), 1, by=-1) | |
549 # nequal <- sum(uri[reversed.index]== tt[reversed.index]) | |
550 # temp <- which(uri[reversed.index]== tt[reversed.index])[nequal] | |
551 # jump.left <- length(tt)-temp | |
552 | |
553 if(jump.left < 6){ | |
554 jump.left <- length(tt) | |
555 } | |
556 | |
557 | |
558 if(is.null(spline.df)) | |
559 uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=6.4) | |
560 else{ | |
561 uri.spl <- smooth.spline(tt[1:jump.left], uri[1:jump.left], df=spline.df) | |
562 } | |
563 # predict the first derivative | |
564 uri.der <- predict(uri.spl, tt[1:jump.left], deriv=1) | |
565 | |
566 invisible(list(tv=tv, uri=uri, | |
567 uri.slope=uri.slope, t.binned=tt.binned[2:length(uri.binned)], | |
568 uri.spl=uri.spl, uri.der=uri.der, jump.left=jump.left, | |
569 ntotal=ntotal)) | |
570 } | |
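
# A sketch of computing a correspondence profile on simulated scores
# (illustration only, not executed when the file is sourced):
if(FALSE){
  set.seed(1)
  x1 <- rexp(1000)
  x2 <- x1 + rexp(1000)          # loosely correlated significance scores
  tt <- seq(0.01, 1, by=0.01)
  uri <- get.uri.2d(x1, x2, tt, tt)
  uri.n <- scale.t2n(uri)        # rescale from percentages to numbers of peaks
  plot(uri$tv[,1], uri$uri, type="l")
}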
571 | |
572 | |
# change the scale of uri from t (percentage) to n (number of peaks or basepairs)
574 # this is for plotting multiple pairwise URI's on the same plot | |
575 scale.t2n <- function(uri){ | |
576 | |
577 ntotal <- uri$ntotal | |
578 tv <- uri$tv*uri$ntotal | |
579 uri.uri <- uri$uri*uri$ntotal | |
580 jump.left <- uri$jump.left | |
581 uri.spl <- uri$uri.spl | |
582 uri.spl$x <- uri$uri.spl$x*uri$ntotal | |
583 uri.spl$y <- uri$uri.spl$y*uri$ntotal | |
584 | |
585 t.binned <- uri$t.binned*uri$ntotal | |
586 uri.slope <- uri$uri.slope | |
587 uri.der <- uri$uri.der | |
588 uri.der$x <- uri$uri.der$x*uri$ntotal | |
589 uri.der$y <- uri$uri.der$y | |
590 | |
591 uri.n <- list(tv=tv, uri=uri.uri, t.binned=t.binned, uri.slope=uri.slope, uri.spl=uri.spl, uri.der=uri.der, ntotal=ntotal, jump.left=jump.left) | |
592 return(uri.n) | |
593 } | |
594 | |
595 | |
596 | |
597 | |
598 # a wrapper for running URI for peaks from peak calling results | |
599 # both data1 and data2 are calling results in narrowpeak format | |
600 compute.pair.uri <- function(data.1, data.2, sig.value1="signal.value", sig.value2="signal.value", spline.df=NULL, overlap.ratio=0){ | |
601 | |
602 tt <- seq(0.01, 1, by=0.01) | |
603 vv <- tt | |
604 | |
605 if(sig.value1=="signal.value"){ | |
606 data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$signal.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) | |
607 } else { | |
608 if(sig.value1=="p.value"){ | |
609 data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$p.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) | |
610 } else { | |
611 if(sig.value1=="q.value"){ | |
612 data.1.enrich <- data.frame(chr=data.1$chr, start.ori=data.1$start.ori, stop.ori=data.1$stop.ori, start=data.1$start, stop=data.1$stop, sig.value=data.1$q.value, signal.value=data.1$signal.value, p.value=data.1$p.value, q.value=data.1$q.value) | |
613 } | |
614 } | |
615 } | |
616 | |
617 if(sig.value2=="signal.value"){ | |
618 data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$signal.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) | |
619 } else { | |
620 if(sig.value2=="p.value"){ | |
621 data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$p.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) | |
622 } else { | |
623 if(sig.value2=="q.value"){ | |
624 data.2.enrich <- data.frame(chr=data.2$chr, start.ori=data.2$start.ori, stop.ori=data.2$stop.ori, start=data.2$start, stop=data.2$stop, sig.value=data.2$q.value, signal.value=data.2$signal.value, p.value=data.2$p.value, q.value=data.2$q.value) | |
625 } | |
626 } | |
627 } | |
628 | |
629 ### by peaks | |
630 # data12.enrich <- pair.peaks(data.1.enrich, data.2.enrich) | |
631 data12.enrich <- pair.peaks.filter(data.1.enrich, data.2.enrich, p.value.impute=0, overlap.ratio) | |
632 uri <- get.uri.2d(as.numeric(as.character(data12.enrich$merge1$sig.value)), as.numeric(as.character(data12.enrich$merge2$sig.value)), tt, vv, spline.df=spline.df) | |
633 uri.n <- scale.t2n(uri) | |
634 | |
635 return(list(uri=uri, uri.n=uri.n, data12.enrich=data12.enrich, sig.value1=sig.value1, sig.value2=sig.value2)) | |
636 | |
637 | |
638 } | |
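
# A usage sketch tying the pieces together for two replicates (not executed on
# source; the narrowPeak file names and chromosome-size file are hypothetical):
if(FALSE){
  chr.size <- read.table("genome.chrom.sizes")
  rep1 <- process.narrowpeak("rep1.narrowPeak", chr.size)$data.cleaned
  rep2 <- process.narrowpeak("rep2.narrowPeak", chr.size)$data.cleaned
  uri.out <- compute.pair.uri(rep1, rep2,
                              sig.value1="signal.value", sig.value2="signal.value",
                              overlap.ratio=0)
}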
639 | |
640 | |
641 | |
642 # compute uri for matched sample | |
643 get.uri.matched <- function(data12, df=10){ | |
644 | |
645 tt <- seq(0.01, 1, by=0.01) | |
646 vv <- tt | |
647 uri <- get.uri.2d(data12$sample1$sig.value, data12$sample2$sig.value, tt, vv, spline.df=df) | |
648 | |
649 # change scale from t to n | |
650 uri.n <- scale.t2n(uri) | |
651 | |
652 return(list(uri=uri, uri.n=uri.n)) | |
653 | |
654 } | |
655 | |
# map.uv is a pair of significance values corresponding to a specified consistency FDR
# assuming the values in map.uv and the q-values are linearly related
# data.set is the original data set
# sig.value is the name of the significance value in map.uv, e.g. enrichment
# nominal.value is the one we want to map to, e.g. q-value
#
662 map.sig.value <- function(data.set, map.uv, nominal.value){ | |
663 | |
664 index.nominal <- which(names(data.set$merge1)==nominal.value) | |
665 nentry <- nrow(map.uv) | |
666 map.nominal <- rbind(map.uv[, c("sig.value1", "sig.value2")]) | |
667 | |
668 for(i in 1:nentry){ | |
669 | |
670 map.nominal[i, "sig.value1"] <- data.set$merge1[unique(which.min(abs(data.set$merge1$sig.value-map.uv[i, "sig.value1"]))), index.nominal] | |
671 map.nominal[i, "sig.value2"] <- data.set$merge2[unique(which.min(abs(data.set$merge2$sig.value-map.uv[i, "sig.value2"]))), index.nominal] | |
672 } | |
673 | |
674 invisible(map.nominal) | |
675 } | |
676 | |
677 | |
678 ############### plot correspondence profile | |
679 | |
# plot multiple comparisons with respect to one template
# uri.list contains the total number of peaks
# plot.missing=F: do not plot the missing points on the right
683 plot.uri.group <- function(uri.n.list, plot.dir, file.name=NULL, legend.txt, xlab.txt="num of significant peaks", ylab.txt="num of peaks in common", col.start=0, col.txt=NULL, plot.missing=F, title.txt=NULL){ | |
684 | |
685 if(is.null(col.txt)) | |
686 col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") | |
687 | |
688 n <- length(uri.n.list) | |
689 | |
690 ntotal <- c() | |
691 for(i in 1:n) | |
692 ntotal[i] <- uri.n.list[[i]]$ntotal | |
693 | |
694 jump.left <- c() | |
695 jump.left.der <- c() | |
696 ncommon <- c() | |
697 for(i in 1:n){ | |
698 # jump.left[i] <- which.max(uri.n.list[[i]]$uri[-1]-uri.n.list[[i]]$uri[-length(uri.n.list[[i]]$uri)]) | |
699 # if(jump.left[i] < 6) | |
700 # jump.left[i] <- length(uri.n.list[[i]]$uri) | |
701 | |
702 ## reversed.index <- seq(length(uri.n.list[[i]]$tv[,1]), 1, by=-1) | |
703 ## nequal <- sum(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1]) | |
704 ## temp <- which(uri.n.list[[i]]$uri[reversed.index]== uri.n.list[[i]]$tv[reversed.index,1])[nequal] | |
705 ## jump.left[i] <- length(uri.n.list[[i]]$tv[,1])-temp | |
706 ##print(uri.n.list[[i]]$uri) | |
707 ##print(uri.n.list[[i]]$tv[,1]) | |
708 ## jump.left[i] <- uri.n.list[[i]]$jump.left | |
709 | |
710 # jump.left.der[i] <- sum(uri.n.list[[i]]$t.binned < uri.n.list[[i]]$uri.der$x[length(uri.n.list[[i]]$uri.der$x)]) | |
711 | |
712 jump.left[i] <- uri.n.list[[i]]$jump.left | |
713 jump.left.der[i] <- jump.left[i] | |
714 ncommon[i] <- uri.n.list[[i]]$tv[jump.left[i],1] | |
715 } | |
716 | |
717 | |
718 if(plot.missing){ | |
719 max.peak <- max(ntotal) | |
720 } else { | |
721 max.peak <- max(ncommon)*1.05 | |
722 } | |
723 | |
724 if(!is.null(file.name)){ | |
725 postscript(paste(plot.dir, "uri.", file.name, sep="")) | |
726 par(mfrow=c(1,1), mar=c(5,5,4,2)) | |
727 } | |
728 | |
729 plot(uri.n.list[[1]]$tv[,1], uri.n.list[[1]]$uri, type="n", xlab=xlab.txt, ylab=ylab.txt, xlim=c(0, max.peak), ylim=c(0, max.peak), cex.lab=2) | |
730 | |
731 for(i in 1:n){ | |
732 | |
733 if(plot.missing){ | |
734 points(uri.n.list[[i]]$tv[,1], uri.n.list[[i]]$uri, col=col.txt[i+col.start], cex=0.5 ) | |
735 } else { | |
736 points(uri.n.list[[i]]$tv[1:jump.left[i],1], uri.n.list[[i]]$uri[1:jump.left[i]], col=col.txt[i+col.start], cex=0.5) | |
737 } | |
738 lines(uri.n.list[[i]]$uri.spl, col=col.txt[i+col.start], lwd=4) | |
739 } | |
740 abline(coef=c(0,1), lty=3) | |
741 legend(0, max.peak, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2) | |
  if(!is.null(title.txt))
743 title(title.txt) | |
744 | |
745 if(!is.null(file.name)){ | |
746 dev.off() | |
747 } | |
748 | |
749 if(!is.null(file.name)){ | |
750 postscript(paste(plot.dir, "duri.", file.name, sep="")) | |
751 par(mfrow=c(1,1), mar=c(5,5,4,2)) | |
752 } | |
753 plot(uri.n.list[[1]]$t.binned, uri.n.list[[1]]$uri.slope, type="n", xlab=xlab.txt, ylab="slope", xlim=c(0, max.peak), ylim=c(0, 1.5), cex.lab=2) | |
754 | |
755 for(i in 1:n){ | |
756 # if(plot.missing){ | |
757 # points(uri.n.list[[i]]$t.binned, uri.n.list[[i]]$uri.slope, col=col.txt[i+col.start], cex=0.5) | |
758 # } else { | |
759 # points(uri.n.list[[i]]$t.binned[1:jump.left.der[i]], uri.n.list[[i]]$uri.slope[1:jump.left.der[i]], col=col.txt[i+col.start], cex=0.5) | |
760 # } | |
761 lines(uri.n.list[[i]]$uri.der, col=col.txt[i+col.start], lwd=4) | |
762 } | |
763 abline(h=1, lty=3) | |
764 legend(0.5*max.peak, 1.5, legend=legend.txt, col=col.txt[(col.start+1):length(col.txt)], lty=1, lwd=3, cex=2) | |
765 | |
  if(!is.null(title.txt))
767 title(title.txt) | |
768 | |
769 if(!is.null(file.name)){ | |
770 dev.off() | |
771 } | |
772 | |
773 } | |
774 | |
775 | |
776 | |
777 ####################### | |
778 ####################### copula fitting for matched peaks | |
779 ####################### | |
780 | |
781 # estimation from mixed copula model | |
782 | |
783 # 4-5-09 | |
784 # A nonparametric estimation of mixed copula model | |
785 | |
786 | |
787 # updated | |
788 | |
789 # c1, c2, f1, f2, g1, g2 are vectors | |
790 # c1*f1*g1 and c2*f2*g2 are copula densities for the two components | |
791 # xd1 and yd1 are the values of marginals for the first component | |
792 # xd2 and yd2 are the values of marginals for the 2nd component | |
793 # | |
794 # ez is the prob for being in the consistent group | |
795 get.ez <- function(p, c1, c2, xd1, yd1, xd2, yd2){ | |
796 | |
797 return(p*c1*xd1*yd1/(p*c1*xd1*yd1 + (1-p)*c2*xd2*yd2)) | |
798 } | |
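
# A one-line arithmetic check of the posterior formula above (sketch, not run):
# with p = 0.5 and component densities 2 vs 1, the posterior probability of the
# consistent group is 0.5*2 / (0.5*2 + 0.5*1) = 2/3
if(FALSE){
  get.ez(0.5, c1=2, c2=1, xd1=1, yd1=1, xd2=1, yd2=1)   # 0.6666667
}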
799 | |
800 # checked | |
801 | |
# this is the copula density C_12 only, not the full joint density c = C_12 * f1 * f2;
# since f1 and f2 are estimated nonparametrically and stay constant
# throughout the iterations, they are not needed for the optimization
805 # | |
806 # bivariate gaussian copula function | |
807 # t and s are vectors of same length, both are percentiles | |
808 # return a vector | |
809 gaussian.cop.den <- function(t, s, rho){ | |
810 | |
811 A <- qnorm(t)^2 + qnorm(s)^2 | |
812 B <- qnorm(t)*qnorm(s) | |
813 | |
814 loglik <- -log(1-rho^2)/2 - rho/(2*(1-rho^2))*(rho*A-2*B) | |
815 | |
816 return(exp(loglik)) | |
817 } | |
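
# A quick sanity check of the Gaussian copula density (sketch, not run):
# at rho = 0 the copula reduces to independence, so the density is 1 everywhere
if(FALSE){
  gaussian.cop.den(c(0.2, 0.5, 0.9), c(0.3, 0.5, 0.7), rho=0)   # all equal to 1
}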
818 | |
819 clayton.cop.den <- function(t, s, rho){ | |
820 | |
821 if(rho > 0) | |
822 return(exp(log(rho+1)-(rho+1)*(log(t)+log(s))-(2+1/rho)*log(t^(-rho) + s^(-rho)-1))) | |
823 | |
824 if(rho==0) | |
825 return(1) | |
826 | |
827 if(rho<0) | |
828 stop("Incorrect Clayton copula coefficient") | |
829 | |
830 } | |
831 | |
832 | |
833 # checked | |
834 # estimate rho from Gaussian copula | |
835 mle.gaussian.copula <- function(t, s, e.z){ | |
836 | |
837 # reparameterize to bound from rho=+-1 | |
838 l.c <- function(rho, t, s, e.z){ | |
839 # cat("rho=", rho, "\n") | |
840 sum(e.z*log(gaussian.cop.den(t, s, rho)))} | |
841 | |
842 rho.max <- optimize(f=l.c, c(-0.998, 0.998), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z) | |
843 | |
844 #print(rho.max$m) | |
845 | |
846 #cat("cor=", cor(qnorm(t)*e.z, qnorm(s)*e.z), "\t", "rho.max=", rho.max$m, "\n") | |
847 # return(sign(rho.max$m)/(1+rho.max$m)) | |
848 return(rho.max$m) | |
849 } | |
850 | |
851 | |
852 # estimate mle from Clayton copula, | |
853 mle.clayton.copula <- function(t, s, e.z){ | |
854 | |
855 l.c <- function(rho, t, s, e.z){ | |
856 lc <- sum(e.z*log(clayton.cop.den(t, s, rho))) | |
857 # cat("rho=", rho, "\t", "l.c=", lc, "\n") | |
858 return(lc) | |
859 } | |
860 | |
861 rho.max <- optimize(f=l.c, c(0.1, 20), maximum=T, tol=0.00001, t=t, s=s, e.z=e.z) | |
862 | |
863 return(rho.max$m) | |
864 } | |
865 | |
866 | |
867 | |
# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank-transformed
871 loglik.2gaussian.copula <- function(x, y, p, rho1, rho2, x.mar, y.mar){ | |
872 | |
873 px.1 <- get.pdf.cdf(x, x.mar$f1) | |
874 px.2 <- get.pdf.cdf(x, x.mar$f2) | |
875 py.1 <- get.pdf.cdf(y, y.mar$f1) | |
876 py.2 <- get.pdf.cdf(y, y.mar$f2) | |
877 | |
878 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
879 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
880 | |
881 sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) | |
882 } | |
883 | |
loglik.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){

  # compute the pdf and cdf of each component from the histogram marginals
  px.1 <- get.pdf.cdf(x, x.mar$f1)
  px.2 <- get.pdf.cdf(x, x.mar$f2)
  py.1 <- get.pdf.cdf(y, y.mar$f1)
  py.2 <- get.pdf.cdf(y, y.mar$f2)
890 | |
891 if(copula.txt=="gaussian"){ | |
892 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
893 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
894 } else { | |
895 if(copula.txt=="clayton"){ | |
896 c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) | |
897 c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) | |
898 } | |
899 } | |
900 sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) | |
901 } | |
902 | |
903 | |
904 # estimate the marginals of each component using histogram estimator in EM | |
905 # return the density, breaks, and cdf of the histogram estimator | |
906 est.mar.hist <- function(x, e.z, breaks){ | |
907 | |
908 binwidth <- c() | |
909 nbin <- length(breaks)-1 | |
910 nx <- length(x) | |
911 | |
912 # the histogram | |
913 x1.pdf <- c() | |
914 x2.pdf <- c() | |
915 x1.cdf <- c() | |
916 x2.cdf <- c() | |
917 | |
918 # the pdf for each point | |
919 x1.pdf.value <- rep(NA, nx) | |
920 x2.pdf.value <- rep(NA, nx) | |
921 | |
922 x1.cdf.value <- rep(NA, nx) | |
923 x2.cdf.value <- rep(NA, nx) | |
924 | |
925 for(i in 1:nbin){ | |
926 | |
927 binwidth[i] <- breaks[i+1] - breaks[i] | |
928 if(i < nbin) | |
929 in.bin <- x>= breaks[i] & x < breaks[i+1] | |
930 else # last bin | |
931 in.bin <- x>= breaks[i] & x <=breaks[i+1] | |
932 | |
    # each bin gets one pseudo-observation added to avoid empty bins
    # multiply by (nx+nbin)/(nx+nbin+1) to avoid blow-up when looking up
    # quantiles
936 x1.pdf[i] <- (sum(e.z[in.bin])+1)/(sum(e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1) | |
937 x2.pdf[i] <- (sum(1-e.z[in.bin])+1)/(sum(1-e.z)+nbin)/binwidth[i]*(nx+nbin)/(nx+nbin+1) | |
938 | |
939 | |
940 # x1.pdf[i] <- sum(e.z[in.bin])/sum(e.z)/binwidth[i]*nx/(nx+1) | |
941 # x2.pdf[i] <- sum(1-e.z[in.bin])/sum(1-e.z)/binwidth[i]*nx/(nx+1) | |
942 | |
943 # treat each bin as a value for a discrete variable | |
944 # x1.cdf[i] <- sum(x1.pdf[1:i]*binwidth[1:i]) | |
945 # x2.cdf[i] <- sum(x2.pdf[1:i]*binwidth[1:i]) | |
946 | |
947 | |
948 # cumulative density before reaching i | |
949 if(i>1){ | |
950 x1.cdf[i] <- sum(x1.pdf[1:(i-1)]*binwidth[1:(i-1)]) | |
951 x2.cdf[i] <- sum(x2.pdf[1:(i-1)]*binwidth[1:(i-1)]) | |
952 } else{ | |
953 x1.cdf[i] <- 0 | |
954 x2.cdf[i] <- 0 | |
955 } | |
956 | |
957 # make a vector of nx to store the values of pdf and cdf for each x | |
958 # this will speed up the computation dramatically | |
959 x1.pdf.value[in.bin] <- x1.pdf[i] | |
960 x2.pdf.value[in.bin] <- x2.pdf[i] | |
961 | |
962 x1.cdf.value[in.bin] <- x1.cdf[i] + x1.pdf[i]*(x[in.bin]-breaks[i]) | |
963 x2.cdf.value[in.bin] <- x2.cdf[i] + x2.pdf[i]*(x[in.bin]-breaks[i]) | |
964 } | |
965 | |
966 # x1.cdf <- cumsum(x1.pdf*binwidth) | |
967 # x2.cdf <- cumsum(x2.pdf*binwidth) | |
968 | |
969 f1 <-list(breaks=breaks, density=x1.pdf, cdf=x1.cdf) | |
970 f2 <-list(breaks=breaks, density=x2.pdf, cdf=x2.cdf) | |
971 | |
972 f1.value <- list(pdf=x1.pdf.value, cdf=x1.cdf.value) | |
973 f2.value <- list(pdf=x2.pdf.value, cdf=x2.cdf.value) | |
974 | |
975 return(list(f1=f1, f2=f2, f1.value=f1.value, f2.value=f2.value)) | |
976 } | |
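
# A sketch of the histogram marginal estimator on one component (not run):
# with all weights e.z = 1, the estimated density of the first component
# integrates to just under 1 because of the pseudo-count correction above
if(FALSE){
  x <- rnorm(500)
  breaks <- seq(min(x)-0.01, max(x)+0.01, length.out=21)
  mar <- est.mar.hist(x, e.z=rep(1, 500), breaks=breaks)
  sum(mar$f1$density * diff(breaks))   # ~ (500+20)/(500+21), slightly below 1
}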
977 | |
978 # estimate the marginal cdf from rank | |
979 est.cdf.rank <- function(x, conf.z){ | |
980 | |
981 # add 1 to prevent blow up | |
982 x1.cdf <- rank(x[conf.z==1])/(length(x[conf.z==1])+1) | |
983 | |
984 x2.cdf <- rank(x[conf.z==0])/(length(x[conf.z==0])+1) | |
985 | |
986 return(list(cdf1=x1.cdf, cdf2=x2.cdf)) | |
987 } | |
988 | |
989 # df is a density function with fields: density, cdf and breaks, x is a scalar | |
990 get.pdf <- function(x, df){ | |
991 | |
992 if(x < df$breaks[1]) | |
993 cat("x is out of the range of df\n") | |
994 | |
995 index <- which(df$breaks >= x)[1] | |
996 | |
997 if(index==1) | |
998 index <- index +1 | |
999 return(df$density[index-1]) | |
1000 } | |
1001 | |
# get cdf from the histogram estimator for a single value
1003 get.cdf <- function(x, df){ | |
1004 | |
1005 index <- which(df$breaks >= x)[1] | |
1006 if(index==1) | |
1007 index <- index +1 | |
1008 return(df$cdf[index-1]) | |
1009 } | |
1010 | |
1011 # df is a density function with fields: density, cdf and breaks | |
1012 get.pdf.cdf <- function(x.vec, df){ | |
1013 | |
1014 x.pdf <- sapply(x.vec, get.pdf, df=df) | |
1015 x.cdf <- sapply(x.vec, get.cdf, df=df) | |
1016 return(list(cdf=x.cdf, pdf=x.pdf)) | |
1017 } | |
1018 | |
1019 # E-step | |
1020 # x and y are the original observations or ranks | |
1021 # rho1 and rho2 are the parameters of each copula | |
1022 # f1, f2, g1, g2 are functions, each is a histogram | |
1023 e.step.2gaussian <- function(x, y, p, rho1, rho2, x.mar, y.mar){ | |
1024 | |
1025 # get pdf and cdf of each component from functions in the corresponding component | |
1026 px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1027 px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1028 py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1029 py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1030 | |
1031 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1032 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1033 | |
1034 return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf)) | |
1035 } | |
1036 | |
1037 # E-step | |
1038 # rho1 and rho2 are the parameters of each copula | |
1039 e.step.2copula <- function(x, y, p, rho1, rho2, x.mar, y.mar, copula.txt){ | |
1040 | |
1041 # get pdf and cdf of each component from functions in the corresponding component | |
1042 px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1043 px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1044 py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1045 py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1046 | |
1047 if(copula.txt=="gaussian"){ | |
1048 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1049 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1050 } else { | |
1051 if(copula.txt=="clayton"){ | |
1052 c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1053 c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1054 } | |
1055 } | |
1056 return(get.ez(p, c1, c2, px.1$pdf, py.1$pdf, px.2$pdf, py.2$pdf)) | |
1057 } | |
1058 | |
1059 | |
1060 | |
1061 | |
1062 # M-step | |
1063 m.step.2gaussian <- function(x, y, e.z, breaks){ | |
1064 | |
1065 # compute f1, f2, g1 and g2 | |
1066 x.mar <- est.mar.hist(x, e.z, breaks) | |
1067 y.mar <- est.mar.hist(y, e.z, breaks) | |
1068 | |
1069 px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1070 px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1071 py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1072 py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1073 | |
1074 rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) | |
1075 rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1076 | |
1077 p <- sum(e.z)/length(e.z) | |
1078 | |
1079 return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar)) | |
1080 } | |
1081 | |
1082 m.step.2copula <- function(x, y, e.z, breaks, copula.txt){ | |
1083 | |
1084 # compute f1, f2, g1 and g2 | |
1085 x.mar <- est.mar.hist(x, e.z, breaks) | |
1086 y.mar <- est.mar.hist(y, e.z, breaks) | |
1087 | |
1088 px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1089 px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1090 py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1091 py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1092 | |
1093 if(copula.txt=="gaussian"){ | |
1094 rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) | |
1095 rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1096 } else { | |
1097 if(copula.txt=="clayton"){ | |
1098 rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z) | |
1099 rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1100 } | |
1101 } | |
1102 | |
1103 p <- sum(e.z)/length(e.z) | |
1104 | |
1105 return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar)) | |
1106 } | |
1107 | |
1108 | |
1109 | |
1110 # E-step: pass values | |
1111 # x and y are the original observations or ranks | |
1112 # rho1 and rho2 are the parameters of each copula | |
1113 # f1, f2, g1, g2 are functions, each is a histogram | |
1114 e.step.2gaussian.value <- function(x, y, p, rho1, rho2, pdf.cdf){ | |
1115 | |
1116 c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) | |
1117 c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) | |
1118 | |
1119 e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, | |
1120 pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf) | |
1121 return(e.z) | |
1122 } | |
1123 | |
1124 | |
1125 e.step.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){ | |
1126 | |
1127 if(copula.txt =="gaussian"){ | |
1128 c1 <- gaussian.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) | |
1129 c2 <- gaussian.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) | |
1130 } else { | |
1131 if(copula.txt =="clayton"){ | |
1132 c1 <- clayton.cop.den(pdf.cdf$px.1$cdf, pdf.cdf$py.1$cdf, rho1) | |
1133 c2 <- clayton.cop.den(pdf.cdf$px.2$cdf, pdf.cdf$py.2$cdf, rho2) | |
1134 } | |
1135 } | |
1136 | |
1137 e.z <- get.ez(p, c1, c2, pdf.cdf$px.1$pdf, pdf.cdf$py.1$pdf, | |
1138 pdf.cdf$px.2$pdf, pdf.cdf$py.2$pdf) | |
1139 return(e.z) | |
1140 } | |
1141 | |
1142 | |
1143 # M-step: pass values | |
1144 m.step.2gaussian.value <- function(x, y, e.z, breaks, fix.rho2){ | |
1145 | |
1146 # compute f1, f2, g1 and g2 | |
1147 x.mar <- est.mar.hist(x, e.z, breaks) | |
1148 y.mar <- est.mar.hist(y, e.z, breaks) | |
1149 | |
1150 # px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1151 # px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1152 # py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1153 # py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1154 | |
1155 px.1 <- x.mar$f1.value | |
1156 px.2 <- x.mar$f2.value | |
1157 py.1 <- y.mar$f1.value | |
1158 py.2 <- y.mar$f2.value | |
1159 | |
1160 rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) | |
1161 | |
1162 if(!fix.rho2) | |
1163 rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1164 else | |
1165 rho2 <- 0 | |
1166 | |
1167 p <- sum(e.z)/length(e.z) | |
1168 | |
1169 pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) | |
1170 | |
1171 return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, | |
1172 pdf.cdf=pdf.cdf)) | |
1173 } | |
1174 | |
1175 m.step.2gaussian.value2 <- function(x, y, e.z, breaks, fix.rho2, x.mar, y.mar){ | |
1176 | |
1177 # compute f1, f2, g1 and g2 | |
1178 # x.mar <- est.mar.hist(x, e.z, breaks) | |
1179 # y.mar <- est.mar.hist(y, e.z, breaks) | |
1180 | |
1181 # px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1182 # px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1183 # py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1184 # py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1185 | |
1186 px.1 <- x.mar$f1.value | |
1187 px.2 <- x.mar$f2.value | |
1188 py.1 <- y.mar$f1.value | |
1189 py.2 <- y.mar$f2.value | |
1190 | |
1191 rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) | |
1192 | |
1193 if(!fix.rho2) | |
1194 rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1195 else | |
1196 rho2 <- 0 | |
1197 | |
1198 p <- sum(e.z)/length(e.z) | |
1199 | |
1200 pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) | |
1201 | |
1202 return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, | |
1203 pdf.cdf=pdf.cdf)) | |
1204 } | |
1205 | |
1206 | |
1207 | |
1208 m.step.2copula.value <- function(x, y, e.z, breaks, fix.rho2, copula.txt){ | |
1209 | |
1210 # compute f1, f2, g1 and g2 | |
1211 x.mar <- est.mar.hist(x, e.z, breaks) | |
1212 y.mar <- est.mar.hist(y, e.z, breaks) | |
1213 | |
1214 # px.1 <- get.pdf.cdf(x, x.mar$f1) | |
1215 # px.2 <- get.pdf.cdf(x, x.mar$f2) | |
1216 # py.1 <- get.pdf.cdf(y, y.mar$f1) | |
1217 # py.2 <- get.pdf.cdf(y, y.mar$f2) | |
1218 | |
1219 px.1 <- x.mar$f1.value | |
1220 px.2 <- x.mar$f2.value | |
1221 py.1 <- y.mar$f1.value | |
1222 py.2 <- y.mar$f2.value | |
1223 | |
1224 if(copula.txt=="gaussian"){ | |
1225 rho1 <- mle.gaussian.copula(px.1$cdf, py.1$cdf, e.z) | |
1226 | |
1227 if(!fix.rho2) | |
1228 rho2 <- mle.gaussian.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1229 else | |
1230 rho2 <- 0 | |
1231 } else { | |
1232 | |
1233 if(copula.txt=="clayton"){ | |
1234 rho1 <- mle.clayton.copula(px.1$cdf, py.1$cdf, e.z) | |
1235 | |
1236 if(!fix.rho2) | |
1237 rho2 <- mle.clayton.copula(px.2$cdf, py.2$cdf, 1-e.z) | |
1238 else | |
1239 rho2 <- 0 | |
1240 } | |
1241 } | |
1242 | |
1243 p <- sum(e.z)/length(e.z) | |
1244 | |
1245 pdf.cdf <- list(px.1=px.1, px.2=px.2, py.1=py.1, py.2=py.2) | |
1246 | |
1247 return(list(p=p, rho1=rho1, rho2=rho2, x.mar=x.mar, y.mar=y.mar, | |
1248 pdf.cdf=pdf.cdf)) | |
1249 } | |
1250 | |
1251 | |
1252 | |
1253 | |
# updated
# mixture likelihood of two Gaussian copulas
# nonparametric and rank-transformed
1257 loglik.2gaussian.copula.value <- function(x, y, p, rho1, rho2, pdf.cdf){ | |
1258 | |
1259 px.1 <- pdf.cdf$px.1 | |
1260 px.2 <- pdf.cdf$px.2 | |
1261 py.1 <- pdf.cdf$py.1 | |
1262 py.2 <- pdf.cdf$py.2 | |
1263 | |
1264 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1265 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1266 | |
1267 sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) | |
1268 } | |
1269 | |
1270 | |
1271 | |
# updated
# mixture likelihood of two copulas (Gaussian or Clayton)
# nonparametric and rank-transformed
1275 loglik.2copula.value <- function(x, y, p, rho1, rho2, pdf.cdf, copula.txt){ | |
1276 | |
1277 px.1 <- pdf.cdf$px.1 | |
1278 px.2 <- pdf.cdf$px.2 | |
1279 py.1 <- pdf.cdf$py.1 | |
1280 py.2 <- pdf.cdf$py.2 | |
1281 | |
1282 if(copula.txt=="gaussian"){ | |
1283 c1 <- gaussian.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1284 c2 <- gaussian.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1285 } else { | |
1286 if(copula.txt=="clayton"){ | |
1287 c1 <- clayton.cop.den(px.1$cdf, py.1$cdf, rho1) | |
1288 c2 <- clayton.cop.den(px.2$cdf, py.2$cdf, rho2) | |
1289 } | |
1290 } | |
1291 | |
1292 sum(log(p*c1*px.1$pdf*py.1$pdf + (1-p)*c2*px.2$pdf*py.2$pdf)) | |
1293 } | |
1294 | |
1295 | |
1296 | |
1297 # EM for 2 Gaussian, speed up computation, unfinished | |
1298 | |
1299 em.2gaussian.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T){ | |
1300 | |
1301 x <- rank(x, tie="random") | |
1302 y <- rank(y, tie="random") | |
1303 | |
1304 # x <- rank(x, tie="average") | |
1305 # y <- rank(y, tie="average") | |
1306 | |
1307 # nbin=20 | |
1308 xy.min <- min(x, y) | |
1309 xy.max <- max(x, y) | |
1310 binwidth <- (xy.max-xy.min)/50 | |
1311 breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/50) | |
1312 # breaks <- seq(xy.min, xy.max, by=binwidth) | |
1313 | |
1314 | |
1315 # initiate marginals | |
1316 # initialization: first p0 data has | |
1317 # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped | |
1318 | |
1319 e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0))) | |
1320 | |
1321 if(!stoc) | |
1322 para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2) | |
1323 else | |
1324 para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) | |
1325 | |
1326 | |
1327 if(fix.p){ | |
1328 p <- p0 | |
1329 } else { | |
1330 p <- para$p | |
1331 } | |
1332 | |
1333 if(fix.rho2){ | |
1334 rho2 <- rho2.0 | |
1335 } else { | |
1336 rho2 <- para$rho2 | |
1337 } | |
1338 | |
1339 # rho1 <- 0.8 | |
1340 rho1 <- para$rho1 | |
1341 | |
1342 l0 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf) | |
1343 | |
1344 loglik.trace <- c() | |
1345 loglik.trace[1] <- l0 | |
1346 # loglik.trace[2] <- l1 | |
1347 to.run <- T | |
1348 | |
1349 i <- 2 | |
1350 | |
1351 # this two lines to remove | |
1352 # x.mar <- est.mar.hist(x, e.z, breaks) | |
1353 # y.mar <- est.mar.hist(y, e.z, breaks) | |
1354 | |
1355 while(to.run){ | |
1356 | |
1357 e.z <- e.step.2gaussian.value(x, y, p, rho1, rho2, para$pdf.cdf) | |
1358 if(!stoc) | |
1359 para <- m.step.2gaussian.value(x, y, e.z, breaks, fix.rho2) | |
1360 else | |
1361 para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) | |
1362 | |
1363 # fix x.mar and y.mar : to remove | |
1364 # if(!stoc) | |
1365 # para <- m.step.2gaussian.value2(x, y, e.z, breaks, fix.rho2, x.mar, y.mar) | |
1366 # else | |
1367 # para <- m.step.2gaussian.stoc.value(x, y, e.z, breaks, fix.rho2) | |
1368 | |
1369 | |
1370 if(fix.p){ | |
1371 p <- p0 | |
1372 } else { | |
1373 p <- para$p | |
1374 } | |
1375 | |
1376 if(fix.rho2){ | |
1377 rho2 <- rho2.0 | |
1378 } else { | |
1379 rho2 <- para$rho2 | |
1380 } | |
1381 | |
1382 # rho1 <- 0.8 | |
1383 rho1 <- para$rho1 | |
1384 | |
1385 # l0 <- l1 | |
1386 l1 <- loglik.2gaussian.copula.value(x, y, p, rho1, rho2, para$pdf.cdf) | |
1387 loglik.trace[i] <- l1 | |
1388 | |
1389 #cat("l1=", l1, "\n") | |
1390 | |
1391 # Aitken acceleration criterion | |
1392 if(i > 2){ | |
1393 l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) | |
1394 to.run <- abs(l.inf - loglik.trace[i]) > eps | |
1395 #cat("para=", "p=", para$p, " rho1=", rho1, " rho2=", rho2, "\n") | |
1396 #cat("l.inf=", l.inf, "\n") | |
1397 #cat(l.inf-loglik.trace[i], "\n") | |
1398 } | |
1399 | |
1400 i <- i+1 | |
1401 } | |
1402 | |
1403 bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters | |
1404 return(list(para=list(p=para$p, rho1=rho1, rho2=rho2), | |
1405 loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, | |
1406 loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar, | |
1407 breaks=breaks)) | |
1408 } | |
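
# A sketch of running the EM fit on simulated data (illustration only, not
# executed when the file is sourced; the parameter choices are arbitrary):
if(FALSE){
  set.seed(2)
  n <- 2000
  z <- rbinom(n, 1, 0.4)                       # latent consistent/noise labels
  x <- ifelse(z==1, rnorm(n, 3), rnorm(n, 0))
  y <- ifelse(z==1, x + rnorm(n, 0, 0.5), rnorm(n, 0))
  fit <- em.2gaussian.quick(x, y, p0=0.5, rho1.0=0.7, rho2.0=0,
                            eps=0.01, fix.p=F, stoc=F, fix.rho2=T)
  fit$para          # estimated mixing proportion and copula correlations
  plot(fit$loglik.trace, type="b")
}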
1409 | |
1410 | |
1411 | |
1412 em.2copula.quick <- function(x, y, p0, rho1.0, rho2.0, eps, fix.p=F, stoc=T, fix.rho2=T, copula.txt, nbin=50){ | |
1413 | |
1414 x <- rank(x, tie="random") | |
1415 y <- rank(y, tie="random") | |
1416 | |
1417 # x <- rank(x, tie="first") | |
1418 # y <- rank(y, tie="first") | |
1419 | |
1420 # nbin=50 | |
1421 xy.min <- min(x, y) | |
1422 xy.max <- max(x, y) | |
1423 binwidth <- (xy.max-xy.min)/50 | |
1424 breaks <- seq(xy.min-binwidth/100, xy.max+binwidth/100, by=(xy.max-xy.min+binwidth/50)/nbin) | |
1425 # breaks <- seq(xy.min, xy.max, by=binwidth) | |
1426 | |
1427 # initiate marginals | |
1428 # initialization: first p0 data has | |
1429 # e.z <- e.step.2gaussian(x, y, p0, rho1.0, rho2.0, x0.mar, y0.mar) # this starting point assumes two components are overlapped | |
1430 | |
1431 e.z <- c(rep(0.9, round(length(x)*p0)), rep(0.1, length(x)-round(length(x)*p0))) | |
1432 | |
1433 | |
1434 if(!stoc) | |
1435 para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt) | |
1436 else | |
1437 para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt) | |
1438 | |
1439 if(fix.p){ | |
1440 p <- p0 | |
1441 } else { | |
1442 p <- para$p | |
1443 } | |
1444 | |
1445 if(fix.rho2){ | |
1446 rho2 <- rho2.0 | |
1447 } else { | |
1448 rho2 <- para$rho2 | |
1449 } | |
1450 | |
1451 l0 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) | |
1452 | |
1453 loglik.trace <- c() | |
1454 loglik.trace[1] <- l0 | |
1455 # loglik.trace[2] <- l1 | |
1456 to.run <- T | |
1457 | |
1458 i <- 2 | |
1459 | |
1460 while(to.run){ | |
1461 | |
1462 e.z <- e.step.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) | |
1463 if(!stoc) | |
1464 para <- m.step.2copula.value(x, y, e.z, breaks, fix.rho2, copula.txt) | |
1465 else | |
1466 para <- m.step.2copula.stoc.value(x, y, e.z, breaks, fix.rho2, copula.txt) | |
1467 | |
1468 if(fix.p){ | |
1469 p <- p0 | |
1470 } else { | |
1471 p <- para$p | |
1472 } | |
1473 | |
1474 if(fix.rho2){ | |
1475 rho2 <- rho2.0 | |
1476 } else { | |
1477 rho2 <- para$rho2 | |
1478 } | |
1479 | |
1480 | |
1481 # l0 <- l1 | |
1482 l1 <- loglik.2copula.value(x, y, p, para$rho1, rho2, para$pdf.cdf, copula.txt) | |
1483 loglik.trace[i] <- l1 | |
1484 | |
1485 cat("l1=", l1, "\n") | |
1486 | |
1487 # Aitken acceleration criterion | |
1488 if(i > 2){ | |
1489 l.inf <- loglik.trace[i-2] + (loglik.trace[i-1] - loglik.trace[i-2])/(1-(loglik.trace[i]-loglik.trace[i-1])/(loglik.trace[i-1]-loglik.trace[i-2])) | |
1490 to.run <- abs(l.inf - loglik.trace[i]) > eps | |
1491 cat("para=", "p=", para$p, " rho1=", para$rho1, " rho2=", rho2, "\n") | |
1492 #cat("l.inf=", l.inf, "\n") | |
1493 #cat(l.inf-loglik.trace[i], "\n") | |
1494 } | |
1495 | |
1496 i <- i+1 | |
1497 } | |
1498 | |
1499 bic <- -2*l1 + (2*(length(breaks)-1+1)+1-fix.p-fix.rho2)*log(length(x)) # parameters | |
1500 return(list(para=list(p=para$p, rho1=para$rho1, rho2=rho2), | |
1501 loglik=l1, bic=bic, e.z=e.z, conf.z = para$conf.z, | |
1502 loglik.trace=loglik.trace, x.mar=para$x.mar, y.mar=para$y.mar, | |
1503 breaks=breaks)) | |
1504 } | |
1505 | |
1506 | |
1507 ####################### | |
1508 ####################### fit EM procedure for the matched peaks | |
1509 ####################### | |
1510 | |
1511 # remove the unmatched ones | |
1512 #rm.unmatch <- function(sample1, sample2, p.value.impute=0){ | |
1513 # | |
1514 # sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] | |
1515 # sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] | |
1516 # | |
1517 # invisible(list(sample1=sample1.prune$sig.value, sample2=sample2.prune$sig.value)) | |
1518 #} | |
1519 | |
1520 | |
1521 # fit 2-component model | |
1522 #fit.em <- function(sample12, fix.rho2=T){ | |
1523 # | |
1524 # prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) | |
1525 # | |
1526 # em.fit <- em.2gaussian.quick(-prune.sample$sample1, -prune.sample$sample2, | |
1527 # p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2) | |
1528 # | |
1529 # invisible(list(em.fit=em.fit, data.pruned=prune.sample)) | |
1530 #} | |
1531 | |
1532 | |
1533 rm.unmatch <- function(sample1, sample2, p.value.impute=0){ | |
1534 | |
1535 sample1.prune <- sample1[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] | |
1536 sample2.prune <- sample2[sample1$sig.value > p.value.impute & sample2$sig.value > p.value.impute,] | |
1537 | |
1538 invisible(list(sample1=sample1.prune, sample2=sample2.prune)) | |
1539 } | |
1540 | |
1541 | |
1542 # fit 2-component model | |
1543 fit.em <- function(sample12, fix.rho2=T){ | |
1544 | |
1545 prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) | |
1546 | |
1547 em.fit <- em.2gaussian.quick(-prune.sample$sample1$sig.value, -prune.sample$sample2$sig.value, | |
1548 p0=0.5, rho1.0=0.7, rho2.0=0, eps=0.01, fix.p=F, stoc=F, fix.rho2) | |
1549 | |
1550 invisible(list(em.fit=em.fit, data.pruned=prune.sample)) | |
1551 } | |
1552 | |
1553 | |
1554 | |
1555 fit.2copula.em <- function(sample12, fix.rho2=T, copula.txt){ | |
1556 | |
1557 prune.sample <- rm.unmatch(sample12$merge1, sample12$merge2) | |
1558 | |
1559 # o <- order(prune.sample$sample1) | |
1560 # n <- length(prune.sample$sample1) | |
1561 | |
1562 # para <- init(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, c(rep(0, round(n/3)), rep(c(0,1), round(n/6)), rep(1, n-round(n/3)-round(n/6)))) | |
1563 | |
1564 # temp <- init.dist(f0, f1) | |
1565 para <- list() | |
1566 para$rho <- 0.6 | |
1567 para$p <- 0.3 | |
1568 para$mu <- 2.5 | |
1569 para$sigma <- 1 | |
1570 ## para$mu <- -temp$mu | |
1571 ## para$sigma <- temp$sigma | |
1572 #cat("mu=", para$mu, "sigma=", para$sigma, "\n") | |
1573 | |
1574 # em.fit <- em.transform.1loop(-prune.sample$sample1, -prune.sample$sample2, | |
1575 cat("EM is running") | |
1576 em.fit <- em.transform(prune.sample$sample1$sig.value, prune.sample$sample2$sig.value, para$mu, para$sigma, para$rho, para$p, eps=0.01) | |
1577 | |
1578 invisible(list(em.fit=em.fit, data.pruned=prune.sample)) | |
1579 } | |
1580 | |
1581 | |
1582 | |
1583 | |
1584 # fit 1-component model | |
1585 fit.1.component <- function(data.pruned, breaks){ | |
1586 | |
1587 # gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks) | |
1588 # clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks) | |
1589 | |
  gaussian.1 <- fit.gaussian.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
  clayton.1 <- fit.clayton.1(-data.pruned$sample1$sig.value, -data.pruned$sample2$sig.value, breaks)
1592 | |
1593 return(list(gaussian.1=gaussian.1, clayton.1=clayton.1)) | |
1594 } | |
1595 | |
1596 | |
1597 | |
1598 ################# | |
1599 # Fit a single component | |
1600 ################# | |
1601 | |
1602 # a single gaussian copula | |
1603 # if breaks=NULL, use empirical pdf, otherwise use histogram estimate | |
1604 fit.gaussian.1 <- function(x, y, breaks=NULL){ | |
1605 | |
1606 # rank transformed and compute the empirical cdf | |
1607 t <- emp.mar.cdf.rank(x) | |
1608 s <- emp.mar.cdf.rank(y) | |
1609 | |
1610 mle.rho <- mle.gaussian.copula(t, s, rep(1, length(t))) | |
1611 | |
1612 c1 <- gaussian.cop.den(t, s, mle.rho) | |
1613 cat("c1", sum(log(c1)), "\n") | |
1614 | |
1615 if(is.null(breaks)){ | |
1616 f1 <- emp.mar.pdf.rank(t) | |
1617 f2 <- emp.mar.pdf.rank(s) | |
1618 } else { | |
1619 x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks) | |
1620 y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks) | |
1621 | |
1622 f1 <- x.mar$f1.value$pdf # only one component | |
1623 f2 <- y.mar$f1.value$pdf | |
1624 } | |
1625 | |
1626 | |
1627 cat("f1", sum(log(f1)), "\n") | |
1628 cat("f2", sum(log(f2)), "\n") | |
1629 | |
1630 loglik <- sum(log(c1)+log(f1)+log(f2)) | |
1631 | |
1632 bic <- -2*loglik + log(length(t))*(1+length(breaks)-1) | |
1633 | |
1634 return(list(rho=mle.rho, loglik=loglik, bic=bic)) | |
1635 } | |
1636 | |
1637 | |
1638 # a single Clayton copula | |
1639 fit.clayton.1 <- function(x, y, breaks=NULL){ | |
1640 | |
1641 # rank transform and compute the empirical cdf | 
1642 t <- emp.mar.cdf.rank(x) | |
1643 s <- emp.mar.cdf.rank(y) | |
1644 | |
1645 mle.rho <- mle.clayton.copula(t, s, rep(1, length(t))) | |
1646 | |
1647 c1 <- clayton.cop.den(t, s, mle.rho) | |
1648 | |
1649 if(is.null(breaks)){ | |
1650 f1 <- emp.mar.pdf.rank(t) | |
1651 f2 <- emp.mar.pdf.rank(s) | |
1652 } else { | |
1653 x.mar <- est.mar.hist(rank(x), rep(1, length(x)), breaks) | |
1654 y.mar <- est.mar.hist(rank(y), rep(1, length(y)), breaks) | |
1655 | |
1656 f1 <- x.mar$f1.value$pdf # only one component | |
1657 f2 <- y.mar$f1.value$pdf | |
1658 } | |
1659 | |
1660 loglik <- sum(log(c1)+log(f1)+log(f2)) | |
1661 | |
1662 bic <- -2*loglik + log(length(t))*(1+length(breaks)-1) | |
1663 | |
1664 return(list(rho=mle.rho, tau=mle.rho/(mle.rho+2), loglik=loglik, bic=bic)) | 
1665 } | |
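# A quick check of the Clayton relation used above (illustrative value only):
# Kendall's tau for a Clayton copula with association parameter rho is rho/(rho+2).
if(FALSE){
  rho.example <- 2
  rho.example/(rho.example+2)   # 0.5
}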
1666 | |
1667 ## obsolete function (01-06-2010) | |
1668 ## compute the average posterior probability to belong to the random component | |
1669 ## for peaks selected at different cutoffs | |
1670 comp.uri.ez <- function(tt, u, v, e.z){ | |
1671 | |
1672 u.t <- quantile(u, prob=(1-tt)) | |
1673 v.t <- quantile(v, prob=(1-tt)) | |
1674 | |
1675 # ez <- mean(e.z[u >= u.t & v >=u.t]) # old version: used u.t for both coordinates | 
1676 ez <- mean(e.z[u >= u.t & v >=v.t]) | |
1677 | |
1678 return(ez) | |
1679 } | |
1680 | |
1681 ## obsolete function (01-06-2010) | |
1682 # compute the largest posterior error probability corresponding to | |
1683 # the square centered at the origin and spanned top tt% on both coordinates | |
1684 # so the consistent low rank ones are excluded | |
1685 # boundary.txt: either "max" or "min", if it is error prob, use "max" | |
1686 comp.ez.cutoff <- function(tt, u, v, e.z, boundary.txt){ | |
1687 | |
1688 u.t <- quantile(u, prob=(1-tt)) | |
1689 v.t <- quantile(v, prob=(1-tt)) | |
1690 | |
1691 if(boundary.txt == "max"){ | |
1692 # ez.bound <- max(e.z[u >= u.t & v >=u.t]) | |
1693 ez.bound <- max(e.z[u >= u.t & v >=v.t]) | |
1694 } else { | |
1695 # ez.bound <- min(e.z[u >= u.t & v >=u.t]) | |
1696 ez.bound <- min(e.z[u >= u.t & v >=v.t]) | |
1697 } | |
1698 | |
1699 return(ez.bound) | |
1700 | |
1701 } | |
1702 | |
1703 # obsolete functions: 01-06-2010 | |
1704 # compute the error rate | |
1705 # u.t and v.t are the quantiles | |
1706 # this one is used for the plots generated initially in the brief writeup | |
1707 # and it was used for processing merged data in July before the IDR definition | |
1708 # is formalized | |
1709 # It does not implement the current definition of IDR | |
1710 get.ez.tt.old <- function(em.fit, reverse=T, fdr.level=c(0.01, 0.05, 0.1)){ | |
1711 | |
1712 u <- em.fit$data.pruned$sample1 | |
1713 v <- em.fit$data.pruned$sample2 | |
1714 | |
1715 tt <- seq(0.01, 0.99, by=0.01) | |
1716 if(reverse){ | |
1717 e.z <- 1-em.fit$em.fit$e.z # this is the error prob | |
1718 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) | |
1719 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") | |
1720 } else { | |
1721 e.z <- em.fit$em.fit$e.z | |
1722 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) | |
1723 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") | |
1724 } | |
1725 | |
1726 u.t <- quantile(u, prob=(1-tt)) | |
1727 v.t <- quantile(v, prob=(1-tt)) | |
1728 | |
1729 # find the levels on the two replicates | |
1730 sig.value1 <- c() | |
1731 sig.value2 <- c() | |
1732 error.prob.cutoff <- c() | |
1733 n.selected.match <- c() | |
1734 | |
1735 for(i in 1:length(fdr.level)){ | |
1736 | |
1737 # find which uri.ez is closest to fdr.level | 
1738 index <- which.min(abs(uri.ez - fdr.level[i])) | |
1739 sig.value1[i] <- u.t[index] | |
1740 sig.value2[i] <- v.t[index] | |
1741 error.prob.cutoff[i] <- ez.bound[index] | |
1742 if(reverse){ | |
1743 n.selected.match[i] <- sum(e.z<=ez.bound[index]) | |
1744 } else { | |
1745 n.selected.match[i] <- sum(e.z>=ez.bound[index]) | |
1746 } | |
1747 } | |
1748 | |
1749 # output the cutoff of posterior probability, signal values on two replicates | |
1750 map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match) | |
1751 | |
1752 return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) | |
1753 } | |
1754 | |
1755 # created: 01-06-2010 | |
1756 # Output IDR at various number of selected peaks | |
1757 # Find cutoff (idr cutoff, sig.value cutoff on each replicate) for specified IDR level | |
1758 # IDR definition is similar to FDR | |
1759 get.ez.tt <- function(em.fit, idr.level=c(0.01, 0.05, 0.1)){ | |
1760 | |
1761 # u <- em.fit$data.pruned$sample1$sig.value | |
1762 # v <- em.fit$data.pruned$sample2$sig.value | |
1763 u <- em.fit$data.pruned$sample1 | |
1764 v <- em.fit$data.pruned$sample2 | |
1765 | |
1766 e.z <- 1-em.fit$em.fit$e.z # this is the error prob | |
1767 | |
1768 o <- order(e.z) | |
1769 e.z.ordered <- e.z[o] | |
1770 n.select <- c(1:length(e.z)) | |
1771 IDR <- cumsum(e.z.ordered)/n.select | |
1772 | |
1773 u.o <- u[o] | |
1774 v.o <- v[o] | |
1775 | |
1776 n.level <- length(idr.level) | |
1777 # sig.value1 <- rep(NA, n.level) | |
1778 # sig.value2 <- rep(NA, n.level) | |
1779 ez.cutoff <- rep(NA, n.level) | |
1780 n.selected <- rep(NA, n.level) | |
1781 | |
1782 for(i in 1:length(idr.level)){ | |
1783 | |
1784 # find which IDR is closest to idr.level | 
1785 index <- which.min(abs(IDR - idr.level[i])) | |
1786 # sig.value1[i] <- min(u.o[1:index]) | |
1787 # sig.value2[i] <- min(v.o[1:index]) | |
1788 ez.cutoff[i] <- e.z[index] | |
1789 n.selected[i] <- sum(e.z<=ez.cutoff[i]) | |
1790 } | |
1791 | |
1792 # output the cutoff of posterior probability, number of selected overlapped peaks | |
1793 # map.uv <- cbind(ez.cutoff, sig.value1, sig.value2, n.selected) | |
1794 | |
1795 map.uv <- cbind(ez.cutoff, n.selected) | |
1796 | |
1797 return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv)) | |
1798 } | |
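# A minimal sketch of the IDR computation above, on made-up error probabilities:
# the IDR for the top n peaks is the running mean of the n smallest posterior
# error probabilities (1 - e.z).
if(FALSE){
  err.prob.example <- c(0.01, 0.02, 0.10, 0.40)               # assumed values
  cumsum(sort(err.prob.example))/seq_along(err.prob.example)  # approx. 0.010 0.015 0.043 0.133
}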
1799 | |
1800 # return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) | |
1801 | |
1802 | |
1803 | |
1804 | |
1805 | |
1806 ### compute the mean of the marginals | |
1807 get.mar.mean <- function(em.out){ | |
1808 | |
1809 x.f1 <- em.out$x.mar$f1 | |
1810 x.f2 <- em.out$x.mar$f2 | |
1811 | |
1812 y.f1 <- em.out$y.mar$f1 | |
1813 y.f2 <- em.out$y.mar$f2 | |
1814 | |
1815 x.stat1 <- get.hist.mean(x.f1) | |
1816 x.stat2 <- get.hist.mean(x.f2) | |
1817 y.stat1 <- get.hist.mean(y.f1) | |
1818 y.stat2 <- get.hist.mean(y.f2) | |
1819 | |
1820 return(list(x.mean1=x.stat1$mean, x.mean2=x.stat2$mean, | |
1821 y.mean1=y.stat1$mean, y.mean2=y.stat2$mean, | |
1822 x.sd1=x.stat1$sd, x.sd2=x.stat2$sd, | |
1823 y.sd1=y.stat1$sd, y.sd2=y.stat2$sd | |
1824 )) | |
1825 | |
1826 } | |
1827 | |
1828 | |
1829 # compute the mean of marginals | |
1830 get.hist.mean <- function(x.f){ | |
1831 | |
1832 nbreaks <- length(x.f$breaks) | |
1833 x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks] | |
1834 | |
1835 x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2 | |
1836 x.mean <- sum(x.mid*x.f$density*x.bin) | |
1837 x.sd <- sqrt(sum(x.mid*x.mid*x.f$density*x.bin)-x.mean^2) | |
1838 | |
1839 return(list(mean=x.mean, sd=x.sd)) | |
1840 } | |
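# Minimal sketch on an assumed toy histogram: with two unit-width bins of equal
# density, get.hist.mean() returns the midpoint mean and the corresponding sd.
if(FALSE){
  x.f.example <- list(breaks=c(0, 1, 2), density=c(0.5, 0.5))
  get.hist.mean(x.f.example)   # mean=1, sd=0.5
}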
1841 | |
1842 get.hist.var <- function(x.f){ | |
1843 | |
1844 nbreaks <- length(x.f$breaks) | |
1845 x.bin <- x.f$breaks[-1]-x.f$breaks[-nbreaks] | |
1846 | |
1847 x.mid <- (x.f$breaks[-nbreaks]+x.f$breaks[-1])/2 | |
1848 x.mean <- sum(x.mid*x.f$density*x.bin) | |
1849 | |
1850 return(x.mean) | 
1851 } | |
1852 | |
1853 # obsolete function (01-06-2010) | |
1854 # plot | |
1855 plot.ez.group.old <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="avg posterior prob of being random", col.txt=NULL, title.txt=NULL){ | |
1856 | |
1857 if(is.null(col.txt)) | |
1858 col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") | |
1859 | |
1860 x <- c() | |
1861 y <- c() | |
1862 | |
1863 for(i in 1:length(ez.list)){ | |
1864 x <- c(x, ez.list[[i]]$n) | |
1865 | |
1866 y <- c(y, ez.list[[i]]$uri.ez) | |
1867 } | |
1868 | |
1869 if(is.null(y.lim)) | |
1870 y.lim <- c(0, max(y)) | |
1871 | |
1872 if(!is.null(file.name)){ | |
1873 postscript(paste(plot.dir, "ez.", file.name, sep="")) | |
1874 par(mfrow=c(1,1), mar=c(5,5,4,2)) | |
1875 } | |
1876 | |
1877 plot(x, y, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) | |
1878 | |
1879 for(i in 1:length(ez.list)){ | |
1880 lines(ez.list[[i]]$n, ez.list[[i]]$uri.ez, col=col.txt[i], cex=2, lwd=5) | |
1881 } | |
1882 | |
1883 # plot(ez.list[[1]]$u.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) | |
1884 # plot(ez.list[[1]]$v.t, y, ylim=y.lim, type="l", xlab="rep-sig", ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) | |
1885 | |
1886 | |
1887 legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2) | |
1888 | |
1889 if(!is.null(title.txt)) | 
1890 title(title.txt) | 
1891 | |
1892 if(!is.null(file.name)){ | |
1893 dev.off() | |
1894 } | |
1895 | |
1896 } | |
1897 | |
1898 | |
1899 plot.ez.group <- function(ez.list, plot.dir, file.name=NULL, legend.txt, y.lim=NULL, xlab.txt="num of significant peaks", ylab.txt="IDR", col.txt=NULL, title.txt=NULL){ | |
1900 | |
1901 if(is.null(col.txt)) | |
1902 col.txt <- c("black", "red", "purple", "green", "blue", "cyan", "magenta", "orange", "grey") | |
1903 | |
1904 n.entry <- length(ez.list) | |
1905 x <- rep(NA, n.entry) | |
1906 y.max <- rep(NA, n.entry) | |
1907 | |
1908 for(i in 1:n.entry){ | |
1909 x[i] <- max(ez.list[[i]]$n) | |
1910 | |
1911 y.max[i] <- max(ez.list[[i]]$IDR) | |
1912 | |
1913 } | |
1914 | |
1915 if(is.null(y.lim)) | |
1916 y.lim <- c(0, max(y.max)) | |
1917 | |
1918 if(!is.null(file.name)){ | |
1919 postscript(paste(plot.dir, "ez.", file.name, sep="")) | |
1920 par(mfrow=c(1,1), mar=c(5,5,4,2)) | |
1921 } | |
1922 | |
1923 | |
1924 | |
1925 plot(c(0, max(x)), y.lim, ylim=y.lim, type="n", xlab=xlab.txt, ylab=ylab.txt, lwd=5, cex=5, cex.axis=2, cex.lab=2) | |
1926 | |
1927 q <- seq(0.01, 0.99, by=0.01) | |
1928 | |
1929 for(i in 1:length(ez.list)){ | |
1930 | |
1931 n.plot <- round(quantile(ez.list[[i]]$n, prob=q)) | |
1932 IDR.plot <- ez.list[[i]]$IDR[n.plot] | |
1933 lines(n.plot, IDR.plot, col=col.txt[i], cex=2, lwd=5) | |
1934 } | |
1935 | |
1936 | |
1937 legend(0, y.lim[2], legend=legend.txt, col=col.txt[1:length(col.txt)], lty=1, lwd=5, cex=2) | |
1938 | |
1939 if(!is.null(title.txt)) | 
1940 title(title.txt) | 
1941 | |
1942 if(!is.null(file.name)){ | |
1943 dev.off() | |
1944 } | |
1945 | |
1946 } | |
1947 | |
1948 | |
1949 | |
1950 ############################################################################# | |
1951 ############################################################################# | |
1952 # statistics about peaks selected on the individual replicates | |
1953 # | |
1954 # idr.level: the consistency cutoff, say 0.05 | |
1955 # uri.output: a list of uri.output from consistency analysis generated by batch-consistency-analysis.r | |
1956 # ez.list : a list of IDRs computed from get.ez.tt using the same idr.level | |
1957 # | |
1958 ################## | |
1959 | |
1960 | |
1961 # obsolete? | |
1962 # compute the error rate | |
1963 # u.t and v.t are the quantiles | |
1964 # | |
1965 # map back to all peaks and report the number of peaks selected | |
1966 get.ez.tt.all.old <- function(em.fit, all.data1, all.data2, idr.level){ | |
1967 | |
1968 u <- em.fit$data.pruned$sample1 | |
1969 v <- em.fit$data.pruned$sample2 | |
1970 | |
1971 tt <- seq(0.01, 0.99, by=0.01) | |
1972 # if(reverse){ | |
1973 e.z <- 1-em.fit$em.fit$e.z # this is the error prob | |
1974 uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) | |
1975 ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="max") | |
1976 # } else { | |
1977 # e.z <- em.fit$em.fit$e.z | |
1978 # uri.ez <- sapply(tt, comp.uri.ez, u=u, v=v, e.z=e.z) | |
1979 # ez.bound <- sapply(tt, comp.ez.cutoff, u=u, v=v, e.z=e.z, boundary.txt="min") | |
1980 # } | |
1981 | |
1982 u.t <- quantile(u, prob=(1-tt)) | |
1983 v.t <- quantile(v, prob=(1-tt)) | |
1984 | |
1985 # find the levels on the two replicates | |
1986 sig.value1 <- c() | |
1987 sig.value2 <- c() | |
1988 error.prob.cutoff <- c() | |
1989 n.selected.match <- c() | |
1990 npeak.rep1 <- c() | |
1991 npeak.rep2 <- c() | |
1992 | |
1993 for(i in 1:length(idr.level)){ | |
1994 | |
1995 # find which uri.ez is closest to idr.level | 
1996 index <- which.min(abs(uri.ez - as.numeric(idr.level[i]))) | |
1997 | |
1998 sig.value1[i] <- u.t[index] | |
1999 sig.value2[i] <- v.t[index] | |
2000 error.prob.cutoff[i] <- ez.bound[index] | |
2001 n.selected.match[i] <- sum(u>= u.t[index] & v>=v.t[index]) | |
2002 | |
2003 npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i]) | |
2004 npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i]) | |
2005 } | |
2006 | |
2007 | |
2008 # output the cutoff of posterior probability, signal values on two replicates | |
2009 map.uv <- cbind(error.prob.cutoff, sig.value1, sig.value2, n.selected.match, npeak.rep1, npeak.rep2) | |
2010 | |
2011 return(list(n=tt*length(u), uri.ez=uri.ez, u.t=u.t, v.t=v.t, tt=tt, idr.level=idr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) | |
2012 } | |
2013 | |
2014 | |
2015 get.ez.tt.all <- function(em.fit, all.data1, all.data2, idr.level=c(0.01, 0.05, 0.1)){ | |
2016 | |
2017 u <- em.fit$data.pruned$sample1$sig.value | |
2018 v <- em.fit$data.pruned$sample2$sig.value | |
2019 # u <- em.fit$data.pruned$sample1 | |
2020 # v <- em.fit$data.pruned$sample2 | |
2021 | |
2022 e.z <- 1-em.fit$em.fit$e.z # this is the error prob | |
2023 | |
2024 o <- order(e.z) | |
2025 e.z.ordered <- e.z[o] | |
2026 n.select <- c(1:length(e.z)) | |
2027 IDR <- cumsum(e.z.ordered)/n.select | |
2028 | |
2029 u.o <- u[o] | |
2030 v.o <- v[o] | |
2031 | |
2032 n.level <- length(idr.level) | |
2033 # sig.value1 <- rep(NA, n.level) | |
2034 # sig.value2 <- rep(NA, n.level) | |
2035 ez.cutoff <- rep(NA, n.level) | |
2036 n.selected <- rep(NA, n.level) | |
2037 npeak.rep1 <- rep(NA, n.level) | |
2038 npeak.rep2 <- rep(NA, n.level) | |
2039 | |
2040 for(i in 1:length(idr.level)){ | |
2041 | |
2042 # find which IDR is closest to idr.level | 
2043 index <- which.min(abs(IDR - idr.level[i])) | |
2044 # sig.value1[i] <- min(u.o[1:index]) | |
2045 # sig.value2[i] <- min(v.o[1:index]) | |
2046 ez.cutoff[i] <- e.z.ordered[index] # fixed on 02/20/10 | |
2047 n.selected[i] <- sum(e.z<=ez.cutoff[i]) | |
2048 # npeak.rep1[i] <- sum(all.data1["sig.value"] >= sig.value1[i]) | |
2049 # npeak.rep2[i] <- sum(all.data2["sig.value"] >= sig.value2[i]) | |
2050 } | |
2051 | |
2052 # output the cutoff of posterior probability, number of selected overlapped peaks | |
2053 map.uv <- cbind(ez.cutoff, n.selected) | |
2054 | |
2055 return(list(n=n.select, IDR=IDR, idr.level=idr.level, map.uv=map.uv)) | |
2056 } | |
2057 | |
2058 # return(list(n=tt*length(u), uri.ez=uri.ez, fdr.level=fdr.level, map.uv=map.uv, e.z=e.z, error.prob.cutoff=error.prob.cutoff)) | |
2059 | |
2060 | |
2061 | |
2062 | |
2063 | |
2064 | |
2065 ####### the following is for determining thresholds for merged dataset | |
2066 | |
2067 ############# select peaks above a given threshold | |
2068 # | |
2069 # pass.threshold: a simple method that passes the threshold determined on the individual replicates to the pooled sample | 
2070 # | |
2071 # sig.map.list: a list of matrices containing all the cutoff values; each row corresponds to a cutoff. The first column is idr.level, | 
2072 # the 2nd column is the cutoff of e.z, and the remaining columns are from the consistency analysis of the other replicates | 
2073 # sig.value.name: the name of the sig.value column | |
2074 # combined: combined dataset | |
2075 # nrep: number of pairs of comparisons | |
2076 # | |
2077 # Procedure: | |
2078 # 1. Find the significant threshold corresponding to the idr cutoff on the matched peaks. | |
2079 # 2. Each time we will get two or more (if >2 replicates) cutoffs and will report the most stringent and the least stringent | |
2080 # cutoff and the number of peaks selected at those two cutoffs | |
2081 ############# | |
2082 | |
2083 pass.threshold <- function(sig.map.list, sig.value.name, combined, idr.level, nrep, chr.size){ | |
2084 | |
2085 sig.map <- c() | |
2086 | |
2087 # choose idr.level | |
2088 idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level) | |
2089 if(length(idr.index) ==0){ | 
2090 print("no level matches specified idr.level") | |
2091 return(-1) | |
2092 } | |
2093 | |
2094 for(i in 1:length(sig.map.list)) | |
2095 sig.map <- c(sig.map, rbind(sig.map.list[[i]])[idr.index, c("sig.value1", "sig.value2")]) | |
2096 | |
2097 | |
2098 npeak.tight <- c() | |
2099 npeak.loose <- c() | |
2100 | |
2101 | |
2102 max.sig <- max(sig.map) | |
2103 min.sig <- min(sig.map) | |
2104 selected.sig.tight <- combined[combined[,sig.value.name]>=max.sig, ] | |
2105 selected.sig.loose <- combined[combined[,sig.value.name]>=min.sig, ] | |
2106 | |
2107 selected.sig.tight <- deconcatenate.chr(selected.sig.tight, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] | |
2108 selected.sig.loose <- deconcatenate.chr(selected.sig.loose, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] | |
2109 | |
2110 npeak.tight <- nrow(selected.sig.tight) | |
2111 npeak.loose <- nrow(selected.sig.loose) | |
2112 | |
2113 | |
2114 npeak.stat <- list(idr.level=idr.level, max.sig=max.sig, min.sig=min.sig, npeak.tight=npeak.tight, npeak.loose=npeak.loose) | |
2115 | |
2116 invisible(list(npeak.stat=npeak.stat, combined.selected.tight=selected.sig.tight, combined.selected.loose=selected.sig.loose)) | |
2117 } | |
2118 | |
2119 ################# | |
2120 # pass the regions selected from consistency analysis to combined data | |
2121 # Threshold is determined on the replicates, the regions above the threshold are selected | |
2122 # then peaks on the combined data are selected from the selected regions | |
2123 # | |
2124 # To avoid being too stringent, regions satisfying either of the following conditions are selected | 
2125 # 1. regions above the significant threshold determined by consistency analysis on either replicate | 
2126 # 2. regions with consistently low peaks, i.e. posterior prob > threshold but not passing the significant threshold | 
2127 # | 
2128 # In practice this method produces nearly the same selection regardless of which threshold is used | 
2129 ################# | |
2130 | |
2131 pass.region <- function(sig.map.list, uri.output, ez.list, em.output, combined, idr.level, sig.value.impute=0, chr.size){ | |
2132 | |
2133 combined <- combined[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2134 npair <- length(uri.output) # number of pairs of consistency analysis | |
2135 combined.region <- c() | |
2136 | |
2137 # choose idr.level | |
2138 idr.index <- which(rbind(sig.map.list[[1]])[,1] == idr.level) | |
2139 if(length(idr.index) ==0){ | |
2140 print("no level matches specified idr.level") | |
2141 return(-1) | |
2142 } | |
2143 | |
2144 for(j in 1:npair){ | |
2145 # select peaks from individual replicates using individual cutoff | |
2146 above.1 <- uri.output[[j]]$data12.enrich$merge1["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value1"] | |
2147 above.2 <- uri.output[[j]]$data12.enrich$merge2["sig.value"] >= ez.list[[j]]$map.uv[idr.index,"sig.value2"] | 
2148 selected.sig.rep1 <- uri.output[[j]]$data12.enrich$merge1[above.1, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2149 selected.sig.rep2 <- uri.output[[j]]$data12.enrich$merge2[above.2, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2150 | |
2151 # find the peaks that are overlapped with reliable peaks in the individual replicates | |
2152 overlap.1 <- pair.peaks(selected.sig.rep1, combined)$merge2 | |
2153 overlap.2 <- pair.peaks(selected.sig.rep2, combined)$merge2 | |
2154 | |
2155 # choose the ones with significant value > 0, which are the overlapped ones | |
2156 | |
2157 combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2158 combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2159 | |
2160 ## consistent low significant ones | |
2161 ## first find consistenct ones, ie. high posterior prob | |
2162 # is.consistent <- ez.list[[j]]$e.z < ez.list[[j]]$ez.cutoff | |
2163 | |
2164 # data.matched <- keep.match(uri.output[[j]]$data12.enrich$merge1[!above.1, ], uri.output[[j]]$data12.enrich$merge2[!above.2, ], sig.value.impute=0) | |
2165 # data.matched$sample1 <- data.matched$sample1[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2166 # data.matched$sample2 <- data.matched$sample2[, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2167 | |
2168 # consistent.in1 <- data.matched$sample1[is.consistent, ] | |
2169 # consistent.in2 <- data.matched$sample2[is.consistent, ] | |
2170 | |
2171 # overlap.consistent.1 <- pair.peaks(consistent.in1, combined)$merge2 | |
2172 # overlap.consistent.2 <- pair.peaks(consistent.in2, combined)$merge2 | |
2173 | |
2174 ## choose the ones with significant value > 0, which are the overlapped ones | |
2175 | |
2176 # combined.consistent.in1 <- overlap.consistent.1[overlap.consistent.1$sig.value > sig.value.impute, ] | |
2177 # combined.consistent.in2 <- overlap.consistent.2[overlap.consistent.2$sig.value > sig.value.impute, ] | |
2178 | |
2179 # combined.region <- rbind(combined.region, combined.in1, combined.in2, combined.consistent.in1, combined.consistent.in2) | |
2180 | |
2181 combined.region <- rbind(combined.region, combined.in1, combined.in2) | |
2182 | |
2183 is.repeated <- duplicated(combined.region$start) | |
2184 combined.region <- combined.region[!is.repeated, c("start", "stop", "sig.value", "signal.value", "p.value", "q.value")] | |
2185 | |
2186 } | |
2187 npeak <- nrow(combined.region) | |
2188 | |
2189 sig.combined <- c(min(combined.region[,"sig.value"], na.rm=T), max(combined.region[,"sig.value"], na.rm=T)) | |
2190 | |
2191 # idr.combined <- c(min(combined.region[,"q.value"], na.rm=T), max(combined.region[,"q.value"], na.rm=T)) | |
2192 | |
2193 npeak.stat <- list(idr.level=idr.level, npeak=npeak) | |
2194 | |
2195 combined.region <- deconcatenate.chr(combined.region, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] | |
2196 | |
2197 invisible(list(npeak.stat=npeak.stat, combined.selected=combined.region, sig.combined=sig.combined)) | |
2198 } | |
2199 | |
2200 ################ | |
2201 # pass structure: this method does another round of inference on the combined data | |
2202 # | |
2203 # To make the mixture structure comparable between the replicates and the combined data, the 2nd inference is done on the peaks | 
2204 # in the reliable regions of the combined data, using rank-transformed significant values. The mixture structure is estimated using my consistency analysis, which | 
2205 # estimates the marginal distributions of the ranks nonparametrically. The significant value cutoffs are then determined from that structure. | 
2206 # There are several advantages to doing it this way: | 
2207 # 1. The premise of passing structure is that the means and variances (i.e. distributions) of the two replicates should be the same. | 
2208 # The significant values on the two replicates clearly have different distributions. The structure estimated from consistency | 
2209 # analysis generates similar rank distributions on the two replicates by its setup (i.e. the same number of peaks are paired up). | 
2210 # 2. Because the pooled sample is a black box, the structure is more likely to hold in the matched regions than at other locations; | 
2211 # after all, we don't know what those other locations contain. If the structure doesn't even hold on the matched regions, | 
2212 # which is possible, it is unlikely to hold on the other regions. Focusing on the reliable regions helps to get rid of those unknown noises. | 
2213 # | |
2214 # | |
2215 # modified on 2-20-10: reverse rank.combined, make big sig.value with small | |
2216 # ranks, to be consistent with f1 and f2 | |
2217 ################ | |
2218 | |
2219 pass.structure <- function(uri.output, em.output, combined, idr.level, sig.value.impute, chr.size, overlap.ratio=0){ | |
2220 | |
2221 columns.keep <- c("sig.value", "start", "stop", "signal.value", "p.value", "q.value", "chr", "start.ori", "stop.ori") | |
2222 combined <- combined[, columns.keep] | |
2223 combined.selected.all <- c() | |
2224 npair <- length(uri.output) # number of pairs of consistency analysis | 
2225 for(j in 1:npair){ | |
2226 | |
2227 sample1 <- uri.output[[j]]$data12.enrich$merge1[, columns.keep] | |
2228 sample2 <- uri.output[[j]]$data12.enrich$merge2[, columns.keep] | |
2229 | |
2230 # find peaks on the matched region on the combined one | |
2231 data.matched <- keep.match(sample1, sample2, sig.value.impute=sig.value.impute) | |
2232 | |
2233 data.matched$sample1 <- data.matched$sample1[, columns.keep] | |
2234 data.matched$sample2 <- data.matched$sample2[, columns.keep] | |
2235 | |
2236 overlap.1 <- pair.peaks.filter(data.matched$sample1, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2 | |
2237 overlap.2 <- pair.peaks.filter(data.matched$sample2, combined, p.value.impute=sig.value.impute, overlap.ratio)$merge2 | |
2238 | |
2239 # choose the ones with significant value > sig.value.impute, which are the overlapped ones | |
2240 | |
2241 combined.in1 <- overlap.1[overlap.1$sig.value > sig.value.impute, ] | |
2242 combined.in2 <- overlap.2[overlap.2$sig.value > sig.value.impute, ] | |
2243 | |
2244 combined.region <- rbind(combined.in1, combined.in2) | |
2245 | |
2246 is.repeated <- duplicated(combined.region$start) | |
2247 combined.region <- combined.region[!is.repeated,] | |
2248 | |
2249 # now rank the peaks in matched region | |
2250 rank.combined <- rank(-combined.region$sig.value) | |
2251 | |
2252 # now transform the parameters estimated into the new scale | |
2253 npeaks.overlap <- nrow(combined.region) | |
2254 npeaks.consistent <- nrow(cbind(em.output[[j]]$data.pruned$sample1)) | |
2255 | |
2256 | |
2257 # the breaks are the same for x and y | |
2258 f1 <- list(breaks=em.output[[j]]$em.fit$x.mar$f1$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f1$density+em.output[[j]]$em.fit$y.mar$f1$density)/2) | |
2259 # the first break boundary goes up when changing scale; need to set it back to be a bit smaller than 1 | 
2260 f1$breaks[1] <- min(f1$breaks[1], 0.95) | |
2261 | |
2262 f2 <- list(breaks=em.output[[j]]$em.fit$x.mar$f2$breaks*npeaks.overlap/npeaks.consistent, density=(em.output[[j]]$em.fit$x.mar$f2$density+em.output[[j]]$em.fit$y.mar$f2$density)/2) | |
2263 # the first break boundary goes up when changing scale; need to set it back to be a bit smaller than 1 | 
2264 f2$breaks[1] <- min(f2$breaks[1], 0.95) | |
2265 | |
2266 p <- em.output[[j]]$em.fit$para$p | |
2267 | |
2268 # find the posterior probability | |
2269 errorprob.combined <- get.comp2.prob(rank.combined, p, f1, f2) | |
2270 | |
2271 # compute the FDR and find cutoff of posterior prob and the sig value | |
2272 o <- order(errorprob.combined) | |
2273 idr <- cumsum(errorprob.combined[o])/c(1:length(o)) | |
2274 idr.index <- which(idr > idr.level)[1] | |
2275 errorprob.cutoff <- errorprob.combined[o][idr.index] | |
2276 | |
2277 # find the minimum significant measure among selected peaks | |
2278 sig.value <- min(combined.region$sig.value[o][1:idr.index]) | |
2279 # sig.value <- quantile(combined.region$sig.value[o][1:idr.index], prob=0.05) | |
2280 #sig.value <- quantile(combined.region$sig.value[errorprob.combined<=errorprob.cutoff], prob=0.05) | |
2281 | |
2282 # apply the significant value on the whole pooled list | |
2283 combined.selected <- combined[combined$sig.value >= sig.value,] | |
2284 | |
2285 combined.selected.all <- rbind(combined.selected.all, combined.selected) | |
2286 } | |
2287 | |
2288 is.repeated <- duplicated(combined.selected.all$start) | |
2289 combined.selected.all <- combined.selected.all[!is.repeated,] | |
2290 | |
2291 npeak <- nrow(combined.selected.all) | |
2292 | |
2293 npeak.stat <- list(idr.level=idr.level, npeak=npeak) | |
2294 | |
2295 sig.combined <- c(min(combined.selected.all[,"sig.value"], na.rm=T), max(combined.selected.all[,"sig.value"], na.rm=T)) | |
2296 | |
2297 # idr.combined <- c(min(combined.selected.all[,"q.value"], na.rm=T), max(combined.selected.all[,"q.value"], na.rm=T)) | |
2298 # combined.selected.all <- deconcatenate.chr(combined.selected.all, chr.size)[,c("chr", "start", "stop", "signal.value", "p.value", "q.value")] | |
2299 | |
2300 combined.selected.all <- combined.selected.all[, c("chr", "start.ori", "stop.ori", "signal.value", "p.value", "q.value")] | |
2301 colnames(combined.selected.all) <- c("chr", "start", "stop", "signal.value", "p.value", "q.value") | |
2302 | |
2303 invisible(list(npeak.stat=npeak.stat, combined.selected=combined.selected.all, sig.combined=sig.combined)) | |
2304 } | |
2305 | |
2306 | |
2307 | |
2308 # get the posterior probability of the 2nd component | |
2309 get.comp2.prob <- function(x, p, f1, f2){ | |
2310 | |
2311 # get pdf and cdf of each component from functions in the corresponding component | |
2312 px.1 <- sapply(x, get.pdf, df=f1) | |
2313 px.2 <- sapply(x, get.pdf, df=f2) | |
2314 | |
2315 comp2prob <- 1 - p*px.1/(p*px.1+(1-p)*px.2) | |
2316 | |
2317 return(comp2prob) | |
2318 } | |
2319 | |
2320 keep.match <- function(sample1, sample2, sig.value.impute=0){ | |
2321 | |
2322 sample1.prune <- sample1[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,] | |
2323 sample2.prune <- sample2[sample1$sig.value > sig.value.impute & sample2$sig.value > sig.value.impute,] | |
2324 | |
2325 invisible(list(sample1=sample1.prune, sample2=sample2.prune)) | |
2326 } | |
2327 | |
2328 | |
2329 ############################################## | |
2330 # | |
2331 # The following is for simulation | |
2332 # | |
2333 ############################################## | |
2334 | |
2335 | |
2336 # simulate gaussian copula | |
2337 # u is the uniform random variable and rho is correlation coefficient | |
2338 simu.gaussian.copula <- function(u, rho){ | |
2339 | |
2340 n <- length(u) | |
2341 | |
2342 # simulate y given x=qnorm(u) | |
2343 y <- qnorm(u)*rho + rnorm(n)*sqrt(1-rho^2) | |
2344 | |
2345 v <- pnorm(y) | |
2346 | |
2347 invisible(v) | |
2348 } | |
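# Usage sketch with simulated data (values are illustrative, not from the pipeline):
# draw dependent uniforms from a Gaussian copula and check their dependence.
if(FALSE){
  set.seed(1)
  u.example <- runif(1000)
  v.example <- simu.gaussian.copula(u.example, rho=0.8)
  cor(u.example, v.example, method="spearman")   # roughly 0.79 for rho=0.8
}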
2349 | |
2350 ## simulate Clayton copula from its generating function | |
2351 ## Genest and MacKay (1986) | |
2352 | |
2353 phi.ori <- function(t, s){ | |
2354 | |
2355 (t^(-s) -1)/s | |
2356 } | |
2357 | |
2358 | |
2359 phi.inv <- function(y, s){ | |
2360 | |
2361 exp(-log(s*y+1)/s) | |
2362 } | |
2363 | |
2364 phi.der <- function(t, s){ | |
2365 | |
2366 -t^(-s-1) | |
2367 } | |
2368 | |
2369 phi.der.inv <- function(y, s){ | |
2370 | |
2371 exp(log(-y)/(-s-1)) | |
2372 } | |
2373 | |
2374 get.w <- function(u, t, s){ | |
2375 | |
2376 phi.der.inv(phi.der(u, s)/t, s) | |
2377 } | |
2378 | |
2379 get.v <- function(w, u, s){ | |
2380 | |
2381 phi.inv(phi.ori(w, s) - phi.ori(u, s), s) | |
2382 } | |
2383 | |
2384 # u is a uniform random variable, s is the association parameter | |
2385 simu.clayton.copula <- function(u, s){ | |
2386 | |
2387 t <- runif(length(u)) | |
2388 | |
2389 if(s>0){ | |
2390 w <- get.w(u, t, s) | |
2391 v <- get.v(w, u, s) | |
2392 return(v) | |
2393 } | |
2394 | |
2395 if(s==0){ | |
2396 return(t) | |
2397 } | |
2398 | |
2399 if(s <0){ | |
2400 print("Invalid association parameters for clayton copula") | |
2401 } | |
2402 | |
2403 } | |
2404 | |
2405 | |
2406 | |
2407 ###### 09-09-09 | |
2408 | |
2409 # simulate a two-component copula mixture: | |
2410 # - marginal distributions for the two variables in each component are both | |
2411 # normal and with the same parameters | |
2412 # p is the mixing proportion of component 1 | |
2413 # n is the total sample size | |
2414 simu.copula.2mix <- function(s1, s2, p, n, mu1, mu2, sd1, sd2, copula.txt){ | |
2415 | |
2416 n1 <- round(n*p) | |
2417 n2 <- n-n1 | |
2418 | |
2419 u1 <- runif(n1) | |
2420 | |
2421 if(copula.txt =="clayton") | |
2422 v1 <- simu.clayton.copula(u1, s1) | |
2423 else{ | |
2424 if(copula.txt =="gaussian") | |
2425 v1 <- simu.gaussian.copula(u1, s1) | |
2426 } | |
2427 | |
2428 u2 <- runif(n2) | |
2429 | |
2430 if(copula.txt =="clayton") | |
2431 v2 <- simu.clayton.copula(u2, s2) | |
2432 else{ | |
2433 if(copula.txt =="gaussian") | |
2434 v2 <- simu.gaussian.copula(u2, s2) | |
2435 } | |
2436 | |
2437 # generate test statistics | |
2438 sample1.1 <- qnorm(u1, mu1, sd1) | |
2439 sample1.2 <- qnorm(v1, mu1, sd1) | |
2440 | |
2441 sample2.1 <- qnorm(u2, mu2, sd2) | |
2442 sample2.2 <- qnorm(v2, mu2, sd2) | |
2443 | |
2444 return(list(u=c(u1, u2), v=c(v1, v2), | |
2445 u.inv=c(sample1.1, sample2.1), v.inv=c(sample1.2, sample2.2), | |
2446 label=c(rep(1, n1), rep(2, n2)))) | |
2447 } | |
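# Usage sketch (illustrative parameters): simulate a two-component mixture in
# which component 1 is dependent (s1) and component 2 is independent (s2=0).
if(FALSE){
  sim.mix.example <- simu.copula.2mix(s1=0.8, s2=0, p=0.6, n=2000,
                                      mu1=3, mu2=0, sd1=1, sd2=1,
                                      copula.txt="gaussian")
  table(sim.mix.example$label)   # about 1200 pairs from component 1, 800 from component 2
}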
2448 | |
2449 # using inverse of the cdf to generate original observations | |
2450 | |
2451 simu.copula.2mix.inv <- function(s1, s2, p, n, cdf1.x, cdf1.y, cdf2.x, cdf2.y, copula.txt){ | |
2452 | |
2453 n1 <- round(n*p) | |
2454 n2 <- n-n1 | |
2455 | |
2456 u1 <- runif(n1) | |
2457 | |
2458 if(copula.txt =="clayton") | |
2459 v1 <- simu.clayton.copula(u1, s1) | |
2460 else{ | |
2461 if(copula.txt =="gaussian") | |
2462 v1 <- simu.gaussian.copula(u1, s1) | |
2463 } | |
2464 | |
2465 u2 <- runif(n2) | |
2466 | |
2467 if(copula.txt =="clayton") | |
2468 v2 <- simu.clayton.copula(u2, s2) | |
2469 else{ | |
2470 if(copula.txt =="gaussian") | |
2471 v2 <- simu.gaussian.copula(u2, s2) | |
2472 } | |
2473 | |
2474 # generate test statistics | |
2475 # sample1.1 <- qnorm(u1, mu1, sd1) | |
2476 # sample1.2 <- qnorm(v1, mu1, sd1) | |
2477 | |
2478 # sample2.1 <- qnorm(u2, mu2, sd2) | |
2479 # sample2.2 <- qnorm(v2, mu2, sd2) | |
2480 | |
2481 sample1.x <- inv.cdf.vec(u1, cdf1.x) | |
2482 sample1.y <- inv.cdf.vec(v1, cdf1.y) | |
2483 | |
2484 sample2.x <- inv.cdf.vec(u2, cdf2.x) | |
2485 sample2.y <- inv.cdf.vec(v2, cdf2.y) | |
2486 | |
2487 | |
2488 return(list(u=c(u1, u2), v=c(v1, v2), | |
2489 u.inv=c(sample1.x, sample2.x), v.inv=c(sample1.y, sample2.y), | |
2490 label=c(rep(1, n1), rep(2, n2)))) | |
2491 } | |
2492 | |
2493 # obtain original observation by converting cdf into quantiles | |
2494 # u is one cdf | |
2495 # u.cdf is a cdf (assuming it is a histogram) and has the break points (cdf$cdf and cdf$breaks) | |
2496 # the smallest value of cdf=0 and the largest =1 | |
2497 inv.cdf <- function(u, u.cdf){ | |
2498 | |
2499 # which bin it falls into | |
2500 i <- which(u.cdf$cdf> u)[1] | |
2501 q.u <- (u - u.cdf$cdf[i-1])/(u.cdf$cdf[i] - u.cdf$cdf[i-1])* (u.cdf$breaks[i]-u.cdf$breaks[i-1]) + u.cdf$breaks[i-1] | |
2502 | |
2503 return(q.u) | |
2504 } | |
2505 | |
2506 inv.cdf.vec <- function(u, u.cdf){ | |
2507 | |
2508 # check if cdf has the right range (0, 1) | |
2509 ncdf <- length(u.cdf$cdf) | |
2510 nbreaks <- length(u.cdf$breaks) | |
2511 | |
2512 if(ncdf == nbreaks-1 & u.cdf$cdf[ncdf]< 1) | |
2513 u.cdf$cdf[ncdf] <- 1 | 
2514 | |
2515 q.u <- sapply(u, inv.cdf, u.cdf) | |
2516 | |
2517 return(q.u) | |
2518 } | |
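# Minimal sketch on an assumed toy cdf (cdf values given at the break points):
# inv.cdf() interpolates linearly within the bin containing the target probability.
if(FALSE){
  cdf.example <- list(breaks=c(0, 1, 2), cdf=c(0, 0.25, 1))
  inv.cdf(0.5, cdf.example)   # 1.333..., since F(x) = 0.25 + 0.75*(x-1) on [1,2]
}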
2519 | |
2520 # here we simulate a situation close to real data: | 
2521 # the test statistics are drawn from two normal distributions | 
2522 # according to their labels, then converted into p-values w.r.t. H0 using a | 
2523 # one-sided test. | 
2524 # The test statistics are correlated for the signal component and independent | |
2525 # for the noise component | |
2526 # For the signal component, Y = X + eps, where eps ~ N(0, sigma^2) | |
2527 simu.test.stat <- function(p, n, mu1, sd1, mu0, sd0, sd.e){ | |
2528 | |
2529 # first component - signal | |
2530 n.signal <- round(n*p) | |
2531 n.noise <- n - n.signal | |
2532 | |
2533 # labels | |
2534 labels <- c(rep(1, n.signal), rep(0, n.noise)) | |
2535 | |
2536 # test statistics for signal and noise | |
2537 mu.signal <- rnorm(n.signal, mu1, sd1) | |
2538 x.signal <- mu.signal + rnorm(n.signal, 0, sd.e) | |
2539 x.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e) | |
2540 | |
2541 y.signal <- mu.signal + rnorm(n.signal, 0, sd.e) | |
2542 # sd.e can be dependent on signal | |
2543 y.noise <- rnorm(n.noise, mu0, sd0) + rnorm(n.noise, 0, sd.e) | |
2544 | |
2545 # concatenate | |
2546 x <- c(x.signal, x.noise) | |
2547 y <- c(y.signal, y.noise) | |
2548 | |
2549 # convert to p-values based on H0 | |
2550 p.x <- 1-pnorm(x, mu0, sqrt(sd0^2+sd.e^2)) | |
2551 p.y <- 1-pnorm(y, mu0, sqrt(sd0^2+sd.e^2)) | |
2552 | |
2553 return(list(p.x=p.x, p.y=p.y, x=x, y=y, labels=labels)) | |
2554 | |
2555 } | |
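# Usage sketch (illustrative parameters): simulate paired p-values with a
# correlated signal component and an independent noise component.
if(FALSE){
  set.seed(2)
  sim.stat.example <- simu.test.stat(p=0.3, n=1000, mu1=2.5, sd1=1, mu0=0, sd0=1, sd.e=1)
  boxplot(sim.stat.example$p.x ~ sim.stat.example$labels)   # signal pairs (label=1) get smaller p-values
}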
2556 | |
2557 # compute the tradeoff and calibration | |
2558 forward.decoy.tradeoff.ndecoy <- function(xx, labels, ndecoy){ | |
2559 | |
2560 xx <- round(xx, 5) | |
2561 o <- order(xx, decreasing=T) | |
2562 | |
2563 rand <- 1-labels # if rand==0, consistent | |
2564 # order the random indicator in the same order | |
2565 rand.o <- rand[o] | |
2566 | |
2567 if(sum(rand.o) > ndecoy){ | |
2568 index.decoy <- which(cumsum(rand.o)==ndecoy) | |
2569 } else { | |
2570 index.decoy <- which(cumsum(rand.o)==sum(rand.o)) | |
2571 } | |
2572 | |
2573 cutoff.decoy <- xx[o][index.decoy] | |
2574 | |
2575 # only consider the unique ones | |
2576 cutoff.unique <- unique(xx[o]) | |
2577 | |
2578 cutoff <- cutoff.unique[cutoff.unique >= cutoff.decoy[length(cutoff.decoy)]] | |
2579 | |
2580 get.decoy.count <- function(cut.off){ | |
2581 above <- rep(0, length(xx)) | |
2582 above[xx >= cut.off] <- 1 | |
2583 decoy.count <- sum(above==1 & rand==1) | |
2584 return(decoy.count) | |
2585 } | |
2586 | |
2587 get.forward.count <- function(cut.off){ | |
2588 above <- rep(0, length(xx)) | |
2589 above[xx >= cut.off] <- 1 | |
2590 forward.count <- sum(above==1 & rand==0) | |
2591 return(forward.count) | |
2592 } | |
2593 | |
2594 get.est.fdr <- function(cut.off){ | |
2595 above <- rep(0, length(xx)) | |
2596 above[xx >= cut.off] <- 1 | |
2597 est.fdr <- 1-mean(xx[above==1]) | |
2598 return(est.fdr) | |
2599 } | |
2600 | |
2601 # assuming rand=0 is right | |
2602 get.false.neg.count <- function(cut.off){ | |
2603 below <- rep(0, length(xx)) | |
2604 below[xx < cut.off] <- 1 | |
2605 false.neg.count <- sum(below==1 & rand==0) | |
2606 return(false.neg.count) | |
2607 } | |
2608 | |
2609 get.false.pos.count <- function(cut.off){ | |
2610 above <- rep(0, length(xx)) | |
2611 above[xx >= cut.off] <- 1 | |
2612 false.pos.count <- sum(above==1 & rand==1) | |
2613 return(false.pos.count) | |
2614 } | |
2615 | |
2616 decoy <- sapply(cutoff, get.decoy.count) | |
2617 forward <- sapply(cutoff, get.forward.count) | |
2618 | |
2619 est.fdr <- sapply(cutoff, get.est.fdr) | |
2620 emp.fdr <- decoy/(decoy+forward) | |
2621 | |
2622 # compute specificity and sensitivity | |
2623 # assuming rand=1 is wrong and rand=0 is right | |
2624 false.neg <- sapply(cutoff, get.false.neg.count) | |
2625 false.pos <- sapply(cutoff, get.false.pos.count) | |
2626 | |
2627 true.pos <- sum(rand==0)-false.neg | |
2628 true.neg <- sum(rand==1)-false.pos | |
2629 | |
2630 sensitivity <- true.pos/(true.pos+false.neg) | |
2631 specificity <- true.neg/(true.neg+false.pos) | |
2632 | |
2633 return(list(decoy=decoy, forward=forward, cutoff=cutoff, est.fdr=est.fdr, emp.fdr=emp.fdr, sensitivity=sensitivity, specificity=specificity)) | |
2634 } | |
2635 | |
2636 | |
2637 # compute the em for jackknife and all data, and find FDR | |
2638 get.emp.jack <- function(a, p0){ | |
2639 | |
2640 nobs <- length(a$labels) | |
2641 est <- list() | |
2642 est.all <- list() | |
2643 | |
2644 temp.all <- em.transform(-a$p.x, -a$p.y, mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01) | |
2645 # temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7, | |
2646 # rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") | |
2647 | |
2648 est.all$p <- temp.all$para$p | |
2649 est.all$rho1 <- temp.all$para$rho1 | |
2650 est.all$FDR <- get.FDR(temp.all$e.z) | |
2651 | |
2652 FDR <- list() | |
2653 p <- c() | |
2654 rho1 <- c() | |
2655 | |
2656 | |
2657 for(i in 1:nobs){ | |
2658 | |
2659 temp <- em.transform(-a$p.x[-i], -a$p.y[-i], mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01) | |
2660 # temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7, | |
2661 # rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") | |
2662 | |
2663 est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z)) | |
2664 | |
2665 FDR[[i]] <- est[[i]]$FDR # this is the FDR for top n peaks | |
2666 p[i] <- est[[i]]$p | |
2667 rho1[i] <- est[[i]]$rho1 | |
2668 } | |
2669 | |
2670 est.jack <- list(FDR=FDR, p=p, rho1=rho1) | |
2671 return(list(est.jack=est.jack, est.all=est.all)) | |
2672 } | |
2673 | |
2674 | |
2675 # get the npeaks corresponding to the nominal FDR estimated from the sample | |
2676 # and find the corresponding FDR from the entire data | |
2677 get.FDR.jack <- function(est, FDR.nominal){ | |
2678 | |
2679 nobs <- length(est$est.jack$FDR) | |
2680 FDR.all <- c() | |
2681 top.n <- c() | |
2682 | |
2683 for(i in 1:nobs){ | |
2684 top.n[i] <- max(which(est$est.jack$FDR[[i]] <= FDR.nominal)) | |
2685 FDR.all[i] <- est$est.all$FDR[top.n[i]] | |
2686 } | |
2687 | |
2688 invisible(list(FDR.all=FDR.all, top.n=top.n)) | |
2689 } | |
2690 | |
2691 # compute jackknife pseudovalues | 
2692 # a is the dataset | |
2693 get.emp.IF <- function(a, p0){ | |
2694 | |
2695 nobs <- length(a$labels) | |
2696 est <- list() | |
2697 est.all <- list() | |
2698 | |
2699 temp.all <- em.2copula.quick(a$p.x, a$p.y, p0=p0, rho1.0=0.7, | |
2700 rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") | |
2701 | |
2702 est.all$p <- temp.all$para$p | |
2703 est.all$rho1 <- temp.all$para$rho1 | |
2704 est.all$FDR <- get.FDR(temp.all$e.z) | |
2705 | |
2706 IF.FDR <- list() | |
2707 IF.p <- c() | |
2708 IF.rho1 <- c() | |
2709 | |
2710 for(i in 1:nobs){ | |
2711 | |
2712 temp <- em.2copula.quick(a$p.x[-i], a$p.y[-i], p0=p0, rho1.0=0.7, | |
2713 rho2.0=0, eps=0.01, fix.p=T, stoc=F, fix.rho2=T, "gaussian") | |
2714 | |
2715 est[[i]] <- list(p=temp$para$p, rho1=temp$para$rho1, FDR=get.FDR(temp$e.z)) | |
2716 | |
2717 IF.FDR[[i]] <- (nobs-1)*(est.all$FDR[-nobs] - est[[i]]$FDR) # this is the FDR for top n peaks | |
2718 IF.p[i] <- (nobs-1)*(est.all$p - est[[i]]$p) | |
2719 IF.rho1[i] <- (nobs-1)*(est.all$rho1 - est[[i]]$rho1) | |
2720 } | |
2721 | |
2722 emp.IF <- list(FDR=IF.FDR, p=IF.p, rho1=IF.rho1) | |
2723 | |
2724 invisible(list(emp.IF=emp.IF, est.all=est.all, est=est)) | |
2725 } | |
2726 | |
2727 # e.z is the posterior probability of being in the signal component | 
2728 get.FDR <- function(e.z){ | |
2729 | |
2730 e.z.o <- order(1-e.z) | |
2731 FDR <- cumsum(1-e.z[e.z.o])/c(1:length(e.z.o)) | |
2732 | |
2733 invisible(FDR) | |
2734 } | |
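# Minimal sketch on assumed posterior probabilities: get.FDR() gives the running
# FDR when observations are taken in decreasing order of e.z.
if(FALSE){
  e.z.example <- c(0.99, 0.95, 0.60, 0.20)
  get.FDR(e.z.example)   # approx. 0.010 0.030 0.153 0.315 for the top 1..4
}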
2735 | |
2736 # get the FDR of selecting the top n peaks | |
2737 # IF.est is the sample influence function | |
2738 # top.n is the number of top peaks selected | 
2739 get.IF.FDR <- function(IF.est, top.n){ | |
2740 | |
2741 nobs <- length(IF.est$emp.IF$FDR) | |
2742 FDR <- c() | |
2743 | |
2744 # influence function of p | |
2745 for(i in 1:nobs) | |
2746 FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n] | |
2747 | |
2748 invisible(FDR) | |
2749 } | |
2750 | |
2751 # get the sample influence function for FDR at a given FDR size | |
2752 # 1. find the number of peaks selected at a given FDR computed from all obs | |
2753 # 2. use the number to find the sample influence function for FDR | |
2754 # IF.est$est.all is the FDR with all peaks | |
2755 get.IF.FDR.all <- function(IF.est, FDR.size){ | |
2756 | |
2757 top.n <- which.min(abs(IF.est$est.all$FDR -FDR.size)) | |
2758 nobs <- length(IF.est$est.all$FDR) | |
2759 FDR <- c() | |
2760 | |
2761 # influence function of p | |
2762 for(i in 1:nobs) | |
2763 FDR[i] <- IF.est$emp.IF$FDR[[i]][top.n] | |
2764 | |
2765 invisible(list(FDR=FDR, top.n=top.n)) | |
2766 } | |
2767 | |
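# note: this plotting helper assumes n0, n, rho1, rho2 and p already exist in the calling environment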
2768 plot.simu.uri <- function(x, y){ | |
2769 | |
2770 tt <- seq(0.01, 0.99, by=0.01) | |
2771 uri <- sapply(tt, comp.uri.prob, u=x, v=y) | |
2772 uri.thin <- uri[seq(1, length(tt), by=3)] | |
2773 tt.thin <- tt[seq(1, length(tt), by=3)] | |
2774 duri <- (uri.thin[-1]-uri.thin[-length(uri.thin)])/(tt.thin[-1]-tt.thin[-length(tt.thin)]) | |
2775 uri.spl <- smooth.spline(tt, uri, df=6.4) | |
2776 uri.der <- predict(uri.spl, tt, deriv=1) | |
2777 | |
2778 par(mfrow=c(2,2)) | |
2779 plot(x[1:n0], y[1:n0]) | |
2780 points(x[(n0+1):n], y[(n0+1):n], col=2) | |
2781 plot(rank(-x)[1:n0], rank(-y)[1:n0]) | |
2782 points(rank(-x)[(1+n0):n], rank(-y)[(1+n0):n]) | |
2783 plot(tt, uri) | |
2784 lines(c(0,1), c(0,1), lty=2) | |
2785 title(paste("rho1=", rho1, " rho2=", rho2, "p=", p, sep="")) | |
2786 plot(tt.thin[-1], duri) | |
2787 lines(uri.der) | |
2788 abline(h=1) | |
2789 invisible(list(x=x, y=y, uri=uri, tt=tt, duri=duri, tt.thin=tt.thin, uri.der=uri.der)) | |
2790 | |
2791 } | |
2792 | |
2793 | |
2794 ###### new fitting procedure | |
2795 | |
2796 | |
2797 | |
2798 | |
2799 # 1. rank pairs | |
2800 | |
2801 # 2. initialization | |
2802 # 3. convert to pseudo-number | |
2803 | |
2804 # 4. EM | |
2805 | |
2806 # need plugin and test | |
2807 # find the middle point between the bins | |
2808 get.pseudo.mix <- function(x, mu, sigma, rho, p){ | |
2809 | |
2810 | |
2811 # first compute cdf for points on the grid | |
2812 # generate 1000 points between min(-3, mu-3*sigma) and max(mu+3*sigma, 3) | 
2813 nw <- 1000 | |
2814 w <- seq(min(-3, mu-3*sigma), max(mu+3*sigma, 3), length=nw) | |
2815 w.cdf <- p*pnorm(w, mean=mu, sd=sigma) + (1-p)*pnorm(w, mean=0, sd=1) | |
2816 | |
2817 i <- 1 | |
2818 | |
2819 quan.x <- rep(NA, length(x)) | |
2820 | |
2821 for(i in c(1:(nw-1))){ # avoid indexing w.cdf[nw+1]; points outside the grid are handled below | 
2822 index <- which(x >= w.cdf[i] & x < w.cdf[i+1]) | |
2823 quan.x[index] <- (x[index]-w.cdf[i])*(w[i+1]-w[i])/(w.cdf[i+1]-w.cdf[i]) +w[i] | |
2824 } | |
2825 | |
2826 index <- which(x < w.cdf[1]) | |
2827 if(length(index)>0) | |
2828 quan.x[index] <- w[1] | |
2829 | |
2830 index <- which(x > w.cdf[nw]) | |
2831 if(length(index)>0) | |
2832 quan.x[index] <- w[nw] | |
2833 | |
2834 # linear.ext <- function(x, w, w.cdf){ | |
2835 # linear interpolation | |
2836 # index.up <- which(w.cdf>= x)[1] | |
2837 # left.index <- which(w.cdf <=x) | |
2838 # index.down <- left.index[length(left.index)] | |
2839 # quan.x <- (w[index.up] + w[index.down])/2 | |
2840 # } | |
2841 | |
2842 # x.pseudo <- sapply(x, linear.ext, w=w, w.cdf=w.cdf) | |
2843 | |
2844 # invisible(x.pseudo) | |
2845 invisible(quan.x) | |
2846 } | |
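# Minimal sketch (illustrative parameters): map an empirical cdf value to its
# pseudo-value under the current mixture. For a 50/50 mixture of N(0,1) and
# N(2,1), the median (cdf=0.5) sits at 1.
if(FALSE){
  get.pseudo.mix(0.5, mu=2, sigma=1, rho=0, p=0.5)   # close to 1
}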
2847 | |
2848 | |
2849 # EM to compute the latent structure | |
2850 # steps: | |
2851 # 1. raw values are first transformed into pseudovalues | |
2852 # 2. EM is used to compute the underlining structure, which is a mixture | |
2853 # of two normals | |
2854 em.transform <- function(x, y, mu, sigma, rho, p, eps){ | |
2855 | |
2856 x.cdf.func <- ecdf(x) | |
2857 y.cdf.func <- ecdf(y) | |
2858 afactor <- length(x)/(length(x)+1) | |
2859 x.cdf <- x.cdf.func(x)*afactor | |
2860 y.cdf <- y.cdf.func(y)*afactor | |
2861 | |
2862 # initialization | |
2863 para <- list() | |
2864 para$mu <- mu | |
2865 para$sigma <- sigma | |
2866 para$rho <- rho | |
2867 para$p <- p | |
2868 | |
2869 j <- 1 | |
2870 to.run <- T | |
2871 loglik.trace <- c() | |
2872 loglik.inner.trace <- c() | |
2873 | |
2874 #to.run.inner <- T | |
2875 z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) | |
2876 z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) | |
2877 | |
2878 # cat("length(z1)", length(z.1), "\n") | |
2879 while(to.run){ | |
2880 | |
2881 # get pseudo value in each cycle | |
2882 # z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) | |
2883 # z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) | |
2884 | |
2885 i <- 1 | |
2886 while(to.run){ | |
2887 | |
2888 # EM for latent structure | |
2889 e.z <- e.step.2normal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) | |
2890 para <- m.step.2normal(z.1, z.2, e.z) | |
2891 #para$rho <- rho | |
2892 #para$p <- p | |
2893 #para$mu <- mu | |
2894 #para$sigma <- sigma | |
2895 if(i > 1) | |
2896 l.old <- l.new | |
2897 | |
2898 # this is just the mixture likelihood of two-component Gaussian | |
2899 l.new <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) | |
2900 | |
2901 loglik.inner.trace[i] <- l.new | |
2902 | |
2903 if(i > 1){ | |
2904 to.run <- loglik.inner.trace[i]-loglik.inner.trace[i-1]>eps | |
2905 } | |
2906 | |
2907 | |
2908 # if(i > 2){ | |
2909 # l.inf <- loglik.inner.trace[i-2] + (loglik.inner.trace[i-1] - loglik.inner.trace[i-2])/(1-(loglik.inner.trace[i]-loglik.inner.trace[i-1])/(loglik.inner.trace[i-1]-loglik.inner.trace[i-2])) | |
2910 | |
2911 # if(loglik.inner.trace[i-1]!=loglik.inner.trace[i-2]) | |
2912 # to.run <- abs(l.inf - loglik.inner.trace[i]) > eps | |
2913 # else | |
2914 # to.run <- F | |
2915 | |
2916 # } | |
2917 | |
2918 cat("loglik.inner.trace[", i, "]=", loglik.inner.trace[i], "\n") | |
2919 cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n\n") | |
2920 | |
2921 i <- i+1 | |
2922 } | |
2923 | |
2924 | |
2925 # get pseudo value in each cycle | |
2926 z.1 <- get.pseudo.mix(x.cdf, para$mu, para$sigma, para$rho, para$p) | |
2927 z.2 <- get.pseudo.mix(y.cdf, para$mu, para$sigma, para$rho, para$p) | |
2928 | |
2929 if(j > 1) | |
2930 l.old.outer <- l.new.outer | |
2931 | |
2932 l.new.outer <- loglik.2binormal(z.1, z.2, para$mu, para$sigma, para$rho, para$p) | |
2933 | |
2934 loglik.trace[j] <- l.new.outer | |
2935 | |
2936 if(j == 1) | |
2937 to.run <- T | |
2938 else{ # stop when iteration>100 | |
2939 if(j > 100) | |
2940 to.run <- F | |
2941 else | |
2942 to.run <- l.new.outer - l.old.outer > eps | |
2943 } | |
2944 | |
2945 # if(j %% 10==0) | |
2946 cat("loglik.trace[", j, "]=", loglik.trace[j], "\n") | |
2947 cat("mu=", para$mu, "sigma=", para$sigma, "p=", para$p, "rho=", para$rho, "\n") | |
2948 | |
2949 j <- j+1 | |
2950 } | |
2951 | |
2952 bic <- -2*l.new + 4*log(length(z.1)) | |
2953 | |
2954 return(list(para=list(p=para$p, rho=para$rho, mu=para$mu, sigma=para$sigma), | |
2955 loglik=l.new, bic=bic, e.z=e.z, loglik.trace=loglik.trace)) | |
2956 } | |
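# Usage sketch with simulated data (illustrative starting values): em.transform()
# treats larger scores as more significant, which is why get.emp.jack() above
# passes in negated p-values.
if(FALSE){
  set.seed(4)
  sim.em.example <- simu.test.stat(p=0.3, n=500, mu1=2.5, sd1=1, mu0=0, sd0=1, sd.e=1)
  fit.example <- em.transform(-sim.em.example$p.x, -sim.em.example$p.y,
                              mu=1.5, sigma=1.4, rho=0.4, p=0.7, eps=0.01)
  fit.example$para   # estimated p, rho, mu, sigma
}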
2957 | |
2958 | |
2959 | |
2960 | |
2961 # compute log-likelihood for mixture of two bivariate normals | |
2962 loglik.2binormal <- function(z.1, z.2, mu, sigma, rho, p){ | |
2963 | |
2964 l.m <- sum(d.binormal(z.1, z.2, 0, 1, 0)+log(p*exp(d.binormal(z.1, z.2, mu, sigma, rho)-d.binormal(z.1, z.2, 0, 1, 0))+(1-p))) | |
2965 | |
2966 # l.m <- sum((p*d.binormal(z.1, z.2, mu, sigma, rho) + (1-p)*d.binormal(z.1, z.2, 0, 1, 0))) | |
2967 return(l.m) | |
2968 } | |
2969 | |
2970 # check this when rho=1 | |
2971 | |
2972 # log-density of a bivariate normal distribution with equal mean and sigma on both dimensions | 
2973 d.binormal <- function(z.1, z.2, mu, sigma, rho){ | |
2974 | |
2975 loglik <- (-log(2)-log(pi)-2*log(sigma) - log(1-rho^2)/2 - (0.5/(1-rho^2)/sigma^2)*((z.1-mu)^2 -2*rho*(z.1-mu)*(z.2-mu) + (z.2-mu)^2)) | |
2976 | |
2977 return(loglik) | |
2978 } | |
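# Quick sanity check: for the standard bivariate normal (mu=0, sigma=1, rho=0)
# the density at the origin is 1/(2*pi).
if(FALSE){
  exp(d.binormal(0, 0, mu=0, sigma=1, rho=0))   # approx. 0.159
}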
2979 | |
2980 # E-step for computing the latent structure | 
2981 # e.z is the prob to be in the consistent group | |
2982 # e.step for estimating posterior prob | |
2983 # z.1 and z.2 can be vectors or scalars | |
2984 e.step.2normal <- function(z.1, z.2, mu, sigma, rho, p){ | |
2985 | |
2986 e.z <- p/((1-p)*exp(d.binormal(z.1, z.2, 0, 1, 0)-d.binormal(z.1, z.2, mu, sigma, rho))+ p) | |
2987 | |
2988 invisible(e.z) | |
2989 } | |
2990 | |
2991 # M-step for computing the latent structure | |
2992 # m.step for estimating proportion, mean, sd and correlation coefficient | |
2993 m.step.2normal <- function(z.1, z.2, e.z){ | |
2994 | |
2995 p <- mean(e.z) | |
2996 mu <- sum((z.1+z.2)*e.z)/2/sum(e.z) | |
2997 sigma <- sqrt(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))/2/sum(e.z)) | |
2998 rho <- 2*sum(e.z*(z.1-mu)*(z.2-mu))/(sum(e.z*((z.1-mu)^2+(z.2-mu)^2))) | |
2999 | |
3000 return(list(p=p, mu=mu, sigma=sigma, rho=rho)) | |
3001 } | |
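# Minimal sketch on simulated pseudo-data (illustrative starting values): one
# E-step followed by one M-step, as iterated inside em.transform().
if(FALSE){
  set.seed(3)
  signal.mean.example <- rnorm(300, 2, 1)
  z1.example <- c(signal.mean.example + rnorm(300, 0, 0.5), rnorm(700))
  z2.example <- c(signal.mean.example + rnorm(300, 0, 0.5), rnorm(700))
  e.z.example <- e.step.2normal(z1.example, z2.example, mu=2, sigma=1, rho=0.6, p=0.3)
  m.step.2normal(z1.example, z2.example, e.z.example)   # updated p, mu, sigma, rho
}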
3002 | |
3003 | |
3004 # assume top p percent of observations are true | |
3005 # x and y are ranks; estimate initial values of the parameters | 
3006 init <- function(x, y, x.label){ | |
3007 | |
3008 x.o <- order(x) | |
3009 | |
3010 x.ordered <- x[x.o] | |
3011 y.ordered <- y[x.o] | |
3012 x.label.ordered <- x.label[x.o] | |
3013 | |
3014 n <- length(x) | |
3015 p <- sum(x.label)/n | |
3016 | |
3017 rho <- cor(x.ordered[1:ceiling(p*n)], y.ordered[1:ceiling(p*n)]) | |
3018 | |
3019 temp <- find.mu.sigma(x.ordered, x.label.ordered) | |
3020 mu <- temp$mu | |
3021 sigma <- temp$sigma | |
3022 | |
3023 invisible(list(mu=mu, sigma=sigma, rho=rho, p=p)) | |
3024 | |
3025 } | |
3026 | |
3027 # find mu and sigma if the distributions of marginal ranks are known | |
3028 # take the medians of the two dist and map back to the original | |
3029 init.dist <- function(f0, f1){ | |
3030 | |
3031 # take the median in f0 | |
3032 index.median.0 <- which(f0$cdf>0.5)[1] | |
3033 q.0.small <- f0$cdf[index.median.0] # because f0 and f1 have the same bins | |
3034 q.1.small <- f1$cdf[index.median.0] | |
3035 | |
3036 # take the median in f1 | |
3037 index.median.1 <- which(f1$cdf>0.5)[1] | |
3038 q.0.big <- f0$cdf[index.median.1] # because f0 and f1 have the same bins | |
3039 q.1.big <- f1$cdf[index.median.1] | |
3040 | |
3041 # find pseudo value for x.middle[1] on normal(0,1) | |
3042 pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1) | |
3043 pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1) | |
3044 | |
3045 # find pseudo value for x.middle[2] on normal(0,1) | |
3046 pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1) | |
3047 pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1) | |
3048 | |
3049 mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) | |
3050 | |
3051 sigma <- (pseudo.small.0-mu)/pseudo.small.1 | |
3052 | |
3053 return(list(mu=mu, sigma=sigma)) | |
3054 } | |
3055 | |
3056 # generate labels | |
3057 | |
3058 # find the part of data with overlap | |
3059 | |
3060 # find the percentile on noise and signal | |
3061 | |
3062 # Suppose there are signal and noise components, with mean=0 and sd=1 for noise | |
3063 # x and x.label are the rank of the observations and their labels, | |
3064 # find the mean and sd of the other component | |
3065 # x.label takes values of 0 and 1 | |
3066 find.mu.sigma <- function(x, x.label){ | |
3067 | |
3068 x.0 <- x[x.label==0] | |
3069 x.1 <- x[x.label==1] | |
3070 | |
3071 n.x0 <- length(x.0) | |
3072 n.x1 <- length(x.1) | |
3073 | |
3074 x.end <- c(min(x.0), min(x.1), max(x.0), max(x.1)) | |
3075 o <- order(x.end) | |
3076 x.middle <- x.end[o][c(2,3)] | |
3077 | |
3078 # the smaller end of the overlap | |
3079 q.1.small <- mean(x.1 <= x.middle[1])*n.x1/(n.x1+1) | |
3080 q.0.small <- mean(x.0 <= x.middle[1])*n.x0/(n.x0+1) | |
3081 | |
3082 # the bigger end of the overlap | |
3083 q.1.big <- mean(x.1 <= x.middle[2])*n.x1/(n.x1+1) | |
3084 q.0.big <- mean(x.0 <= x.middle[2])*n.x0/(n.x0+1) | |
3085 | |
3086 # find pseudo value for x.middle[1] on normal(0,1) | |
3087 pseudo.small.0 <- qnorm(q.0.small, mean=0, sd=1) | |
3088 pseudo.small.1 <- qnorm(q.1.small, mean=0, sd=1) | |
3089 | |
3090 # find pseudo value for x.middle[2] on normal(0,1) | |
3091 pseudo.big.0 <- qnorm(q.0.big, mean=0, sd=1) | |
3092 pseudo.big.1 <- qnorm(q.1.big, mean=0, sd=1) | |
3093 | |
3094 mu <- (pseudo.small.0*pseudo.big.1 - pseudo.small.1*pseudo.big.0)/(pseudo.big.1-pseudo.small.1) | |
3095 | |
3096 sigma <- (pseudo.small.0-mu)/pseudo.small.1 | |
3097 | |
3098 return(list(mu=mu, sigma=sigma)) | |
3099 } |
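# Minimal sketch on simulated ranks and labels: recover rough estimates of the
# signal mean and sd relative to a standard-normal noise component. The estimate
# uses the overlap endpoints, so it is only approximate.
if(FALSE){
  set.seed(5)
  scores.example <- c(rnorm(500), rnorm(500, mean=2, sd=1))
  labels.example <- c(rep(0, 500), rep(1, 500))
  find.mu.sigma(rank(scores.example), labels.example)
}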