Mercurial > repos > idot > coverage_correlation
annotate corr.R @ 11:74bfa2464411
the bioconductor GRanges stuff blows
author | Ido Tamir <ido.tamir@imp.ac.at> |
---|---|
date | Sun, 18 Aug 2013 19:39:52 +0200 |
parents | ecad2cd5038f |
children | f3e037496c18 |
rev | line source |
---|---|
2 | 1 #!/usr/bin/env Rscript |
2 # | |
3 #for SGE | |
4 #$ -S /biosw/debian5-x86_64/R/3.0.0/bin/Rscript | |
5 #$ -q solexa.q | |
6 #$ -P pipeline | |
7 #$ -cwd | |
8 #$ -l vf=30G | |
9 | |
10 #args[1] <- comma separated bigwig input files | |
11 #args[2] <- comma separated names of files | |
12 #args[3] <- comma separated file formats | |
13 #args[4] <- output pdf | |
14 #args[5] <- output matrix | |
15 #args[6] <- title | |
16 #args[7] <- optional mappability bed file #not implemented because we have to change the coverage function (covWith0) to 1. intersection then diff because mappability can be wrong | |
17 | |
18 options("useFancyQuotes" = FALSE) | |
19 | |
20 library(rtracklayer) | |
21 library(lattice) | |
22 library(latticeExtra) | |
23 library(hexbin) | |
24 | |
25 ## creates the union of coverages i.e. mappable regions | |
26 ## todo: add mappability argument | |
27 createMappable <- function(coverages){ | |
28 suppressWarnings(Reduce(union, coverages)) #supress because missing chromosomes spit warinings | |
29 } | |
30 | |
31 ## adds mappable regions as 0 coverage to track | |
32 covWith0 <- function(cov, mappable){ | |
9
ecad2cd5038f
debug message was making truble
Ido Tamir <ido.tamir@imp.ac.at>
parents:
6
diff
changeset
|
33 print(paste("cov:", summary(cov))) |
3 | 34 c0 <- setdiff(mappable, cov) |
2 | 35 cus <- if(length(c0) > 0){ |
36 elementMetadata(c0)$score <- 0 | |
37 sort(c(c0, cov)) | |
38 }else{ | |
39 sort(cov) | |
40 } | |
41 rc <- Rle(score(cus), ranges(cus)@width) | |
42 rc | |
43 } | |
44 | |
45 ## unlisted rlelist is too long. Taking weighted mean as approximation. | |
46 correlate <- function(rlA, rlB){ | |
47 weighted.mean(cor(rlA, rlB), w=unlist(lapply(rlA, length))) | |
48 } | |
49 | |
50 | |
51 ## correlation dist | |
52 corrDist <- function(covRles, outnames){ | |
3 | 53 print("calculating correlation") |
54 vl <- length(covRles) | |
2 | 55 v <- 1:vl |
56 o <- matrix(NA,vl,vl) | |
57 colnames(o) <- outnames | |
58 rownames(o) <- outnames | |
59 tri <- lower.tri(o, diag=FALSE) | |
60 indic <- expand.grid(v,v) | |
61 pairs <- indic[tri,] | |
62 corrs <- apply(pairs,1, function(ins){ | |
63 index1 <- ins[1] | |
64 index2 <- ins[2] | |
65 cor(covRles[[index1]], covRles[[index2]]) | |
66 # correlate(covRles[[index1]], covRles[[index2]]) | |
67 }) | |
68 | |
69 o[lower.tri(o, diag=FALSE)] <- corrs | |
70 o[upper.tri(o)] <- t(o)[upper.tri(o)] | |
71 diag(o) <- 1 | |
72 o | |
73 } | |
74 | |
75 | |
76 | |
77 plotCorr <- function(corrMat, title){ | |
78 cor.d <- dist(corrMat) | |
79 cor.row <- as.dendrogram( hclust( cor.d )) | |
80 ord <- order.dendrogram(cor.row) | |
81 levelplot( | |
82 corrMat[ord,ord], | |
83 scales=list(x=list(rot=90)), | |
84 colorkey = list(space="left"), | |
85 col.regions=BTY(100), | |
86 xlab = "", | |
87 ylab = "pearson correlation coefficient", | |
88 legend = list(top=list( fun = dendrogramGrob, args= list( | |
89 x=cor.row, | |
90 ord = ord, | |
91 side = "top" | |
92 ))), | |
93 main=title | |
94 ) | |
95 | |
96 } | |
97 | |
98 calcAndPlot <- function(coverages, outnames, mappable, outnamePDF, outnameMat, title){ | |
99 | |
100 ca <- lapply(coverages, covWith0, mappable) | |
101 | |
102 cd <- corrDist(ca, outnames) | |
103 write.table(cd, outnameMat, quote=FALSE, sep="\t", col.names=FALSE) | |
104 pdf(outnamePDF) | |
105 print(plotCorr(cd, title)) | |
106 dev.off() | |
107 | |
108 } | |
109 | |
110 getCoverage <- function(infile, format){ | |
111 print(paste("reading", infile, format)) | |
112 values <- import(infile, format=format, asRangedData = FALSE) | |
113 if(tolower(format) %in% c("bigwig","wig")){ | |
114 values | |
115 }else{ | |
116 cov <- coverage(values) | |
117 gcov <- as(cov, "GRanges") | |
118 gcov[score(gcov) > 0] | |
119 } | |
120 } | |
121 | |
122 | |
123 getCoverages <- function(infiles, formats){ | |
124 apply(data.frame(file=infiles,format=formats),1, function(row){ getCoverage(row['file'], row['format']) }) | |
125 } | |
126 | |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
127 getMappable <- function(coverages, mappable){ |
9
ecad2cd5038f
debug message was making truble
Ido Tamir <ido.tamir@imp.ac.at>
parents:
6
diff
changeset
|
128 print(paste("creating mappability", length(coverages), mappable)) |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
129 if(is.na(mappable)){ |
2 | 130 createMappable(coverages) |
131 }else{ | |
132 import(mappable, format="bed", asRangedData = FALSE) | |
133 } | |
134 } | |
135 | |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
136 #I hate this bioconductor ranges stuff more and more |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
137 #this is a super s**t api |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
138 mergeSeqInfo <- function(coverages){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
139 seqinfos <- do.call("rbind", lapply(coverages, function(cov){ d <- as.data.frame(cov@seqinfo); d$seqnames <- rownames(d); d})) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
140 seqinfosMax <- do.call("rbind", by(seqinfos, seqinfos$seqnames, function(byChr){ byChr[order(byChr$seqlengths, decreasing=TRUE),][1,] })) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
141 data.frame(seqnames=seqinfosMax$seqnames,seqlengths=seqinfosMax$seqlengths, isCircular=seqinfosMax$isCircular, genome=seqinfosMax$genome) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
142 } |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
143 |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
144 |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
145 changeSeqInfo <- function(coverages){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
146 seqinfo <- mergeSeqInfo(coverages) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
147 lapply(coverages, function(cov){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
148 si <- as.data.frame(cov@seqinfo) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
149 si <- data.frame(chr=rownames(si), si) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
150 sis <- subset(seqinfo, seqnames %in% si$chr) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
151 si <- Seqinfo(seqnames=as.character(sis$seqnames), seqlengths=sis$seqlengths, isCircular=sis$isCircular, genome=sis$genome) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
152 cov@seqinfo <- si |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
153 cov |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
154 }) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
155 } |
2 | 156 |
157 args <- commandArgs(TRUE) | |
158 | |
159 infiles <- unlist(strsplit(args[1], ",")) | |
160 outnames <- unlist(strsplit(args[2], ",")) | |
161 formats <- unlist(strsplit(args[3], ",")) | |
162 outnamePDF <- args[4] | |
163 outnameMat <- args[5] | |
164 title <- args[6] | |
165 mappable <- args[7] | |
166 | |
167 lp <- list(infiles,outnames,formats,outnamePDF,outnameMat,title,mappable) | |
168 lpp <- sapply(lp, paste, sep=" ") | |
169 print("parsed input parameters") | |
170 print(lpp) | |
171 | |
172 | |
173 coverages <- getCoverages(infiles, formats) | |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
174 coverages <- changeSeqInfo(coverages) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
175 |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
176 mappability <- getMappable(coverages, mappable) |
2 | 177 |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
178 |
2 | 179 calcAndPlot(coverages, outnames, mappability, outnamePDF, outnameMat,title) |
180 | |
181 | |
182 | |
183 |