Mercurial > repos > idot > coverage_correlation
annotate corr.R @ 14:5a4cbfb5fbc5 default tip
loading R3.0.2
author | Ido Tamir <ido.tamir@imp.ac.at> |
---|---|
date | Mon, 27 Oct 2014 10:57:35 +0100 |
parents | de452b96da8e |
children |
rev | line source |
---|---|
2 | 1 #!/usr/bin/env Rscript |
2 # | |
3 #for SGE | |
4 #$ -S /biosw/debian5-x86_64/R/3.0.0/bin/Rscript | |
5 #$ -q solexa.q | |
6 #$ -P pipeline | |
7 #$ -cwd | |
8 #$ -l vf=30G | |
9 | |
10 #args[1] <- comma separated bigwig input files | |
11 #args[2] <- comma separated names of files | |
12 #args[3] <- comma separated file formats | |
13 #args[4] <- output pdf | |
14 #args[5] <- output matrix | |
15 #args[6] <- title | |
16 #args[7] <- optional mappability bed file #not implemented because we have to change the coverage function (covWith0) to 1. intersection then diff because mappability can be wrong | |
17 | |
18 options("useFancyQuotes" = FALSE) | |
19 | |
13 | 20 TEST=FALSE |
21 | |
2 | 22 library(rtracklayer) |
23 library(lattice) | |
24 library(latticeExtra) | |
25 library(hexbin) | |
13 | 26 library(Rsamtools) |
2 | 27 |
28 ## creates the union of coverages i.e. mappable regions | |
29 ## todo: add mappability argument | |
30 createMappable <- function(coverages){ | |
31 suppressWarnings(Reduce(union, coverages)) #supress because missing chromosomes spit warinings | |
32 } | |
33 | |
34 ## adds mappable regions as 0 coverage to track | |
35 covWith0 <- function(cov, mappable){ | |
9
ecad2cd5038f
debug message was making truble
Ido Tamir <ido.tamir@imp.ac.at>
parents:
6
diff
changeset
|
36 print(paste("cov:", summary(cov))) |
3 | 37 c0 <- setdiff(mappable, cov) |
2 | 38 cus <- if(length(c0) > 0){ |
39 elementMetadata(c0)$score <- 0 | |
40 sort(c(c0, cov)) | |
41 }else{ | |
42 sort(cov) | |
43 } | |
44 rc <- Rle(score(cus), ranges(cus)@width) | |
45 rc | |
46 } | |
47 | |
48 ## unlisted rlelist is too long. Taking weighted mean as approximation. | |
49 correlate <- function(rlA, rlB){ | |
50 weighted.mean(cor(rlA, rlB), w=unlist(lapply(rlA, length))) | |
51 } | |
52 | |
53 | |
54 ## correlation dist | |
55 corrDist <- function(covRles, outnames){ | |
3 | 56 print("calculating correlation") |
57 vl <- length(covRles) | |
2 | 58 v <- 1:vl |
59 o <- matrix(NA,vl,vl) | |
60 colnames(o) <- outnames | |
61 rownames(o) <- outnames | |
62 tri <- lower.tri(o, diag=FALSE) | |
63 indic <- expand.grid(v,v) | |
64 pairs <- indic[tri,] | |
65 corrs <- apply(pairs,1, function(ins){ | |
66 index1 <- ins[1] | |
67 index2 <- ins[2] | |
68 cor(covRles[[index1]], covRles[[index2]]) | |
69 # correlate(covRles[[index1]], covRles[[index2]]) | |
70 }) | |
71 | |
72 o[lower.tri(o, diag=FALSE)] <- corrs | |
73 o[upper.tri(o)] <- t(o)[upper.tri(o)] | |
74 diag(o) <- 1 | |
75 o | |
76 } | |
77 | |
78 | |
79 | |
80 plotCorr <- function(corrMat, title){ | |
81 cor.d <- dist(corrMat) | |
82 cor.row <- as.dendrogram( hclust( cor.d )) | |
83 ord <- order.dendrogram(cor.row) | |
84 levelplot( | |
85 corrMat[ord,ord], | |
86 scales=list(x=list(rot=90)), | |
87 colorkey = list(space="left"), | |
88 col.regions=BTY(100), | |
89 xlab = "", | |
90 ylab = "pearson correlation coefficient", | |
91 legend = list(top=list( fun = dendrogramGrob, args= list( | |
92 x=cor.row, | |
93 ord = ord, | |
94 side = "top" | |
95 ))), | |
96 main=title | |
97 ) | |
98 | |
99 } | |
100 | |
101 calcAndPlot <- function(coverages, outnames, mappable, outnamePDF, outnameMat, title){ | |
102 | |
103 ca <- lapply(coverages, covWith0, mappable) | |
104 | |
105 cd <- corrDist(ca, outnames) | |
106 write.table(cd, outnameMat, quote=FALSE, sep="\t", col.names=FALSE) | |
107 pdf(outnamePDF) | |
108 print(plotCorr(cd, title)) | |
109 dev.off() | |
110 | |
111 } | |
112 | |
113 getCoverage <- function(infile, format){ | |
114 print(paste("reading", infile, format)) | |
13 | 115 values <- if(format == "bam"){ |
116 import(infile, format=format) | |
117 }else{ | |
118 import(infile, format=format, asRangedData = FALSE) | |
119 } | |
2 | 120 if(tolower(format) %in% c("bigwig","wig")){ |
121 values | |
122 }else{ | |
123 cov <- coverage(values) | |
124 gcov <- as(cov, "GRanges") | |
125 gcov[score(gcov) > 0] | |
126 } | |
127 } | |
128 | |
129 | |
130 getCoverages <- function(infiles, formats){ | |
131 apply(data.frame(file=infiles,format=formats),1, function(row){ getCoverage(row['file'], row['format']) }) | |
132 } | |
133 | |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
134 getMappable <- function(coverages, mappable){ |
9
ecad2cd5038f
debug message was making truble
Ido Tamir <ido.tamir@imp.ac.at>
parents:
6
diff
changeset
|
135 print(paste("creating mappability", length(coverages), mappable)) |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
136 if(is.na(mappable)){ |
2 | 137 createMappable(coverages) |
138 }else{ | |
139 import(mappable, format="bed", asRangedData = FALSE) | |
140 } | |
141 } | |
142 | |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
143 #I hate this bioconductor ranges stuff more and more |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
144 #this is a super s**t api |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
145 mergeSeqInfo <- function(coverages){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
146 seqinfos <- do.call("rbind", lapply(coverages, function(cov){ d <- as.data.frame(cov@seqinfo); d$seqnames <- rownames(d); d})) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
147 seqinfosMax <- do.call("rbind", by(seqinfos, seqinfos$seqnames, function(byChr){ byChr[order(byChr$seqlengths, decreasing=TRUE),][1,] })) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
148 data.frame(seqnames=seqinfosMax$seqnames,seqlengths=seqinfosMax$seqlengths, isCircular=seqinfosMax$isCircular, genome=seqinfosMax$genome) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
149 } |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
150 |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
151 |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
152 changeSeqInfo <- function(coverages){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
153 seqinfo <- mergeSeqInfo(coverages) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
154 lapply(coverages, function(cov){ |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
155 si <- as.data.frame(cov@seqinfo) |
12
f3e037496c18
its even checking the levels
Ido Tamir <ido.tamir@imp.ac.at>
parents:
11
diff
changeset
|
156 si <- data.frame(chr=rownames(si), si, stringsAsFactors=FALSE) |
f3e037496c18
its even checking the levels
Ido Tamir <ido.tamir@imp.ac.at>
parents:
11
diff
changeset
|
157 sis <- subset(seqinfo, seqnames %in% si$chr) |
f3e037496c18
its even checking the levels
Ido Tamir <ido.tamir@imp.ac.at>
parents:
11
diff
changeset
|
158 sis <- sis[match(si$chr, sis$seqnames),] |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
159 si <- Seqinfo(seqnames=as.character(sis$seqnames), seqlengths=sis$seqlengths, isCircular=sis$isCircular, genome=sis$genome) |
12
f3e037496c18
its even checking the levels
Ido Tamir <ido.tamir@imp.ac.at>
parents:
11
diff
changeset
|
160 cov@seqinfo <- si |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
161 cov |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
162 }) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
163 } |
2 | 164 |
165 args <- commandArgs(TRUE) | |
166 | |
13 | 167 if(TEST){ |
168 args[1] <- c("file1.bed,file2.bed,file3.bed,file4.bed") | |
169 args[1] <- c("file1.bam,file2.bam,file3.bam,file4.bam") | |
170 args[2] <- c("file1,file2,file3,file4") | |
171 args[3] <- c("bam,bam,bam,bam") | |
172 args[4] <- "out.pdf" | |
173 args[5] <- "mat.mat" | |
174 args[6] <- "title" | |
175 args[7] <- NA | |
176 } | |
177 | |
2 | 178 infiles <- unlist(strsplit(args[1], ",")) |
179 outnames <- unlist(strsplit(args[2], ",")) | |
180 formats <- unlist(strsplit(args[3], ",")) | |
181 outnamePDF <- args[4] | |
182 outnameMat <- args[5] | |
183 title <- args[6] | |
184 mappable <- args[7] | |
185 | |
186 lp <- list(infiles,outnames,formats,outnamePDF,outnameMat,title,mappable) | |
187 lpp <- sapply(lp, paste, sep=" ") | |
188 print("parsed input parameters") | |
189 print(lpp) | |
190 | |
191 | |
192 coverages <- getCoverages(infiles, formats) | |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
193 coverages <- changeSeqInfo(coverages) |
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
194 |
6
fcf85568a102
dont know how this worked. added coverages to mappable
Ido Tamir <ido.tamir@imp.ac.at>
parents:
3
diff
changeset
|
195 mappability <- getMappable(coverages, mappable) |
2 | 196 |
11
74bfa2464411
the bioconductor GRanges stuff blows
Ido Tamir <ido.tamir@imp.ac.at>
parents:
9
diff
changeset
|
197 |
2 | 198 calcAndPlot(coverages, outnames, mappability, outnamePDF, outnameMat,title) |
199 | |
200 | |
201 | |
202 |