Mercurial > repos > idot > coverage_correlation
view corr.R @ 14:5a4cbfb5fbc5 default tip
loading R3.0.2
author | Ido Tamir <ido.tamir@imp.ac.at> |
---|---|
date | Mon, 27 Oct 2014 10:57:35 +0100 |
parents | de452b96da8e |
children |
line wrap: on
line source
#!/usr/bin/env Rscript # #for SGE #$ -S /biosw/debian5-x86_64/R/3.0.0/bin/Rscript #$ -q solexa.q #$ -P pipeline #$ -cwd #$ -l vf=30G #args[1] <- comma separated bigwig input files #args[2] <- comma separated names of files #args[3] <- comma separated file formats #args[4] <- output pdf #args[5] <- output matrix #args[6] <- title #args[7] <- optional mappability bed file #not implemented because we have to change the coverage function (covWith0) to 1. intersection then diff because mappability can be wrong options("useFancyQuotes" = FALSE) TEST=FALSE library(rtracklayer) library(lattice) library(latticeExtra) library(hexbin) library(Rsamtools) ## creates the union of coverages i.e. mappable regions ## todo: add mappability argument createMappable <- function(coverages){ suppressWarnings(Reduce(union, coverages)) #supress because missing chromosomes spit warinings } ## adds mappable regions as 0 coverage to track covWith0 <- function(cov, mappable){ print(paste("cov:", summary(cov))) c0 <- setdiff(mappable, cov) cus <- if(length(c0) > 0){ elementMetadata(c0)$score <- 0 sort(c(c0, cov)) }else{ sort(cov) } rc <- Rle(score(cus), ranges(cus)@width) rc } ## unlisted rlelist is too long. Taking weighted mean as approximation. correlate <- function(rlA, rlB){ weighted.mean(cor(rlA, rlB), w=unlist(lapply(rlA, length))) } ## correlation dist corrDist <- function(covRles, outnames){ print("calculating correlation") vl <- length(covRles) v <- 1:vl o <- matrix(NA,vl,vl) colnames(o) <- outnames rownames(o) <- outnames tri <- lower.tri(o, diag=FALSE) indic <- expand.grid(v,v) pairs <- indic[tri,] corrs <- apply(pairs,1, function(ins){ index1 <- ins[1] index2 <- ins[2] cor(covRles[[index1]], covRles[[index2]]) # correlate(covRles[[index1]], covRles[[index2]]) }) o[lower.tri(o, diag=FALSE)] <- corrs o[upper.tri(o)] <- t(o)[upper.tri(o)] diag(o) <- 1 o } plotCorr <- function(corrMat, title){ cor.d <- dist(corrMat) cor.row <- as.dendrogram( hclust( cor.d )) ord <- order.dendrogram(cor.row) levelplot( corrMat[ord,ord], scales=list(x=list(rot=90)), colorkey = list(space="left"), col.regions=BTY(100), xlab = "", ylab = "pearson correlation coefficient", legend = list(top=list( fun = dendrogramGrob, args= list( x=cor.row, ord = ord, side = "top" ))), main=title ) } calcAndPlot <- function(coverages, outnames, mappable, outnamePDF, outnameMat, title){ ca <- lapply(coverages, covWith0, mappable) cd <- corrDist(ca, outnames) write.table(cd, outnameMat, quote=FALSE, sep="\t", col.names=FALSE) pdf(outnamePDF) print(plotCorr(cd, title)) dev.off() } getCoverage <- function(infile, format){ print(paste("reading", infile, format)) values <- if(format == "bam"){ import(infile, format=format) }else{ import(infile, format=format, asRangedData = FALSE) } if(tolower(format) %in% c("bigwig","wig")){ values }else{ cov <- coverage(values) gcov <- as(cov, "GRanges") gcov[score(gcov) > 0] } } getCoverages <- function(infiles, formats){ apply(data.frame(file=infiles,format=formats),1, function(row){ getCoverage(row['file'], row['format']) }) } getMappable <- function(coverages, mappable){ print(paste("creating mappability", length(coverages), mappable)) if(is.na(mappable)){ createMappable(coverages) }else{ import(mappable, format="bed", asRangedData = FALSE) } } #I hate this bioconductor ranges stuff more and more #this is a super s**t api mergeSeqInfo <- function(coverages){ seqinfos <- do.call("rbind", lapply(coverages, function(cov){ d <- as.data.frame(cov@seqinfo); d$seqnames <- rownames(d); d})) seqinfosMax <- do.call("rbind", by(seqinfos, seqinfos$seqnames, function(byChr){ byChr[order(byChr$seqlengths, decreasing=TRUE),][1,] })) data.frame(seqnames=seqinfosMax$seqnames,seqlengths=seqinfosMax$seqlengths, isCircular=seqinfosMax$isCircular, genome=seqinfosMax$genome) } changeSeqInfo <- function(coverages){ seqinfo <- mergeSeqInfo(coverages) lapply(coverages, function(cov){ si <- as.data.frame(cov@seqinfo) si <- data.frame(chr=rownames(si), si, stringsAsFactors=FALSE) sis <- subset(seqinfo, seqnames %in% si$chr) sis <- sis[match(si$chr, sis$seqnames),] si <- Seqinfo(seqnames=as.character(sis$seqnames), seqlengths=sis$seqlengths, isCircular=sis$isCircular, genome=sis$genome) cov@seqinfo <- si cov }) } args <- commandArgs(TRUE) if(TEST){ args[1] <- c("file1.bed,file2.bed,file3.bed,file4.bed") args[1] <- c("file1.bam,file2.bam,file3.bam,file4.bam") args[2] <- c("file1,file2,file3,file4") args[3] <- c("bam,bam,bam,bam") args[4] <- "out.pdf" args[5] <- "mat.mat" args[6] <- "title" args[7] <- NA } infiles <- unlist(strsplit(args[1], ",")) outnames <- unlist(strsplit(args[2], ",")) formats <- unlist(strsplit(args[3], ",")) outnamePDF <- args[4] outnameMat <- args[5] title <- args[6] mappable <- args[7] lp <- list(infiles,outnames,formats,outnamePDF,outnameMat,title,mappable) lpp <- sapply(lp, paste, sep=" ") print("parsed input parameters") print(lpp) coverages <- getCoverages(infiles, formats) coverages <- changeSeqInfo(coverages) mappability <- getMappable(coverages, mappable) calcAndPlot(coverages, outnames, mappability, outnamePDF, outnameMat,title)