view corr.R @ 14:5a4cbfb5fbc5 default tip

loading R3.0.2
author Ido Tamir <ido.tamir@imp.ac.at>
date Mon, 27 Oct 2014 10:57:35 +0100
parents de452b96da8e
children
line wrap: on
line source

#!/usr/bin/env Rscript
#
#for SGE
#$ -S /biosw/debian5-x86_64/R/3.0.0/bin/Rscript 
#$ -q solexa.q
#$ -P pipeline
#$ -cwd
#$ -l vf=30G

#args[1] <- comma separated bigwig input files
#args[2] <- comma separated names of files
#args[3] <- comma separated file formats
#args[4] <- output pdf
#args[5] <- output matrix
#args[6] <- title
#args[7] <- optional mappability bed file #not implemented because we have to change the coverage function (covWith0) to 1. intersection then diff because mappability can be wrong

options("useFancyQuotes" = FALSE)

TEST=FALSE

library(rtracklayer)
library(lattice)
library(latticeExtra)
library(hexbin)
library(Rsamtools)

## creates the union of coverages i.e. mappable regions
## todo: add mappability argument
createMappable <- function(coverages){
    suppressWarnings(Reduce(union, coverages)) #supress because missing chromosomes spit warinings
}

## adds mappable regions as 0 coverage to track 
covWith0 <- function(cov, mappable){
    print(paste("cov:", summary(cov)))
	c0 <- setdiff(mappable, cov)
    cus <- if(length(c0) > 0){
	  elementMetadata(c0)$score <- 0
      sort(c(c0, cov))
	}else{
      sort(cov)
    }
    rc <- Rle(score(cus), ranges(cus)@width)
    rc
}

## unlisted rlelist is too long. Taking weighted mean as approximation.
correlate <- function(rlA, rlB){
   weighted.mean(cor(rlA, rlB), w=unlist(lapply(rlA, length)))
}


## correlation dist
corrDist <- function(covRles, outnames){
    print("calculating correlation")
	vl <- length(covRles)
    v <- 1:vl
    o <- matrix(NA,vl,vl)
    colnames(o) <- outnames
    rownames(o) <- outnames
    tri <- lower.tri(o, diag=FALSE)   
    indic <- expand.grid(v,v)
    pairs <- indic[tri,]
    corrs <- apply(pairs,1, function(ins){
        index1 <- ins[1]
        index2 <- ins[2]
        cor(covRles[[index1]], covRles[[index2]])
	   # correlate(covRles[[index1]], covRles[[index2]])
    })
    
    o[lower.tri(o, diag=FALSE)] <- corrs
    o[upper.tri(o)] <- t(o)[upper.tri(o)]
    diag(o) <- 1
    o
}



plotCorr <- function(corrMat, title){
    cor.d <- dist(corrMat)
    cor.row <- as.dendrogram( hclust( cor.d ))
    ord <- order.dendrogram(cor.row)
    levelplot(
       corrMat[ord,ord],
       scales=list(x=list(rot=90)),
       colorkey = list(space="left"),
       col.regions=BTY(100),
       xlab = "",
       ylab = "pearson correlation coefficient",
       legend = list(top=list( fun = dendrogramGrob, args= list(
                             x=cor.row,
                             ord = ord,
                             side = "top"
                             ))),
	   main=title	   
	   )

}

calcAndPlot <- function(coverages, outnames, mappable, outnamePDF, outnameMat, title){

    ca <- lapply(coverages, covWith0, mappable)
    
    cd <- corrDist(ca, outnames)
	write.table(cd, outnameMat, quote=FALSE, sep="\t", col.names=FALSE)
    pdf(outnamePDF)
	print(plotCorr(cd, title))
    dev.off()

}

getCoverage <- function(infile, format){
  print(paste("reading", infile, format))
  values <- if(format == "bam"){
     import(infile, format=format)
  }else{
     import(infile, format=format, asRangedData = FALSE)
  } 
  if(tolower(format) %in% c("bigwig","wig")){
     values
  }else{
     cov <- coverage(values)
     gcov <- as(cov, "GRanges")
     gcov[score(gcov) > 0]
  }
}


getCoverages <- function(infiles, formats){
   apply(data.frame(file=infiles,format=formats),1, function(row){ getCoverage(row['file'], row['format']) })
}

getMappable <- function(coverages, mappable){
    print(paste("creating mappability", length(coverages), mappable))
	if(is.na(mappable)){
	    createMappable(coverages)
    }else{
        import(mappable, format="bed", asRangedData = FALSE)
    }
}

#I hate this bioconductor ranges stuff more and more
#this is a super s**t api
mergeSeqInfo <- function(coverages){
  seqinfos <- do.call("rbind", lapply(coverages, function(cov){ d <- as.data.frame(cov@seqinfo); d$seqnames <- rownames(d); d}))
  seqinfosMax <- do.call("rbind", by(seqinfos, seqinfos$seqnames, function(byChr){ byChr[order(byChr$seqlengths, decreasing=TRUE),][1,] }))
  data.frame(seqnames=seqinfosMax$seqnames,seqlengths=seqinfosMax$seqlengths, isCircular=seqinfosMax$isCircular, genome=seqinfosMax$genome)
}


changeSeqInfo <- function(coverages){
  seqinfo <- mergeSeqInfo(coverages)
  lapply(coverages, function(cov){ 
	  si <- as.data.frame(cov@seqinfo)
      si <- data.frame(chr=rownames(si), si, stringsAsFactors=FALSE)
	  sis <- subset(seqinfo, seqnames %in% si$chr)  
	  sis <- sis[match(si$chr, sis$seqnames),]	  
	  si <- Seqinfo(seqnames=as.character(sis$seqnames), seqlengths=sis$seqlengths, isCircular=sis$isCircular, genome=sis$genome)   
	  cov@seqinfo <- si
	  cov
  })
}

args <- commandArgs(TRUE)

if(TEST){
  args[1] <- c("file1.bed,file2.bed,file3.bed,file4.bed")
  args[1] <- c("file1.bam,file2.bam,file3.bam,file4.bam")
  args[2] <- c("file1,file2,file3,file4")
  args[3] <- c("bam,bam,bam,bam")
  args[4] <- "out.pdf"
  args[5] <- "mat.mat"
  args[6] <- "title"
  args[7] <- NA
}

infiles <- unlist(strsplit(args[1], ","))
outnames <- unlist(strsplit(args[2], ","))
formats <- unlist(strsplit(args[3], ","))
outnamePDF <- args[4]
outnameMat <- args[5]
title <- args[6]
mappable <- args[7]

lp <- list(infiles,outnames,formats,outnamePDF,outnameMat,title,mappable)
lpp <- sapply(lp, paste, sep=" ")
print("parsed input parameters")
print(lpp)


coverages <- getCoverages(infiles, formats)
coverages <- changeSeqInfo(coverages)

mappability <- getMappable(coverages, mappable)


calcAndPlot(coverages, outnames, mappability, outnamePDF, outnameMat,title)