diff NGSrich_0.5.5/R/eval_enrichment.R @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NGSrich_0.5.5/R/eval_enrichment.R	Mon Nov 21 08:12:19 2011 -0500
@@ -0,0 +1,401 @@
+#!/usr/bin/Rscript
+
+xmlfile=as.character(commandArgs()[6])
+bedfile=as.character(commandArgs()[7])
+outdir=as.character(commandArgs()[8])
+genome=as.character(commandArgs()[9])
+poor=as.numeric(commandArgs()[10])
+high=as.numeric(commandArgs()[11])
+samplename=as.character(commandArgs()[12])
+targetfile=as.character(commandArgs()[13])
+details=as.numeric(commandArgs()[14])
+cutoff=0
+sdetails=""
+#samplename0=strsplit(strsplit(xmlfile,"_")[[1]][1],"/")[[1]]
+#samplename<-samplename0[length(samplename0)]
+outfile=paste(outdir,"/",samplename,"_enrichment.html",sep="")
+
+##Read XML summary
+cat("Reading XML file ... ")
+xmltag<-function(line){
+	return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
+}
+xml<-readLines(xmlfile)
+numreads<-numeric(0)
+nextfold=0
+for(line in xml){
+  if(length(grep("ReadLength",line)>0)){
+    readlength<-as.numeric(xmltag(line))
+  }
+  if(length(grep("NumberReads",line)>0)){
+    numreads<-c(numreads,as.numeric(xmltag(line)))
+  }
+  if(length(grep("AvTargetCoverage",line)>0)){
+    averagecov=as.numeric(xmltag(line))
+  }
+  if(details==1){
+	sdetails="<a href=\"chromosomes/chromosomes.html\">[Show Details]</a><br/>"
+  }
+  if(length(grep("SDTargetCoverage",line)>0)){
+    stddevcov=as.numeric(xmltag(line))
+  }
+  if(length(grep("TargetSize",line)>0)){
+    targetsize=as.numeric(xmltag(line))
+  }
+  if(length(grep("<from1x>",line)>0)){nextfold=1}
+  else{
+    if((nextfold==1) && length(grep("PercBases",line))>0){
+      sample1<-as.numeric(xmltag(line))
+      nextfold=0
+    }
+  }
+  if(length(grep("<from5x>",line)>0)){nextfold=5}
+  else{
+    if((nextfold==5) && length(grep("PercBases",line))>0){
+      sample5<-as.numeric(xmltag(line))
+      nextfold=0
+    }
+  }
+  if(length(grep("<from10x>",line)>0)){nextfold=10}	
+  else{
+    if((nextfold==10) && length(grep("PercBases",line))>0){
+      sample10<-as.numeric(xmltag(line))
+      nextfold=0
+    }
+  }
+  if(length(grep("<from20x>",line)>0)){nextfold=20}
+  else{
+    if((nextfold==20) && length(grep("PercBases",line))>0){
+      sample20<-as.numeric(xmltag(line))
+      nextfold=0
+    }
+  }
+  if(length(grep("<from30x>",line)>0)){nextfold=30}
+  else{
+    if((nextfold==30) && length(grep("PercBases",line))>0){
+      
+      sample30<-as.numeric(xmltag(line))
+      nextfold=0
+    }
+  }
+}
+
+numreads_total<-numreads[1]
+numreads_target<-numreads[3]
+tpkm=round(numreads_target/((targetsize/1000)*(numreads_total/1000000)),2)
+
+cat("ready.\n")
+
+
+##Read BED enrichment file and summarize output
+cat("Reading BED file ... ")
+bed<-read.table(bedfile,stringsAsFactors=FALSE)
+area0_2<-sum(bed$V5<2)
+area2_10<-sum((bed$V5>=2) & (bed$V5<10))
+area10_20<-sum((bed$V5>=10) & (bed$V5<20))
+area20_30<-sum((bed$V5>=20) & (bed$V5<30))
+area30_50<-sum((bed$V5>=30) & (bed$V5<50))
+area50_100<-sum((bed$V5>=50) & (bed$V5<100))
+areagr100<-sum(bed$V5>100)
+cat("ready.\n")
+
+
+##Create pieplot
+cat("Creating coverage pieplot ... ")
+png(file=paste(outdir,"/plots/",samplename,"_pieplot.png",sep=""),width=580)
+par(mar=c(1,7,1,7))
+pie(c(area0_2,area2_10,area10_20,area20_30,area30_50,area50_100,areagr100),
+    labels=c(paste("0x to 2x (",area0_2,")",sep=""),
+      paste("2x to 10x (",area2_10,")",sep=""),
+      paste("10x to 20x (",area10_20,")",sep=""),
+      paste("20x to 30x (",area20_30,")",sep=""),
+      paste("30x to 50x (",area30_50,")",sep=""),
+      paste("50x to 100x (",area50_100,")",sep=""),
+      paste("above 100x (",areagr100,")",sep="")),
+    col=c("gray30","gray40","gray50","gray60","gray70","gray80","gray90"),main="") ##Mean Coverage of Target Regions")
+garbage<-dev.off()
+cat("ready.\n")
+
+
+cat("Preparing coverage barplots ... ")
+maxgenemean=0
+genes<-levels(as.factor(bed$V4))
+genes<-genes[genes!="unknown"]
+ngenes<-length(genes)
+
+for(i in 1:ngenes){
+  genemean<-mean(bed[bed$V4==genes[i],5])
+  if(maxgenemean<genemean){maxgenemean=genemean}
+}
+
+maxgenemean=maxgenemean-(maxgenemean%%100)+100
+if((cutoff>0) && (maxgenemean>cutoff)){
+  maxgenemean=cutoff
+}
+cat("ready.\n")
+
+fwidth<-function(genes){
+    fwidth0=10*length(genes)
+    if(length(genes)<=100){fwidth0=10*length(genes)}
+    if(length(genes)<=70){fwidth0=10*length(genes)}
+    if(length(genes)<=40){fwidth0=25*length(genes)}
+    if(length(genes)<=20){fwidth0=30*length(genes)}
+    if(length(genes)<=10){fwidth0=35*length(genes)}
+    if(length(genes)<=5){fwidth0=50*length(genes)}
+    return(fwidth0)
+}
+
+
+chromosomes<-levels(as.factor(bed$V1))
+if(length(genes)>=2000){
+  cat("Creating coverage barplots ... ")
+  for(chr in chromosomes){
+
+    chrbed<-bed[bed$V1==chr,]
+    genemean<-numeric(0)
+    genes<-levels(as.factor(chrbed$V4))
+    genes<-genes[genes!="unknown"]
+
+    for(i in 1:length(genes)){
+      if(cutoff>0){genemean[i]<-min(mean(chrbed[chrbed$V4==genes[i],5]),cutoff)}
+      else{genemean[i]<-mean(chrbed[chrbed$V4==genes[i],5])}
+    }
+
+    png(file=paste(outdir,"/plots/",samplename,"_target_coverage_",chr,".png",sep=""),width=fwidth(genes),height=450)
+    par(mar=c(7,5,1,2))
+    barplot(as.numeric(genemean),names.arg=genes,las=2,
+            ylab=paste("Average Coverage at Gene Locus (",chr,")",sep=""),col="tomato3",xlim=c(0.043*length(genemean),1.2*length(genes)-0.043*length(genemean)),ylim=c(0,maxgenemean))
+    lineh=0
+    while(lineh<maxgenemean){
+      lineh=lineh+100
+      lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray")
+    }
+    garbage<-dev.off()
+  }
+  cat("ready.\n")
+} else{
+  cat("Creating coverage barplot ... ")
+  for(i in 1:length(genes)){
+    if(cutoff>0){genemean[i]<-min(mean(bed[bed$V4==genes[i],5]),cutoff)}
+    else{genemean[i]<-mean(bed[bed$V4==genes[i],5])}
+  }
+
+
+  png(file=paste(outdir,"/plots/",samplename,"_target_coverage.png",sep=""),width=fwidth(genes),height=450)
+  par(mar=c(7,5,1,2))
+  barplot(as.numeric(genemean),names.arg=genes,las=2,
+          ylab="Average Coverage at Gene Locus",col="tomato3",xlim=c(0,1.2*ngenes),ylim=c(0,maxgenemean))
+  lineh=0
+  while(lineh<maxgenemean){
+    lineh=lineh+100
+    lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray")
+  }
+  garbage<-dev.off()
+  cat("ready.\n")
+}
+
+
+##Searching for poorly/highly covered genes
+cat("Searching for poorly (<",poor,"x) and highly (>",high,"x) covered genes ... ",sep="")
+genes<-levels(as.factor(bed$V4))
+chr<-start<-end<-gene1<-ave<-stddev<-gr1x<-gr5x<-gr10x<-gr20x<-gr30x<-numeric(0)
+for(gene in genes){
+
+  bed0<-bed[bed$V4==gene,]
+  len<-bed0[,3]-bed0[,2]
+  fold1<-fold5<-fold10<-fold20<-fold30<-numeric(0)
+ for(i in 1:nrow(bed0)){fold1<-c(fold1,as.numeric(bed0$V6[i]))}
+ for(i in 1:nrow(bed0)){fold5<-c(fold5,as.numeric(bed0$V7[i]))}
+ for(i in 1:nrow(bed0)){fold10<-c(fold10,as.numeric(bed0$V8[i]))}
+ for(i in 1:nrow(bed0)){fold20<-c(fold20,as.numeric(bed0$V9[i]))}
+
+ for(i in 1:nrow(bed0)){fold30<-c(fold30,as.numeric(bed0$V10[i]))}
+  fold1_gene<-paste(round(sum(fold1*len)/sum(len),2),"%",sep="")
+  fold5_gene<-paste(round(sum(fold5*len)/sum(len),2),"%",sep="")
+  fold10_gene<-paste(round(sum(fold10*len)/sum(len),2),"%",sep="")
+  fold20_gene<-paste(round(sum(fold20*len)/sum(len),2),"%",sep="")
+  fold30_gene<-paste(round(sum(fold30*len)/sum(len),2),"%",sep="")
+  
+  chr<-c(chr,bed0[1,1])
+  start<-c(start,min(bed0[,2]))
+  end<-c(end,max(bed0[,3]))
+  gene1<-c(gene1,gene)
+  ave<-c(ave,round(mean(bed0[,5]),2))
+  if(is.na(sd(bed0[,5]))){
+    stddev<-c(stddev,0)
+  }
+  else{
+    stddev<-c(stddev,round(sd(bed0[,5]),2))
+  }
+  
+  gr1x<-c(gr1x,fold1_gene)
+  gr5x<-c(gr5x,fold5_gene)
+  gr10x<-c(gr10x,fold10_gene)
+  gr20x<-c(gr20x,fold20_gene)
+  gr30x<-c(gr30x,fold30_gene)
+  
+}
+
+bed_gene<-data.frame(chr,start,end,gene1,ave,stddev,gr1x,gr5x,gr10x,gr20x,gr30x)
+
+poorly_covered<-bed_gene[bed_gene$ave<poor,]
+poorly_covered<-poorly_covered[order(poorly_covered[,5],decreasing=FALSE),]
+highly_covered<-bed_gene[bed_gene$ave>high,]
+highly_covered<-highly_covered[order(highly_covered[,5],decreasing=TRUE),]
+
+linkpoor<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=",
+                poorly_covered$chr,"%3A",poorly_covered$start,"-",
+                poorly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="")
+linkhigh<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=",
+                highly_covered$chr,"%3A",highly_covered$start,"-",
+                highly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="")
+cat("ready.\n",sep="")
+
+
+##Output HTML document
+cat("Writing HTML output ... ");
+
+cat(file=outfile,paste(
+      "<html>\n",
+      "<head>\n",
+      "<title>Enrichment Performance</title>\n",
+      "<style type=\"text/css\">\n",
+      "  body{font-family:sans-serif;}\n",
+      "  h2,h3{color: darkblue;}\n",
+      "  a{color:darkblue;}\n",
+      "  table.output td{",
+      "    padding: 4px; background-color: lightskyblue;",
+      "    border: 1px solid #000; border-color: darkblue;",
+      "  }\n",
+      "</style>\n",
+      "\n",
+      "<script language=\"JavaScript\">\n",
+      "  var questionClass=\"chrView\";\n",
+      "  function collapseAll(){\n",
+      "    var allSections = document.getElementsByTagName(\"div\");\n",
+      "    for(i=0; i<allSections.length; i++){\n",
+      "      if(allSections[i].className==questionClass){\n",
+      "        allSections[i].style.display=\"none\";\n",
+      "      }\n",
+      "    }\n",
+      "  }\n",
+      "  function expand(name){\n",
+      "    collapseAll();\n",
+      "    var newStyle=\"\";\n",
+      "    if(document.getElementById(name).style.display!=\"block\"){\n",
+      "      newStyle=\"block\";\n",
+      "    }\n",
+      "    else{\n",
+      "      newStyle=\"none\";\n",
+      "    }\n",
+      "    document.getElementById(name).style.display=newStyle;\n",
+      "  }\n",
+      "</script>\n",      
+      "</head>\n",
+      "\n",
+      "<body onload=\"expand('",chromosomes[1],"')\">",
+      "<h2>Enrichment Performance of Sample ",samplename,"</h2>\n",
+      "<table>\n",
+      "<tr>\n",
+      "<td>\n",
+      "<h3>Summary Statistics</h3>\n",
+      "<table class=\"output\">\n",
+      "  <tr><td><b># Reads</b></td><td>",numreads_total,"</td></tr>\n",
+      "  <tr><td><b># On Target &plusmn 100 bp</b></td><td>",numreads_target,"</td></tr>\n",
+      "  <tr><td><b>Target Size (bp)</b><td>",targetsize,"</td></tr>\n",
+      "  <tr><td><b># Target Regions</b><td>",nrow(bed),"</td></tr>\n",
+      "  <tr><td><b>Coverage Mean</b></td><td>",averagecov,"</td></tr>\n",
+      "  <tr><td><b>Coverage Std Dev</b></td><td>",stddevcov,"</td></tr>\n",
+      "  <tr><td><b>Covered 1x</b></td><td>",sample1,"</td></tr>\n",
+      "  <tr><td><b>Covered 5x</b></td><td>",sample5,"</td></tr>\n",
+      "  <tr><td><b>Covered 10x</b></td><td>",sample10,"</td></tr>\n",
+      "  <tr><td><b>Covered 20x</b></td><td>",sample20,"</td></tr>\n",
+      "  <tr><td><b>Covered 30x</b></td><td>",sample30,"</td></tr>\n",
+      "  <tr><td><b>TPKM</b></td><td>",tpkm,"</td></tr>\n",
+      "</table>\n",
+      "</td>\n",
+      "<td width=\"10%\"></td>",
+      "<td>\n",
+      "<img src=\"plots/",samplename,"_pieplot.png\"></img>\n",
+      "</td>\n",
+      "</tr>\n",
+      "</table>\n",
+      "<br/>\n",
+      "<h2>Genewise Target Coverage</h2>",sdetails,"<br/>\n",sep=""))
+      
+
+if(ngenes>=2000){
+for(chromosome in chromosomes){
+  cat(file=outfile,paste("<a href=\"javascript:expand('",chromosome,"')\">",chromosome,"</a>\n",sep=""),append=TRUE)
+}
+for(chromosome in chromosomes){
+  cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\" class=\"chrView\" id=\"",chromosome,"\"><img src=\"plots/",samplename,"_target_coverage_",chromosome,".png\"></img></div>\n",sep=""),append=TRUE)
+}
+} else{
+  cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\"><img src=\"plots/",samplename,"_target_coverage.png\"></img></div>\n",sep=""),append=TRUE)
+}
+
+cat(file=outfile,paste("<br/><br/>\n",
+      "<h2>Poorly Covered Genes (Cutoff: ",poor,"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(poorly_covered)==0){
+	cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+cat(file=outfile,paste(
+      "<table class=\"output\">\n",
+      "  <tr>\n",
+      "    <td><b>Region</b></td><td><b>Gene</b></td>\n",
+      "    <td><b>Coverage Mean</b></td>\n",
+      "    <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td>\n",
+      "    <td><b>Covered 10x</b></td><td><b>Covered 20x</b></td>\n",
+      "    <td><b>Covered 30x</b></td><td><b>External Link</b></td>\n",
+      "  </tr>\n",sep=""),append=TRUE)
+
+for(i in 1:nrow(poorly_covered)){
+cat(file=outfile,paste(
+      "  <tr>\n",
+      "    <td>",poorly_covered[i,1],":",poorly_covered[i,2],"-",poorly_covered[i,3],"</td>\n",
+      "    <td>",poorly_covered[i,4],"</td>\n",
+      "    <td>",poorly_covered[i,5],"</td>\n",
+      "    <td>",poorly_covered[i,7],"</td>\n",
+      "    <td>",poorly_covered[i,8],"</td>\n",
+      "    <td>",poorly_covered[i,9],"</td>\n",
+      "    <td>",poorly_covered[i,10],"</td>\n",
+      "    <td>",poorly_covered[i,11],"</td>\n",
+      "    <td><a href=\"",linkpoor[i],"\">Show in Genome Browser</a></td>\n",
+      "  </tr>\n",sep=""),append=TRUE)
+}
+
+cat(file=outfile,"</table><br/>\n",append=TRUE)
+}
+
+cat(file=outfile,paste("<h2>Highly Covered Genes (Cutoff: ",high,"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(highly_covered)==0){
+ cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+cat(file=outfile,paste(
+      "<table class=\"output\">\n",
+      "  <tr>\n",
+      "    <td><b>Target</b></td><td><b>Gene</b></td><td><b>Coverage Mean</b></td>",
+      "    <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td>",
+      "    <td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>External Link</b></td>\n",
+      "  </tr>\n",sep=""),append=TRUE)
+for(i in 1:nrow(highly_covered)){
+cat(file=outfile,paste("  <tr>\n",
+      "    <td>",highly_covered[i,1],":",highly_covered[i,2],"-",highly_covered[i,3],"</td>\n",
+      "    <td>",highly_covered[i,4],"</td>\n",
+      "    <td>",highly_covered[i,5],"</td>\n",
+      "    <td>",highly_covered[i,7],"</td>\n",
+      "    <td>",highly_covered[i,8],"</td>\n",
+      "    <td>",highly_covered[i,9],"</td>\n",
+      "    <td>",highly_covered[i,10],"</td>\n",
+      "    <td>",highly_covered[i,11],"</td>\n",
+      "    <td><a href=\"",linkhigh[i],"\">Show in Genome Browser</a></td>\n",
+      "  </tr>",sep=""),append=TRUE)
+}
+
+cat(file=outfile,"</table>\n",append=TRUE)
+}
+cat(file=outfile,"<br/>BED file used for specification of target regions:<br/>",targetfile,append=TRUE)
+cat(file=outfile,"</body>\n</html>\n",append=TRUE)
+
+cat("ready.\n")