diff NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/NGSrich_0.5.5/R/summarize_enrichment.R	Mon Nov 21 08:12:19 2011 -0500
@@ -0,0 +1,223 @@
+#!/usr/bin/Rscript
+
+infile=as.character(commandArgs()[6])
+outdir=as.character(commandArgs()[7])
+poor=as.numeric(commandArgs()[8])
+high=as.numeric(commandArgs()[9])
+
+
+outfile=paste(outdir,"/summary.html",sep="")
+cutoff=c(poor,high)
+
+xmlfiles<-bedfiles<-character(0)
+indirs<-as.character(readLines(infile))
+for(thisdir in indirs){
+  thisdir=paste(thisdir,"/data",sep="")
+  xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE))
+  bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE))
+}
+nsamples<-length(indirs)
+
+samplenames<-character(0)
+diffs<-matrix(nrow=5,ncol=nsamples)
+averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0)
+sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0)
+
+
+##Read XML input files
+xmltag<-function(line){
+  return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
+}
+i=1
+nextfold=0
+for(file in xmlfiles){
+  numreads<-numeric(0)
+  inlines<-readLines(file)
+  for(line in inlines){
+    if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))}
+    if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))}
+    if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))}
+    if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))}
+    if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))}
+    if(length(grep("<from1x>",line)>0)){nextfold=1}
+    else{
+      if((nextfold==1) && length(grep("PercBases",line))>0){
+        sample1<-c(sample1,as.numeric(xmltag(line)))
+        nextfold=0
+      }
+    }
+    if(length(grep("<from5x>",line)>0)){nextfold=5}
+    else{
+      if((nextfold==5) && length(grep("PercBases",line))>0){
+        sample5<-c(sample5,as.numeric(xmltag(line)))
+        nextfold=0
+      }
+    }
+    if(length(grep("<from10x>",line)>0)){nextfold=10}	
+    else{
+      if((nextfold==10) && length(grep("PercBases",line))>0){
+        sample10<-c(sample10,as.numeric(xmltag(line)))
+        nextfold=0
+      }
+    }
+    if(length(grep("<from20x>",line)>0)){nextfold=20}
+    else{
+      if((nextfold==20) && length(grep("PercBases",line))>0){
+        sample20<-c(sample20,as.numeric(xmltag(line)))
+        nextfold=0
+      }
+    }
+    if(length(grep("<from30x>",line)>0)){nextfold=30}
+    else{
+      if((nextfold==30) && length(grep("PercBases",line))>0){
+        sample30<-c(sample30,as.numeric(xmltag(line)))
+        nextfold=0
+      }
+    }
+  }
+  numreads_total<-c(numreads_total,numreads[1])
+  numreads_target<-c(numreads_target,numreads[2])
+  diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i])
+  i=i+1
+}
+
+
+##Create barplot
+rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x")
+plot_samplenames<-samplenames
+s=0
+for(sname in plot_samplenames){
+  s=s+1
+  if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")}
+}
+colnames(diffs)<-plot_samplenames
+png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400)
+par(oma=c(1,0,0,1))
+barplot(diffs,
+       	legend=TRUE,
+        col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"),
+        xlim=c(0,4+nsamples),ylim=c(0,100),
+        ylab="Percentage",las=2 ) #,
+#        args.legend=list(x=nsamples+4,y=80))
+dev.off()
+
+
+
+##Read BED files
+sametarget=TRUE
+thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE)
+genes<-levels(as.factor(thisbed$V4))
+meancov<-matrix(nrow=length(genes),ncol=length(bedfiles))
+chr<-start<-end<-character(0)
+
+i=1
+for(file in bedfiles){
+  thisbed<-read.table(file,stringsAsFactors=FALSE)
+  genes_thissample<-levels(as.factor(thisbed$V4))
+  if(length(genes_thissample)!=length(genes)){sametarget=FALSE}
+  for(j in 1:length(genes)){
+    thisgene<-thisbed[thisbed$V4==genes[j],]
+    chr[j]<-thisgene[1,1]
+    start[j]<-thisgene[1,2]
+    end[j]<-thisgene[1,3]
+    meancov[j,i]<-as.numeric(mean(thisgene[,5]))
+  }
+  i=i+1
+}
+
+meancov<-data.frame(chr,start,end,genes,meancov)
+for(i in 1:nrow(meancov)){
+  print(i)
+  meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4)
+  meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4)
+  meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4)
+  meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4)
+}
+meancov_sorted<-meancov[order(meancov$ave),]
+poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],]
+
+
+##Write HTML output
+cat(file=outfile,paste(
+      "<html>\n",
+      "<head>\n",
+      "<title>Enrichment Performance</title>\n",
+      "<style type=\"text/css\">\n",
+      "body{font-family:sans-serif;}\n",
+      "h2,h3{color: darkblue;}\n",
+      "a{color: darkblue;}\n",
+      "   table.output td{\n",
+      "   padding: 4px; background-color: lightskyblue;\n",
+      "   border: 1px solid #000; border-color: darkblue;\n",
+      "}\n",
+      "</Style>\n",
+      "</head>\n",
+      "<body>\n",
+      "<h2>Target Enrichment Performance</h2><br/>\n",
+      "<table class=\"output\">\n",
+      "  <tr>\n",
+      "  <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n",
+      "</tr>\n",sep=""))
+for(i in 1:nsamples){
+  cat(file=outfile,paste(
+        "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>",
+        "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>",
+        "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>",
+        "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>",
+        "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n",
+        sep=""),append=TRUE)
+}
+cat(file=outfile,paste(
+      "</table>\n",
+      "<br/><br/>\n",
+      "<h2>Target Base Percentage Covered</h2>\n",
+      "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE)
+if(!sametarget){
+	warning("Target regions are not the same for all samples.")
+} else{
+cat(file=outfile,paste(
+      "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(poorly_all)==0){
+cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+cat(file=outfile,paste(
+      "<table class=\"output\">\n",
+      "<tr>\n",
+      "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
+for(i in 1:nrow(poorly_all)){
+  cat(file=outfile,paste(
+        "<tr>\n",
+        "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n",
+        "<td>",poorly_all$genes[i],"</td>\n",
+        "<td>",poorly_all$ave[i],"</td>\n",
+        "<td>",poorly_all$stddev[i],"</td>\n",
+        "<tr>\n",sep=""),append=TRUE)
+}
+cat(file=outfile,"</table>\n",append=TRUE)
+}
+
+meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered
+highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],]
+cat(file=outfile,paste(
+      "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE)
+if(nrow(highly_all)==0){
+cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
+} else{
+	cat(file=outfile,paste("<table class=\"output\">\n",
+      "<tr>\n",
+      "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
+for(i in 1:nrow(highly_all)){
+  cat(file=outfile,paste(
+        "<tr>\n",
+        "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n",
+        "<td>",highly_all$genes[i],"</td>\n",
+        "<td>",highly_all$ave[i],"</td>\n",
+        "<td>",highly_all$stddev[i],"</td>\n",
+        "<tr>\n",sep=""),append=TRUE)
+}
+cat(file=outfile,"</table>\n",append=TRUE)
+}
+
+}
+
+cat(file=outfile,"</body>\n</html>\n",append=TRUE)