Mercurial > repos > pfrommolt > ngsrich
diff NGSrich_0.5.5/R/eval_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NGSrich_0.5.5/R/eval_enrichment.R Mon Nov 21 08:12:19 2011 -0500 @@ -0,0 +1,401 @@ +#!/usr/bin/Rscript + +xmlfile=as.character(commandArgs()[6]) +bedfile=as.character(commandArgs()[7]) +outdir=as.character(commandArgs()[8]) +genome=as.character(commandArgs()[9]) +poor=as.numeric(commandArgs()[10]) +high=as.numeric(commandArgs()[11]) +samplename=as.character(commandArgs()[12]) +targetfile=as.character(commandArgs()[13]) +details=as.numeric(commandArgs()[14]) +cutoff=0 +sdetails="" +#samplename0=strsplit(strsplit(xmlfile,"_")[[1]][1],"/")[[1]] +#samplename<-samplename0[length(samplename0)] +outfile=paste(outdir,"/",samplename,"_enrichment.html",sep="") + +##Read XML summary +cat("Reading XML file ... ") +xmltag<-function(line){ + return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) +} +xml<-readLines(xmlfile) +numreads<-numeric(0) +nextfold=0 +for(line in xml){ + if(length(grep("ReadLength",line)>0)){ + readlength<-as.numeric(xmltag(line)) + } + if(length(grep("NumberReads",line)>0)){ + numreads<-c(numreads,as.numeric(xmltag(line))) + } + if(length(grep("AvTargetCoverage",line)>0)){ + averagecov=as.numeric(xmltag(line)) + } + if(details==1){ + sdetails="<a href=\"chromosomes/chromosomes.html\">[Show Details]</a><br/>" + } + if(length(grep("SDTargetCoverage",line)>0)){ + stddevcov=as.numeric(xmltag(line)) + } + if(length(grep("TargetSize",line)>0)){ + targetsize=as.numeric(xmltag(line)) + } + if(length(grep("<from1x>",line)>0)){nextfold=1} + else{ + if((nextfold==1) && length(grep("PercBases",line))>0){ + sample1<-as.numeric(xmltag(line)) + nextfold=0 + } + } + if(length(grep("<from5x>",line)>0)){nextfold=5} + else{ + if((nextfold==5) && length(grep("PercBases",line))>0){ + sample5<-as.numeric(xmltag(line)) + nextfold=0 + } + } + if(length(grep("<from10x>",line)>0)){nextfold=10} + else{ + if((nextfold==10) && length(grep("PercBases",line))>0){ + sample10<-as.numeric(xmltag(line)) + nextfold=0 + } + } + if(length(grep("<from20x>",line)>0)){nextfold=20} + else{ + if((nextfold==20) && length(grep("PercBases",line))>0){ + sample20<-as.numeric(xmltag(line)) + nextfold=0 + } + } + if(length(grep("<from30x>",line)>0)){nextfold=30} + else{ + if((nextfold==30) && length(grep("PercBases",line))>0){ + + sample30<-as.numeric(xmltag(line)) + nextfold=0 + } + } +} + +numreads_total<-numreads[1] +numreads_target<-numreads[3] +tpkm=round(numreads_target/((targetsize/1000)*(numreads_total/1000000)),2) + +cat("ready.\n") + + +##Read BED enrichment file and summarize output +cat("Reading BED file ... ") +bed<-read.table(bedfile,stringsAsFactors=FALSE) +area0_2<-sum(bed$V5<2) +area2_10<-sum((bed$V5>=2) & (bed$V5<10)) +area10_20<-sum((bed$V5>=10) & (bed$V5<20)) +area20_30<-sum((bed$V5>=20) & (bed$V5<30)) +area30_50<-sum((bed$V5>=30) & (bed$V5<50)) +area50_100<-sum((bed$V5>=50) & (bed$V5<100)) +areagr100<-sum(bed$V5>100) +cat("ready.\n") + + +##Create pieplot +cat("Creating coverage pieplot ... ") +png(file=paste(outdir,"/plots/",samplename,"_pieplot.png",sep=""),width=580) +par(mar=c(1,7,1,7)) +pie(c(area0_2,area2_10,area10_20,area20_30,area30_50,area50_100,areagr100), + labels=c(paste("0x to 2x (",area0_2,")",sep=""), + paste("2x to 10x (",area2_10,")",sep=""), + paste("10x to 20x (",area10_20,")",sep=""), + paste("20x to 30x (",area20_30,")",sep=""), + paste("30x to 50x (",area30_50,")",sep=""), + paste("50x to 100x (",area50_100,")",sep=""), + paste("above 100x (",areagr100,")",sep="")), + col=c("gray30","gray40","gray50","gray60","gray70","gray80","gray90"),main="") ##Mean Coverage of Target Regions") +garbage<-dev.off() +cat("ready.\n") + + +cat("Preparing coverage barplots ... ") +maxgenemean=0 +genes<-levels(as.factor(bed$V4)) +genes<-genes[genes!="unknown"] +ngenes<-length(genes) + +for(i in 1:ngenes){ + genemean<-mean(bed[bed$V4==genes[i],5]) + if(maxgenemean<genemean){maxgenemean=genemean} +} + +maxgenemean=maxgenemean-(maxgenemean%%100)+100 +if((cutoff>0) && (maxgenemean>cutoff)){ + maxgenemean=cutoff +} +cat("ready.\n") + +fwidth<-function(genes){ + fwidth0=10*length(genes) + if(length(genes)<=100){fwidth0=10*length(genes)} + if(length(genes)<=70){fwidth0=10*length(genes)} + if(length(genes)<=40){fwidth0=25*length(genes)} + if(length(genes)<=20){fwidth0=30*length(genes)} + if(length(genes)<=10){fwidth0=35*length(genes)} + if(length(genes)<=5){fwidth0=50*length(genes)} + return(fwidth0) +} + + +chromosomes<-levels(as.factor(bed$V1)) +if(length(genes)>=2000){ + cat("Creating coverage barplots ... ") + for(chr in chromosomes){ + + chrbed<-bed[bed$V1==chr,] + genemean<-numeric(0) + genes<-levels(as.factor(chrbed$V4)) + genes<-genes[genes!="unknown"] + + for(i in 1:length(genes)){ + if(cutoff>0){genemean[i]<-min(mean(chrbed[chrbed$V4==genes[i],5]),cutoff)} + else{genemean[i]<-mean(chrbed[chrbed$V4==genes[i],5])} + } + + png(file=paste(outdir,"/plots/",samplename,"_target_coverage_",chr,".png",sep=""),width=fwidth(genes),height=450) + par(mar=c(7,5,1,2)) + barplot(as.numeric(genemean),names.arg=genes,las=2, + ylab=paste("Average Coverage at Gene Locus (",chr,")",sep=""),col="tomato3",xlim=c(0.043*length(genemean),1.2*length(genes)-0.043*length(genemean)),ylim=c(0,maxgenemean)) + lineh=0 + while(lineh<maxgenemean){ + lineh=lineh+100 + lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray") + } + garbage<-dev.off() + } + cat("ready.\n") +} else{ + cat("Creating coverage barplot ... ") + for(i in 1:length(genes)){ + if(cutoff>0){genemean[i]<-min(mean(bed[bed$V4==genes[i],5]),cutoff)} + else{genemean[i]<-mean(bed[bed$V4==genes[i],5])} + } + + + png(file=paste(outdir,"/plots/",samplename,"_target_coverage.png",sep=""),width=fwidth(genes),height=450) + par(mar=c(7,5,1,2)) + barplot(as.numeric(genemean),names.arg=genes,las=2, + ylab="Average Coverage at Gene Locus",col="tomato3",xlim=c(0,1.2*ngenes),ylim=c(0,maxgenemean)) + lineh=0 + while(lineh<maxgenemean){ + lineh=lineh+100 + lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray") + } + garbage<-dev.off() + cat("ready.\n") +} + + +##Searching for poorly/highly covered genes +cat("Searching for poorly (<",poor,"x) and highly (>",high,"x) covered genes ... ",sep="") +genes<-levels(as.factor(bed$V4)) +chr<-start<-end<-gene1<-ave<-stddev<-gr1x<-gr5x<-gr10x<-gr20x<-gr30x<-numeric(0) +for(gene in genes){ + + bed0<-bed[bed$V4==gene,] + len<-bed0[,3]-bed0[,2] + fold1<-fold5<-fold10<-fold20<-fold30<-numeric(0) + for(i in 1:nrow(bed0)){fold1<-c(fold1,as.numeric(bed0$V6[i]))} + for(i in 1:nrow(bed0)){fold5<-c(fold5,as.numeric(bed0$V7[i]))} + for(i in 1:nrow(bed0)){fold10<-c(fold10,as.numeric(bed0$V8[i]))} + for(i in 1:nrow(bed0)){fold20<-c(fold20,as.numeric(bed0$V9[i]))} + + for(i in 1:nrow(bed0)){fold30<-c(fold30,as.numeric(bed0$V10[i]))} + fold1_gene<-paste(round(sum(fold1*len)/sum(len),2),"%",sep="") + fold5_gene<-paste(round(sum(fold5*len)/sum(len),2),"%",sep="") + fold10_gene<-paste(round(sum(fold10*len)/sum(len),2),"%",sep="") + fold20_gene<-paste(round(sum(fold20*len)/sum(len),2),"%",sep="") + fold30_gene<-paste(round(sum(fold30*len)/sum(len),2),"%",sep="") + + chr<-c(chr,bed0[1,1]) + start<-c(start,min(bed0[,2])) + end<-c(end,max(bed0[,3])) + gene1<-c(gene1,gene) + ave<-c(ave,round(mean(bed0[,5]),2)) + if(is.na(sd(bed0[,5]))){ + stddev<-c(stddev,0) + } + else{ + stddev<-c(stddev,round(sd(bed0[,5]),2)) + } + + gr1x<-c(gr1x,fold1_gene) + gr5x<-c(gr5x,fold5_gene) + gr10x<-c(gr10x,fold10_gene) + gr20x<-c(gr20x,fold20_gene) + gr30x<-c(gr30x,fold30_gene) + +} + +bed_gene<-data.frame(chr,start,end,gene1,ave,stddev,gr1x,gr5x,gr10x,gr20x,gr30x) + +poorly_covered<-bed_gene[bed_gene$ave<poor,] +poorly_covered<-poorly_covered[order(poorly_covered[,5],decreasing=FALSE),] +highly_covered<-bed_gene[bed_gene$ave>high,] +highly_covered<-highly_covered[order(highly_covered[,5],decreasing=TRUE),] + +linkpoor<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=", + poorly_covered$chr,"%3A",poorly_covered$start,"-", + poorly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="") +linkhigh<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=", + highly_covered$chr,"%3A",highly_covered$start,"-", + highly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="") +cat("ready.\n",sep="") + + +##Output HTML document +cat("Writing HTML output ... "); + +cat(file=outfile,paste( + "<html>\n", + "<head>\n", + "<title>Enrichment Performance</title>\n", + "<style type=\"text/css\">\n", + " body{font-family:sans-serif;}\n", + " h2,h3{color: darkblue;}\n", + " a{color:darkblue;}\n", + " table.output td{", + " padding: 4px; background-color: lightskyblue;", + " border: 1px solid #000; border-color: darkblue;", + " }\n", + "</style>\n", + "\n", + "<script language=\"JavaScript\">\n", + " var questionClass=\"chrView\";\n", + " function collapseAll(){\n", + " var allSections = document.getElementsByTagName(\"div\");\n", + " for(i=0; i<allSections.length; i++){\n", + " if(allSections[i].className==questionClass){\n", + " allSections[i].style.display=\"none\";\n", + " }\n", + " }\n", + " }\n", + " function expand(name){\n", + " collapseAll();\n", + " var newStyle=\"\";\n", + " if(document.getElementById(name).style.display!=\"block\"){\n", + " newStyle=\"block\";\n", + " }\n", + " else{\n", + " newStyle=\"none\";\n", + " }\n", + " document.getElementById(name).style.display=newStyle;\n", + " }\n", + "</script>\n", + "</head>\n", + "\n", + "<body onload=\"expand('",chromosomes[1],"')\">", + "<h2>Enrichment Performance of Sample ",samplename,"</h2>\n", + "<table>\n", + "<tr>\n", + "<td>\n", + "<h3>Summary Statistics</h3>\n", + "<table class=\"output\">\n", + " <tr><td><b># Reads</b></td><td>",numreads_total,"</td></tr>\n", + " <tr><td><b># On Target ± 100 bp</b></td><td>",numreads_target,"</td></tr>\n", + " <tr><td><b>Target Size (bp)</b><td>",targetsize,"</td></tr>\n", + " <tr><td><b># Target Regions</b><td>",nrow(bed),"</td></tr>\n", + " <tr><td><b>Coverage Mean</b></td><td>",averagecov,"</td></tr>\n", + " <tr><td><b>Coverage Std Dev</b></td><td>",stddevcov,"</td></tr>\n", + " <tr><td><b>Covered 1x</b></td><td>",sample1,"</td></tr>\n", + " <tr><td><b>Covered 5x</b></td><td>",sample5,"</td></tr>\n", + " <tr><td><b>Covered 10x</b></td><td>",sample10,"</td></tr>\n", + " <tr><td><b>Covered 20x</b></td><td>",sample20,"</td></tr>\n", + " <tr><td><b>Covered 30x</b></td><td>",sample30,"</td></tr>\n", + " <tr><td><b>TPKM</b></td><td>",tpkm,"</td></tr>\n", + "</table>\n", + "</td>\n", + "<td width=\"10%\"></td>", + "<td>\n", + "<img src=\"plots/",samplename,"_pieplot.png\"></img>\n", + "</td>\n", + "</tr>\n", + "</table>\n", + "<br/>\n", + "<h2>Genewise Target Coverage</h2>",sdetails,"<br/>\n",sep="")) + + +if(ngenes>=2000){ +for(chromosome in chromosomes){ + cat(file=outfile,paste("<a href=\"javascript:expand('",chromosome,"')\">",chromosome,"</a>\n",sep=""),append=TRUE) +} +for(chromosome in chromosomes){ + cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\" class=\"chrView\" id=\"",chromosome,"\"><img src=\"plots/",samplename,"_target_coverage_",chromosome,".png\"></img></div>\n",sep=""),append=TRUE) +} +} else{ + cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\"><img src=\"plots/",samplename,"_target_coverage.png\"></img></div>\n",sep=""),append=TRUE) +} + +cat(file=outfile,paste("<br/><br/>\n", + "<h2>Poorly Covered Genes (Cutoff: ",poor,"x)</h2>\n",sep=""),append=TRUE) +if(nrow(poorly_covered)==0){ + cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) +} else{ +cat(file=outfile,paste( + "<table class=\"output\">\n", + " <tr>\n", + " <td><b>Region</b></td><td><b>Gene</b></td>\n", + " <td><b>Coverage Mean</b></td>\n", + " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td>\n", + " <td><b>Covered 10x</b></td><td><b>Covered 20x</b></td>\n", + " <td><b>Covered 30x</b></td><td><b>External Link</b></td>\n", + " </tr>\n",sep=""),append=TRUE) + +for(i in 1:nrow(poorly_covered)){ +cat(file=outfile,paste( + " <tr>\n", + " <td>",poorly_covered[i,1],":",poorly_covered[i,2],"-",poorly_covered[i,3],"</td>\n", + " <td>",poorly_covered[i,4],"</td>\n", + " <td>",poorly_covered[i,5],"</td>\n", + " <td>",poorly_covered[i,7],"</td>\n", + " <td>",poorly_covered[i,8],"</td>\n", + " <td>",poorly_covered[i,9],"</td>\n", + " <td>",poorly_covered[i,10],"</td>\n", + " <td>",poorly_covered[i,11],"</td>\n", + " <td><a href=\"",linkpoor[i],"\">Show in Genome Browser</a></td>\n", + " </tr>\n",sep=""),append=TRUE) +} + +cat(file=outfile,"</table><br/>\n",append=TRUE) +} + +cat(file=outfile,paste("<h2>Highly Covered Genes (Cutoff: ",high,"x)</h2>\n",sep=""),append=TRUE) +if(nrow(highly_covered)==0){ + cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) +} else{ +cat(file=outfile,paste( + "<table class=\"output\">\n", + " <tr>\n", + " <td><b>Target</b></td><td><b>Gene</b></td><td><b>Coverage Mean</b></td>", + " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td>", + " <td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>External Link</b></td>\n", + " </tr>\n",sep=""),append=TRUE) +for(i in 1:nrow(highly_covered)){ +cat(file=outfile,paste(" <tr>\n", + " <td>",highly_covered[i,1],":",highly_covered[i,2],"-",highly_covered[i,3],"</td>\n", + " <td>",highly_covered[i,4],"</td>\n", + " <td>",highly_covered[i,5],"</td>\n", + " <td>",highly_covered[i,7],"</td>\n", + " <td>",highly_covered[i,8],"</td>\n", + " <td>",highly_covered[i,9],"</td>\n", + " <td>",highly_covered[i,10],"</td>\n", + " <td>",highly_covered[i,11],"</td>\n", + " <td><a href=\"",linkhigh[i],"\">Show in Genome Browser</a></td>\n", + " </tr>",sep=""),append=TRUE) +} + +cat(file=outfile,"</table>\n",append=TRUE) +} +cat(file=outfile,"<br/>BED file used for specification of target regions:<br/>",targetfile,append=TRUE) +cat(file=outfile,"</body>\n</html>\n",append=TRUE) + +cat("ready.\n")