Mercurial > repos > pfrommolt > ngsrich
diff NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/NGSrich_0.5.5/R/summarize_enrichment.R Mon Nov 21 08:12:19 2011 -0500 @@ -0,0 +1,223 @@ +#!/usr/bin/Rscript + +infile=as.character(commandArgs()[6]) +outdir=as.character(commandArgs()[7]) +poor=as.numeric(commandArgs()[8]) +high=as.numeric(commandArgs()[9]) + + +outfile=paste(outdir,"/summary.html",sep="") +cutoff=c(poor,high) + +xmlfiles<-bedfiles<-character(0) +indirs<-as.character(readLines(infile)) +for(thisdir in indirs){ + thisdir=paste(thisdir,"/data",sep="") + xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE)) + bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE)) +} +nsamples<-length(indirs) + +samplenames<-character(0) +diffs<-matrix(nrow=5,ncol=nsamples) +averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0) +sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0) + + +##Read XML input files +xmltag<-function(line){ + return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) +} +i=1 +nextfold=0 +for(file in xmlfiles){ + numreads<-numeric(0) + inlines<-readLines(file) + for(line in inlines){ + if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))} + if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))} + if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))} + if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))} + if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))} + if(length(grep("<from1x>",line)>0)){nextfold=1} + else{ + if((nextfold==1) && length(grep("PercBases",line))>0){ + sample1<-c(sample1,as.numeric(xmltag(line))) + nextfold=0 + } + } + if(length(grep("<from5x>",line)>0)){nextfold=5} + else{ + if((nextfold==5) && length(grep("PercBases",line))>0){ + sample5<-c(sample5,as.numeric(xmltag(line))) + nextfold=0 + } + } + if(length(grep("<from10x>",line)>0)){nextfold=10} + else{ + if((nextfold==10) && length(grep("PercBases",line))>0){ + sample10<-c(sample10,as.numeric(xmltag(line))) + nextfold=0 + } + } + if(length(grep("<from20x>",line)>0)){nextfold=20} + else{ + if((nextfold==20) && length(grep("PercBases",line))>0){ + sample20<-c(sample20,as.numeric(xmltag(line))) + nextfold=0 + } + } + if(length(grep("<from30x>",line)>0)){nextfold=30} + else{ + if((nextfold==30) && length(grep("PercBases",line))>0){ + sample30<-c(sample30,as.numeric(xmltag(line))) + nextfold=0 + } + } + } + numreads_total<-c(numreads_total,numreads[1]) + numreads_target<-c(numreads_target,numreads[2]) + diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i]) + i=i+1 +} + + +##Create barplot +rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x") +plot_samplenames<-samplenames +s=0 +for(sname in plot_samplenames){ + s=s+1 + if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")} +} +colnames(diffs)<-plot_samplenames +png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400) +par(oma=c(1,0,0,1)) +barplot(diffs, + legend=TRUE, + col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"), + xlim=c(0,4+nsamples),ylim=c(0,100), + ylab="Percentage",las=2 ) #, +# args.legend=list(x=nsamples+4,y=80)) +dev.off() + + + +##Read BED files +sametarget=TRUE +thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE) +genes<-levels(as.factor(thisbed$V4)) +meancov<-matrix(nrow=length(genes),ncol=length(bedfiles)) +chr<-start<-end<-character(0) + +i=1 +for(file in bedfiles){ + thisbed<-read.table(file,stringsAsFactors=FALSE) + genes_thissample<-levels(as.factor(thisbed$V4)) + if(length(genes_thissample)!=length(genes)){sametarget=FALSE} + for(j in 1:length(genes)){ + thisgene<-thisbed[thisbed$V4==genes[j],] + chr[j]<-thisgene[1,1] + start[j]<-thisgene[1,2] + end[j]<-thisgene[1,3] + meancov[j,i]<-as.numeric(mean(thisgene[,5])) + } + i=i+1 +} + +meancov<-data.frame(chr,start,end,genes,meancov) +for(i in 1:nrow(meancov)){ + print(i) + meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4) + meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4) + meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4) + meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4) +} +meancov_sorted<-meancov[order(meancov$ave),] +poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],] + + +##Write HTML output +cat(file=outfile,paste( + "<html>\n", + "<head>\n", + "<title>Enrichment Performance</title>\n", + "<style type=\"text/css\">\n", + "body{font-family:sans-serif;}\n", + "h2,h3{color: darkblue;}\n", + "a{color: darkblue;}\n", + " table.output td{\n", + " padding: 4px; background-color: lightskyblue;\n", + " border: 1px solid #000; border-color: darkblue;\n", + "}\n", + "</Style>\n", + "</head>\n", + "<body>\n", + "<h2>Target Enrichment Performance</h2><br/>\n", + "<table class=\"output\">\n", + " <tr>\n", + " <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n", + "</tr>\n",sep="")) +for(i in 1:nsamples){ + cat(file=outfile,paste( + "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>", + "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>", + "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>", + "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>", + "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n", + sep=""),append=TRUE) +} +cat(file=outfile,paste( + "</table>\n", + "<br/><br/>\n", + "<h2>Target Base Percentage Covered</h2>\n", + "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE) +if(!sametarget){ + warning("Target regions are not the same for all samples.") +} else{ +cat(file=outfile,paste( + "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE) +if(nrow(poorly_all)==0){ +cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) +} else{ +cat(file=outfile,paste( + "<table class=\"output\">\n", + "<tr>\n", + "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) +for(i in 1:nrow(poorly_all)){ + cat(file=outfile,paste( + "<tr>\n", + "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n", + "<td>",poorly_all$genes[i],"</td>\n", + "<td>",poorly_all$ave[i],"</td>\n", + "<td>",poorly_all$stddev[i],"</td>\n", + "<tr>\n",sep=""),append=TRUE) +} +cat(file=outfile,"</table>\n",append=TRUE) +} + +meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered +highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],] +cat(file=outfile,paste( + "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE) +if(nrow(highly_all)==0){ +cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) +} else{ + cat(file=outfile,paste("<table class=\"output\">\n", + "<tr>\n", + "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) +for(i in 1:nrow(highly_all)){ + cat(file=outfile,paste( + "<tr>\n", + "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n", + "<td>",highly_all$genes[i],"</td>\n", + "<td>",highly_all$ave[i],"</td>\n", + "<td>",highly_all$stddev[i],"</td>\n", + "<tr>\n",sep=""),append=TRUE) +} +cat(file=outfile,"</table>\n",append=TRUE) +} + +} + +cat(file=outfile,"</body>\n</html>\n",append=TRUE)