Mercurial > repos > pfrommolt > ngsrich
view NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/Rscript infile=as.character(commandArgs()[6]) outdir=as.character(commandArgs()[7]) poor=as.numeric(commandArgs()[8]) high=as.numeric(commandArgs()[9]) outfile=paste(outdir,"/summary.html",sep="") cutoff=c(poor,high) xmlfiles<-bedfiles<-character(0) indirs<-as.character(readLines(infile)) for(thisdir in indirs){ thisdir=paste(thisdir,"/data",sep="") xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE)) bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE)) } nsamples<-length(indirs) samplenames<-character(0) diffs<-matrix(nrow=5,ncol=nsamples) averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0) sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0) ##Read XML input files xmltag<-function(line){ return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) } i=1 nextfold=0 for(file in xmlfiles){ numreads<-numeric(0) inlines<-readLines(file) for(line in inlines){ if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))} if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))} if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))} if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))} if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))} if(length(grep("<from1x>",line)>0)){nextfold=1} else{ if((nextfold==1) && length(grep("PercBases",line))>0){ sample1<-c(sample1,as.numeric(xmltag(line))) nextfold=0 } } if(length(grep("<from5x>",line)>0)){nextfold=5} else{ if((nextfold==5) && length(grep("PercBases",line))>0){ sample5<-c(sample5,as.numeric(xmltag(line))) nextfold=0 } } if(length(grep("<from10x>",line)>0)){nextfold=10} else{ if((nextfold==10) && length(grep("PercBases",line))>0){ sample10<-c(sample10,as.numeric(xmltag(line))) nextfold=0 } } if(length(grep("<from20x>",line)>0)){nextfold=20} else{ if((nextfold==20) && length(grep("PercBases",line))>0){ sample20<-c(sample20,as.numeric(xmltag(line))) nextfold=0 } } if(length(grep("<from30x>",line)>0)){nextfold=30} else{ if((nextfold==30) && length(grep("PercBases",line))>0){ sample30<-c(sample30,as.numeric(xmltag(line))) nextfold=0 } } } numreads_total<-c(numreads_total,numreads[1]) numreads_target<-c(numreads_target,numreads[2]) diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i]) i=i+1 } ##Create barplot rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x") plot_samplenames<-samplenames s=0 for(sname in plot_samplenames){ s=s+1 if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")} } colnames(diffs)<-plot_samplenames png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400) par(oma=c(1,0,0,1)) barplot(diffs, legend=TRUE, col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"), xlim=c(0,4+nsamples),ylim=c(0,100), ylab="Percentage",las=2 ) #, # args.legend=list(x=nsamples+4,y=80)) dev.off() ##Read BED files sametarget=TRUE thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE) genes<-levels(as.factor(thisbed$V4)) meancov<-matrix(nrow=length(genes),ncol=length(bedfiles)) chr<-start<-end<-character(0) i=1 for(file in bedfiles){ thisbed<-read.table(file,stringsAsFactors=FALSE) genes_thissample<-levels(as.factor(thisbed$V4)) if(length(genes_thissample)!=length(genes)){sametarget=FALSE} for(j in 1:length(genes)){ thisgene<-thisbed[thisbed$V4==genes[j],] chr[j]<-thisgene[1,1] start[j]<-thisgene[1,2] end[j]<-thisgene[1,3] meancov[j,i]<-as.numeric(mean(thisgene[,5])) } i=i+1 } meancov<-data.frame(chr,start,end,genes,meancov) for(i in 1:nrow(meancov)){ print(i) meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4) meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4) meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4) meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4) } meancov_sorted<-meancov[order(meancov$ave),] poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],] ##Write HTML output cat(file=outfile,paste( "<html>\n", "<head>\n", "<title>Enrichment Performance</title>\n", "<style type=\"text/css\">\n", "body{font-family:sans-serif;}\n", "h2,h3{color: darkblue;}\n", "a{color: darkblue;}\n", " table.output td{\n", " padding: 4px; background-color: lightskyblue;\n", " border: 1px solid #000; border-color: darkblue;\n", "}\n", "</Style>\n", "</head>\n", "<body>\n", "<h2>Target Enrichment Performance</h2><br/>\n", "<table class=\"output\">\n", " <tr>\n", " <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n", "</tr>\n",sep="")) for(i in 1:nsamples){ cat(file=outfile,paste( "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>", "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>", "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>", "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>", "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n", sep=""),append=TRUE) } cat(file=outfile,paste( "</table>\n", "<br/><br/>\n", "<h2>Target Base Percentage Covered</h2>\n", "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE) if(!sametarget){ warning("Target regions are not the same for all samples.") } else{ cat(file=outfile,paste( "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE) if(nrow(poorly_all)==0){ cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) } else{ cat(file=outfile,paste( "<table class=\"output\">\n", "<tr>\n", "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) for(i in 1:nrow(poorly_all)){ cat(file=outfile,paste( "<tr>\n", "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n", "<td>",poorly_all$genes[i],"</td>\n", "<td>",poorly_all$ave[i],"</td>\n", "<td>",poorly_all$stddev[i],"</td>\n", "<tr>\n",sep=""),append=TRUE) } cat(file=outfile,"</table>\n",append=TRUE) } meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],] cat(file=outfile,paste( "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE) if(nrow(highly_all)==0){ cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) } else{ cat(file=outfile,paste("<table class=\"output\">\n", "<tr>\n", "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) for(i in 1:nrow(highly_all)){ cat(file=outfile,paste( "<tr>\n", "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n", "<td>",highly_all$genes[i],"</td>\n", "<td>",highly_all$ave[i],"</td>\n", "<td>",highly_all$stddev[i],"</td>\n", "<tr>\n",sep=""),append=TRUE) } cat(file=outfile,"</table>\n",append=TRUE) } } cat(file=outfile,"</body>\n</html>\n",append=TRUE)