Mercurial > repos > pfrommolt > ngsrich

#!/usr/bin/Rscript

infile=as.character(commandArgs()[6])
outdir=as.character(commandArgs()[7])
poor=as.numeric(commandArgs()[8])
high=as.numeric(commandArgs()[9])


outfile=paste(outdir,"/summary.html",sep="")
cutoff=c(poor,high)

xmlfiles<-bedfiles<-character(0)
indirs<-as.character(readLines(infile))
for(thisdir in indirs){
  thisdir=paste(thisdir,"/data",sep="")
  xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE))
  bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE))
}
nsamples<-length(indirs)

samplenames<-character(0)
diffs<-matrix(nrow=5,ncol=nsamples)
averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0)
sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0)


##Read XML input files
xmltag<-function(line){
  return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
}
i=1
nextfold=0
for(file in xmlfiles){
  numreads<-numeric(0)
  inlines<-readLines(file)
  for(line in inlines){
    if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))}
    if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))}
    if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))}
    if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))}
    if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))}
    if(length(grep("<from1x>",line)>0)){nextfold=1}
    else{
      if((nextfold==1) && length(grep("PercBases",line))>0){
        sample1<-c(sample1,as.numeric(xmltag(line)))
        nextfold=0
      }
    }
    if(length(grep("<from5x>",line)>0)){nextfold=5}
    else{
      if((nextfold==5) && length(grep("PercBases",line))>0){
        sample5<-c(sample5,as.numeric(xmltag(line)))
        nextfold=0
      }
    }
    if(length(grep("<from10x>",line)>0)){nextfold=10}
    else{
      if((nextfold==10) && length(grep("PercBases",line))>0){
        sample10<-c(sample10,as.numeric(xmltag(line)))
        nextfold=0
      }
    }
    if(length(grep("<from20x>",line)>0)){nextfold=20}
    else{
      if((nextfold==20) && length(grep("PercBases",line))>0){
        sample20<-c(sample20,as.numeric(xmltag(line)))
        nextfold=0
      }
    }
    if(length(grep("<from30x>",line)>0)){nextfold=30}
    else{
      if((nextfold==30) && length(grep("PercBases",line))>0){
        sample30<-c(sample30,as.numeric(xmltag(line)))
        nextfold=0
      }
    }
  }
  numreads_total<-c(numreads_total,numreads[1])
  numreads_target<-c(numreads_target,numreads[2])
  diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i])
  i=i+1
}


##Create barplot
rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x")
plot_samplenames<-samplenames
s=0
for(sname in plot_samplenames){
  s=s+1
  if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")}
}
colnames(diffs)<-plot_samplenames
png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400)
par(oma=c(1,0,0,1))
barplot(diffs,
       	legend=TRUE,
        col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"),
        xlim=c(0,4+nsamples),ylim=c(0,100),
        ylab="Percentage",las=2 ) #,
#        args.legend=list(x=nsamples+4,y=80))
dev.off()


##Read BED files
sametarget=TRUE
thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE)
genes<-levels(as.factor(thisbed$V4))
meancov<-matrix(nrow=length(genes),ncol=length(bedfiles))
chr<-start<-end<-character(0)

i=1
for(file in bedfiles){
  thisbed<-read.table(file,stringsAsFactors=FALSE)
  genes_thissample<-levels(as.factor(thisbed$V4))
  if(length(genes_thissample)!=length(genes)){sametarget=FALSE}
  for(j in 1:length(genes)){
    thisgene<-thisbed[thisbed$V4==genes[j],]
    chr[j]<-thisgene[1,1]
    start[j]<-thisgene[1,2]
    end[j]<-thisgene[1,3]
    meancov[j,i]<-as.numeric(mean(thisgene[,5]))
  }
  i=i+1
}

meancov<-data.frame(chr,start,end,genes,meancov)
for(i in 1:nrow(meancov)){
  print(i)
  meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4)
  meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4)
  meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4)
  meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4)
}
meancov_sorted<-meancov[order(meancov$ave),]
poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],]


##Write HTML output
cat(file=outfile,paste(
      "<html>\n",
      "<head>\n",
      "<title>Enrichment Performance</title>\n",
      "<style type=\"text/css\">\n",
      "body{font-family:sans-serif;}\n",
      "h2,h3{color: darkblue;}\n",
      "a{color: darkblue;}\n",
      "   table.output td{\n",
      "   padding: 4px; background-color: lightskyblue;\n",
      "   border: 1px solid #000; border-color: darkblue;\n",
      "}\n",
      "</Style>\n",
      "</head>\n",
      "<body>\n",
      "<h2>Target Enrichment Performance</h2><br/>\n",
      "<table class=\"output\">\n",
      "  <tr>\n",
      "  <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n",
      "</tr>\n",sep=""))
for(i in 1:nsamples){
  cat(file=outfile,paste(
        "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>",
        "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>",
        "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>",
        "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>",
        "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n",
        sep=""),append=TRUE)
}
cat(file=outfile,paste(
      "</table>\n",
      "<br/><br/>\n",
      "<h2>Target Base Percentage Covered</h2>\n",
      "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE)
if(!sametarget){
	warning("Target regions are not the same for all samples.")
} else{
cat(file=outfile,paste(
      "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE)
if(nrow(poorly_all)==0){
cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
} else{
cat(file=outfile,paste(
      "<table class=\"output\">\n",
      "<tr>\n",
      "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
for(i in 1:nrow(poorly_all)){
  cat(file=outfile,paste(
        "<tr>\n",
        "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n",
        "<td>",poorly_all$genes[i],"</td>\n",
        "<td>",poorly_all$ave[i],"</td>\n",
        "<td>",poorly_all$stddev[i],"</td>\n",
        "<tr>\n",sep=""),append=TRUE)
}
cat(file=outfile,"</table>\n",append=TRUE)
}

meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered
highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],]
cat(file=outfile,paste(
      "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE)
if(nrow(highly_all)==0){
cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
} else{
	cat(file=outfile,paste("<table class=\"output\">\n",
      "<tr>\n",
      "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
for(i in 1:nrow(highly_all)){
  cat(file=outfile,paste(
        "<tr>\n",
        "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n",
        "<td>",highly_all$genes[i],"</td>\n",
        "<td>",highly_all$ave[i],"</td>\n",
        "<td>",highly_all$stddev[i],"</td>\n",
        "<tr>\n",sep=""),append=TRUE)
}
cat(file=outfile,"</table>\n",append=TRUE)
}

}

cat(file=outfile,"</body>\n</html>\n",append=TRUE)
author	pfrommolt
date	Mon, 21 Nov 2011 08:12:19 -0500
parents
children