Mercurial > repos > pfrommolt > ngsrich
comparison NGSrich_0.5.5/R/eval_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
| author | pfrommolt |
|---|---|
| date | Mon, 21 Nov 2011 08:12:19 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:89ad0a9cca52 |
|---|---|
| 1 #!/usr/bin/Rscript | |
| 2 | |
| 3 xmlfile=as.character(commandArgs()[6]) | |
| 4 bedfile=as.character(commandArgs()[7]) | |
| 5 outdir=as.character(commandArgs()[8]) | |
| 6 genome=as.character(commandArgs()[9]) | |
| 7 poor=as.numeric(commandArgs()[10]) | |
| 8 high=as.numeric(commandArgs()[11]) | |
| 9 samplename=as.character(commandArgs()[12]) | |
| 10 targetfile=as.character(commandArgs()[13]) | |
| 11 details=as.numeric(commandArgs()[14]) | |
| 12 cutoff=0 | |
| 13 sdetails="" | |
| 14 #samplename0=strsplit(strsplit(xmlfile,"_")[[1]][1],"/")[[1]] | |
| 15 #samplename<-samplename0[length(samplename0)] | |
| 16 outfile=paste(outdir,"/",samplename,"_enrichment.html",sep="") | |
| 17 | |
| 18 ##Read XML summary | |
| 19 cat("Reading XML file ... ") | |
| 20 xmltag<-function(line){ | |
| 21 return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) | |
| 22 } | |
| 23 xml<-readLines(xmlfile) | |
| 24 numreads<-numeric(0) | |
| 25 nextfold=0 | |
| 26 for(line in xml){ | |
| 27 if(length(grep("ReadLength",line)>0)){ | |
| 28 readlength<-as.numeric(xmltag(line)) | |
| 29 } | |
| 30 if(length(grep("NumberReads",line)>0)){ | |
| 31 numreads<-c(numreads,as.numeric(xmltag(line))) | |
| 32 } | |
| 33 if(length(grep("AvTargetCoverage",line)>0)){ | |
| 34 averagecov=as.numeric(xmltag(line)) | |
| 35 } | |
| 36 if(details==1){ | |
| 37 sdetails="<a href=\"chromosomes/chromosomes.html\">[Show Details]</a><br/>" | |
| 38 } | |
| 39 if(length(grep("SDTargetCoverage",line)>0)){ | |
| 40 stddevcov=as.numeric(xmltag(line)) | |
| 41 } | |
| 42 if(length(grep("TargetSize",line)>0)){ | |
| 43 targetsize=as.numeric(xmltag(line)) | |
| 44 } | |
| 45 if(length(grep("<from1x>",line)>0)){nextfold=1} | |
| 46 else{ | |
| 47 if((nextfold==1) && length(grep("PercBases",line))>0){ | |
| 48 sample1<-as.numeric(xmltag(line)) | |
| 49 nextfold=0 | |
| 50 } | |
| 51 } | |
| 52 if(length(grep("<from5x>",line)>0)){nextfold=5} | |
| 53 else{ | |
| 54 if((nextfold==5) && length(grep("PercBases",line))>0){ | |
| 55 sample5<-as.numeric(xmltag(line)) | |
| 56 nextfold=0 | |
| 57 } | |
| 58 } | |
| 59 if(length(grep("<from10x>",line)>0)){nextfold=10} | |
| 60 else{ | |
| 61 if((nextfold==10) && length(grep("PercBases",line))>0){ | |
| 62 sample10<-as.numeric(xmltag(line)) | |
| 63 nextfold=0 | |
| 64 } | |
| 65 } | |
| 66 if(length(grep("<from20x>",line)>0)){nextfold=20} | |
| 67 else{ | |
| 68 if((nextfold==20) && length(grep("PercBases",line))>0){ | |
| 69 sample20<-as.numeric(xmltag(line)) | |
| 70 nextfold=0 | |
| 71 } | |
| 72 } | |
| 73 if(length(grep("<from30x>",line)>0)){nextfold=30} | |
| 74 else{ | |
| 75 if((nextfold==30) && length(grep("PercBases",line))>0){ | |
| 76 | |
| 77 sample30<-as.numeric(xmltag(line)) | |
| 78 nextfold=0 | |
| 79 } | |
| 80 } | |
| 81 } | |
| 82 | |
| 83 numreads_total<-numreads[1] | |
| 84 numreads_target<-numreads[3] | |
| 85 tpkm=round(numreads_target/((targetsize/1000)*(numreads_total/1000000)),2) | |
| 86 | |
| 87 cat("ready.\n") | |
| 88 | |
| 89 | |
| 90 ##Read BED enrichment file and summarize output | |
| 91 cat("Reading BED file ... ") | |
| 92 bed<-read.table(bedfile,stringsAsFactors=FALSE) | |
| 93 area0_2<-sum(bed$V5<2) | |
| 94 area2_10<-sum((bed$V5>=2) & (bed$V5<10)) | |
| 95 area10_20<-sum((bed$V5>=10) & (bed$V5<20)) | |
| 96 area20_30<-sum((bed$V5>=20) & (bed$V5<30)) | |
| 97 area30_50<-sum((bed$V5>=30) & (bed$V5<50)) | |
| 98 area50_100<-sum((bed$V5>=50) & (bed$V5<100)) | |
| 99 areagr100<-sum(bed$V5>100) | |
| 100 cat("ready.\n") | |
| 101 | |
| 102 | |
| 103 ##Create pieplot | |
| 104 cat("Creating coverage pieplot ... ") | |
| 105 png(file=paste(outdir,"/plots/",samplename,"_pieplot.png",sep=""),width=580) | |
| 106 par(mar=c(1,7,1,7)) | |
| 107 pie(c(area0_2,area2_10,area10_20,area20_30,area30_50,area50_100,areagr100), | |
| 108 labels=c(paste("0x to 2x (",area0_2,")",sep=""), | |
| 109 paste("2x to 10x (",area2_10,")",sep=""), | |
| 110 paste("10x to 20x (",area10_20,")",sep=""), | |
| 111 paste("20x to 30x (",area20_30,")",sep=""), | |
| 112 paste("30x to 50x (",area30_50,")",sep=""), | |
| 113 paste("50x to 100x (",area50_100,")",sep=""), | |
| 114 paste("above 100x (",areagr100,")",sep="")), | |
| 115 col=c("gray30","gray40","gray50","gray60","gray70","gray80","gray90"),main="") ##Mean Coverage of Target Regions") | |
| 116 garbage<-dev.off() | |
| 117 cat("ready.\n") | |
| 118 | |
| 119 | |
| 120 cat("Preparing coverage barplots ... ") | |
| 121 maxgenemean=0 | |
| 122 genes<-levels(as.factor(bed$V4)) | |
| 123 genes<-genes[genes!="unknown"] | |
| 124 ngenes<-length(genes) | |
| 125 | |
| 126 for(i in 1:ngenes){ | |
| 127 genemean<-mean(bed[bed$V4==genes[i],5]) | |
| 128 if(maxgenemean<genemean){maxgenemean=genemean} | |
| 129 } | |
| 130 | |
| 131 maxgenemean=maxgenemean-(maxgenemean%%100)+100 | |
| 132 if((cutoff>0) && (maxgenemean>cutoff)){ | |
| 133 maxgenemean=cutoff | |
| 134 } | |
| 135 cat("ready.\n") | |
| 136 | |
| 137 fwidth<-function(genes){ | |
| 138 fwidth0=10*length(genes) | |
| 139 if(length(genes)<=100){fwidth0=10*length(genes)} | |
| 140 if(length(genes)<=70){fwidth0=10*length(genes)} | |
| 141 if(length(genes)<=40){fwidth0=25*length(genes)} | |
| 142 if(length(genes)<=20){fwidth0=30*length(genes)} | |
| 143 if(length(genes)<=10){fwidth0=35*length(genes)} | |
| 144 if(length(genes)<=5){fwidth0=50*length(genes)} | |
| 145 return(fwidth0) | |
| 146 } | |
| 147 | |
| 148 | |
| 149 chromosomes<-levels(as.factor(bed$V1)) | |
| 150 if(length(genes)>=2000){ | |
| 151 cat("Creating coverage barplots ... ") | |
| 152 for(chr in chromosomes){ | |
| 153 | |
| 154 chrbed<-bed[bed$V1==chr,] | |
| 155 genemean<-numeric(0) | |
| 156 genes<-levels(as.factor(chrbed$V4)) | |
| 157 genes<-genes[genes!="unknown"] | |
| 158 | |
| 159 for(i in 1:length(genes)){ | |
| 160 if(cutoff>0){genemean[i]<-min(mean(chrbed[chrbed$V4==genes[i],5]),cutoff)} | |
| 161 else{genemean[i]<-mean(chrbed[chrbed$V4==genes[i],5])} | |
| 162 } | |
| 163 | |
| 164 png(file=paste(outdir,"/plots/",samplename,"_target_coverage_",chr,".png",sep=""),width=fwidth(genes),height=450) | |
| 165 par(mar=c(7,5,1,2)) | |
| 166 barplot(as.numeric(genemean),names.arg=genes,las=2, | |
| 167 ylab=paste("Average Coverage at Gene Locus (",chr,")",sep=""),col="tomato3",xlim=c(0.043*length(genemean),1.2*length(genes)-0.043*length(genemean)),ylim=c(0,maxgenemean)) | |
| 168 lineh=0 | |
| 169 while(lineh<maxgenemean){ | |
| 170 lineh=lineh+100 | |
| 171 lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray") | |
| 172 } | |
| 173 garbage<-dev.off() | |
| 174 } | |
| 175 cat("ready.\n") | |
| 176 } else{ | |
| 177 cat("Creating coverage barplot ... ") | |
| 178 for(i in 1:length(genes)){ | |
| 179 if(cutoff>0){genemean[i]<-min(mean(bed[bed$V4==genes[i],5]),cutoff)} | |
| 180 else{genemean[i]<-mean(bed[bed$V4==genes[i],5])} | |
| 181 } | |
| 182 | |
| 183 | |
| 184 png(file=paste(outdir,"/plots/",samplename,"_target_coverage.png",sep=""),width=fwidth(genes),height=450) | |
| 185 par(mar=c(7,5,1,2)) | |
| 186 barplot(as.numeric(genemean),names.arg=genes,las=2, | |
| 187 ylab="Average Coverage at Gene Locus",col="tomato3",xlim=c(0,1.2*ngenes),ylim=c(0,maxgenemean)) | |
| 188 lineh=0 | |
| 189 while(lineh<maxgenemean){ | |
| 190 lineh=lineh+100 | |
| 191 lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray") | |
| 192 } | |
| 193 garbage<-dev.off() | |
| 194 cat("ready.\n") | |
| 195 } | |
| 196 | |
| 197 | |
| 198 ##Searching for poorly/highly covered genes | |
| 199 cat("Searching for poorly (<",poor,"x) and highly (>",high,"x) covered genes ... ",sep="") | |
| 200 genes<-levels(as.factor(bed$V4)) | |
| 201 chr<-start<-end<-gene1<-ave<-stddev<-gr1x<-gr5x<-gr10x<-gr20x<-gr30x<-numeric(0) | |
| 202 for(gene in genes){ | |
| 203 | |
| 204 bed0<-bed[bed$V4==gene,] | |
| 205 len<-bed0[,3]-bed0[,2] | |
| 206 fold1<-fold5<-fold10<-fold20<-fold30<-numeric(0) | |
| 207 for(i in 1:nrow(bed0)){fold1<-c(fold1,as.numeric(bed0$V6[i]))} | |
| 208 for(i in 1:nrow(bed0)){fold5<-c(fold5,as.numeric(bed0$V7[i]))} | |
| 209 for(i in 1:nrow(bed0)){fold10<-c(fold10,as.numeric(bed0$V8[i]))} | |
| 210 for(i in 1:nrow(bed0)){fold20<-c(fold20,as.numeric(bed0$V9[i]))} | |
| 211 | |
| 212 for(i in 1:nrow(bed0)){fold30<-c(fold30,as.numeric(bed0$V10[i]))} | |
| 213 fold1_gene<-paste(round(sum(fold1*len)/sum(len),2),"%",sep="") | |
| 214 fold5_gene<-paste(round(sum(fold5*len)/sum(len),2),"%",sep="") | |
| 215 fold10_gene<-paste(round(sum(fold10*len)/sum(len),2),"%",sep="") | |
| 216 fold20_gene<-paste(round(sum(fold20*len)/sum(len),2),"%",sep="") | |
| 217 fold30_gene<-paste(round(sum(fold30*len)/sum(len),2),"%",sep="") | |
| 218 | |
| 219 chr<-c(chr,bed0[1,1]) | |
| 220 start<-c(start,min(bed0[,2])) | |
| 221 end<-c(end,max(bed0[,3])) | |
| 222 gene1<-c(gene1,gene) | |
| 223 ave<-c(ave,round(mean(bed0[,5]),2)) | |
| 224 if(is.na(sd(bed0[,5]))){ | |
| 225 stddev<-c(stddev,0) | |
| 226 } | |
| 227 else{ | |
| 228 stddev<-c(stddev,round(sd(bed0[,5]),2)) | |
| 229 } | |
| 230 | |
| 231 gr1x<-c(gr1x,fold1_gene) | |
| 232 gr5x<-c(gr5x,fold5_gene) | |
| 233 gr10x<-c(gr10x,fold10_gene) | |
| 234 gr20x<-c(gr20x,fold20_gene) | |
| 235 gr30x<-c(gr30x,fold30_gene) | |
| 236 | |
| 237 } | |
| 238 | |
| 239 bed_gene<-data.frame(chr,start,end,gene1,ave,stddev,gr1x,gr5x,gr10x,gr20x,gr30x) | |
| 240 | |
| 241 poorly_covered<-bed_gene[bed_gene$ave<poor,] | |
| 242 poorly_covered<-poorly_covered[order(poorly_covered[,5],decreasing=FALSE),] | |
| 243 highly_covered<-bed_gene[bed_gene$ave>high,] | |
| 244 highly_covered<-highly_covered[order(highly_covered[,5],decreasing=TRUE),] | |
| 245 | |
| 246 linkpoor<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=", | |
| 247 poorly_covered$chr,"%3A",poorly_covered$start,"-", | |
| 248 poorly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="") | |
| 249 linkhigh<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=", | |
| 250 highly_covered$chr,"%3A",highly_covered$start,"-", | |
| 251 highly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="") | |
| 252 cat("ready.\n",sep="") | |
| 253 | |
| 254 | |
| 255 ##Output HTML document | |
| 256 cat("Writing HTML output ... "); | |
| 257 | |
| 258 cat(file=outfile,paste( | |
| 259 "<html>\n", | |
| 260 "<head>\n", | |
| 261 "<title>Enrichment Performance</title>\n", | |
| 262 "<style type=\"text/css\">\n", | |
| 263 " body{font-family:sans-serif;}\n", | |
| 264 " h2,h3{color: darkblue;}\n", | |
| 265 " a{color:darkblue;}\n", | |
| 266 " table.output td{", | |
| 267 " padding: 4px; background-color: lightskyblue;", | |
| 268 " border: 1px solid #000; border-color: darkblue;", | |
| 269 " }\n", | |
| 270 "</style>\n", | |
| 271 "\n", | |
| 272 "<script language=\"JavaScript\">\n", | |
| 273 " var questionClass=\"chrView\";\n", | |
| 274 " function collapseAll(){\n", | |
| 275 " var allSections = document.getElementsByTagName(\"div\");\n", | |
| 276 " for(i=0; i<allSections.length; i++){\n", | |
| 277 " if(allSections[i].className==questionClass){\n", | |
| 278 " allSections[i].style.display=\"none\";\n", | |
| 279 " }\n", | |
| 280 " }\n", | |
| 281 " }\n", | |
| 282 " function expand(name){\n", | |
| 283 " collapseAll();\n", | |
| 284 " var newStyle=\"\";\n", | |
| 285 " if(document.getElementById(name).style.display!=\"block\"){\n", | |
| 286 " newStyle=\"block\";\n", | |
| 287 " }\n", | |
| 288 " else{\n", | |
| 289 " newStyle=\"none\";\n", | |
| 290 " }\n", | |
| 291 " document.getElementById(name).style.display=newStyle;\n", | |
| 292 " }\n", | |
| 293 "</script>\n", | |
| 294 "</head>\n", | |
| 295 "\n", | |
| 296 "<body onload=\"expand('",chromosomes[1],"')\">", | |
| 297 "<h2>Enrichment Performance of Sample ",samplename,"</h2>\n", | |
| 298 "<table>\n", | |
| 299 "<tr>\n", | |
| 300 "<td>\n", | |
| 301 "<h3>Summary Statistics</h3>\n", | |
| 302 "<table class=\"output\">\n", | |
| 303 " <tr><td><b># Reads</b></td><td>",numreads_total,"</td></tr>\n", | |
| 304 " <tr><td><b># On Target ± 100 bp</b></td><td>",numreads_target,"</td></tr>\n", | |
| 305 " <tr><td><b>Target Size (bp)</b><td>",targetsize,"</td></tr>\n", | |
| 306 " <tr><td><b># Target Regions</b><td>",nrow(bed),"</td></tr>\n", | |
| 307 " <tr><td><b>Coverage Mean</b></td><td>",averagecov,"</td></tr>\n", | |
| 308 " <tr><td><b>Coverage Std Dev</b></td><td>",stddevcov,"</td></tr>\n", | |
| 309 " <tr><td><b>Covered 1x</b></td><td>",sample1,"</td></tr>\n", | |
| 310 " <tr><td><b>Covered 5x</b></td><td>",sample5,"</td></tr>\n", | |
| 311 " <tr><td><b>Covered 10x</b></td><td>",sample10,"</td></tr>\n", | |
| 312 " <tr><td><b>Covered 20x</b></td><td>",sample20,"</td></tr>\n", | |
| 313 " <tr><td><b>Covered 30x</b></td><td>",sample30,"</td></tr>\n", | |
| 314 " <tr><td><b>TPKM</b></td><td>",tpkm,"</td></tr>\n", | |
| 315 "</table>\n", | |
| 316 "</td>\n", | |
| 317 "<td width=\"10%\"></td>", | |
| 318 "<td>\n", | |
| 319 "<img src=\"plots/",samplename,"_pieplot.png\"></img>\n", | |
| 320 "</td>\n", | |
| 321 "</tr>\n", | |
| 322 "</table>\n", | |
| 323 "<br/>\n", | |
| 324 "<h2>Genewise Target Coverage</h2>",sdetails,"<br/>\n",sep="")) | |
| 325 | |
| 326 | |
| 327 if(ngenes>=2000){ | |
| 328 for(chromosome in chromosomes){ | |
| 329 cat(file=outfile,paste("<a href=\"javascript:expand('",chromosome,"')\">",chromosome,"</a>\n",sep=""),append=TRUE) | |
| 330 } | |
| 331 for(chromosome in chromosomes){ | |
| 332 cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\" class=\"chrView\" id=\"",chromosome,"\"><img src=\"plots/",samplename,"_target_coverage_",chromosome,".png\"></img></div>\n",sep=""),append=TRUE) | |
| 333 } | |
| 334 } else{ | |
| 335 cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\"><img src=\"plots/",samplename,"_target_coverage.png\"></img></div>\n",sep=""),append=TRUE) | |
| 336 } | |
| 337 | |
| 338 cat(file=outfile,paste("<br/><br/>\n", | |
| 339 "<h2>Poorly Covered Genes (Cutoff: ",poor,"x)</h2>\n",sep=""),append=TRUE) | |
| 340 if(nrow(poorly_covered)==0){ | |
| 341 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
| 342 } else{ | |
| 343 cat(file=outfile,paste( | |
| 344 "<table class=\"output\">\n", | |
| 345 " <tr>\n", | |
| 346 " <td><b>Region</b></td><td><b>Gene</b></td>\n", | |
| 347 " <td><b>Coverage Mean</b></td>\n", | |
| 348 " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td>\n", | |
| 349 " <td><b>Covered 10x</b></td><td><b>Covered 20x</b></td>\n", | |
| 350 " <td><b>Covered 30x</b></td><td><b>External Link</b></td>\n", | |
| 351 " </tr>\n",sep=""),append=TRUE) | |
| 352 | |
| 353 for(i in 1:nrow(poorly_covered)){ | |
| 354 cat(file=outfile,paste( | |
| 355 " <tr>\n", | |
| 356 " <td>",poorly_covered[i,1],":",poorly_covered[i,2],"-",poorly_covered[i,3],"</td>\n", | |
| 357 " <td>",poorly_covered[i,4],"</td>\n", | |
| 358 " <td>",poorly_covered[i,5],"</td>\n", | |
| 359 " <td>",poorly_covered[i,7],"</td>\n", | |
| 360 " <td>",poorly_covered[i,8],"</td>\n", | |
| 361 " <td>",poorly_covered[i,9],"</td>\n", | |
| 362 " <td>",poorly_covered[i,10],"</td>\n", | |
| 363 " <td>",poorly_covered[i,11],"</td>\n", | |
| 364 " <td><a href=\"",linkpoor[i],"\">Show in Genome Browser</a></td>\n", | |
| 365 " </tr>\n",sep=""),append=TRUE) | |
| 366 } | |
| 367 | |
| 368 cat(file=outfile,"</table><br/>\n",append=TRUE) | |
| 369 } | |
| 370 | |
| 371 cat(file=outfile,paste("<h2>Highly Covered Genes (Cutoff: ",high,"x)</h2>\n",sep=""),append=TRUE) | |
| 372 if(nrow(highly_covered)==0){ | |
| 373 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
| 374 } else{ | |
| 375 cat(file=outfile,paste( | |
| 376 "<table class=\"output\">\n", | |
| 377 " <tr>\n", | |
| 378 " <td><b>Target</b></td><td><b>Gene</b></td><td><b>Coverage Mean</b></td>", | |
| 379 " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td>", | |
| 380 " <td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>External Link</b></td>\n", | |
| 381 " </tr>\n",sep=""),append=TRUE) | |
| 382 for(i in 1:nrow(highly_covered)){ | |
| 383 cat(file=outfile,paste(" <tr>\n", | |
| 384 " <td>",highly_covered[i,1],":",highly_covered[i,2],"-",highly_covered[i,3],"</td>\n", | |
| 385 " <td>",highly_covered[i,4],"</td>\n", | |
| 386 " <td>",highly_covered[i,5],"</td>\n", | |
| 387 " <td>",highly_covered[i,7],"</td>\n", | |
| 388 " <td>",highly_covered[i,8],"</td>\n", | |
| 389 " <td>",highly_covered[i,9],"</td>\n", | |
| 390 " <td>",highly_covered[i,10],"</td>\n", | |
| 391 " <td>",highly_covered[i,11],"</td>\n", | |
| 392 " <td><a href=\"",linkhigh[i],"\">Show in Genome Browser</a></td>\n", | |
| 393 " </tr>",sep=""),append=TRUE) | |
| 394 } | |
| 395 | |
| 396 cat(file=outfile,"</table>\n",append=TRUE) | |
| 397 } | |
| 398 cat(file=outfile,"<br/>BED file used for specification of target regions:<br/>",targetfile,append=TRUE) | |
| 399 cat(file=outfile,"</body>\n</html>\n",append=TRUE) | |
| 400 | |
| 401 cat("ready.\n") |
