0
|
1 #!/usr/bin/Rscript
|
|
2
|
|
3 xmlfile=as.character(commandArgs()[6])
|
|
4 bedfile=as.character(commandArgs()[7])
|
|
5 outdir=as.character(commandArgs()[8])
|
|
6 genome=as.character(commandArgs()[9])
|
|
7 poor=as.numeric(commandArgs()[10])
|
|
8 high=as.numeric(commandArgs()[11])
|
|
9 samplename=as.character(commandArgs()[12])
|
|
10 targetfile=as.character(commandArgs()[13])
|
|
11 details=as.numeric(commandArgs()[14])
|
|
12 cutoff=0
|
|
13 sdetails=""
|
|
14 #samplename0=strsplit(strsplit(xmlfile,"_")[[1]][1],"/")[[1]]
|
|
15 #samplename<-samplename0[length(samplename0)]
|
|
16 outfile=paste(outdir,"/",samplename,"_enrichment.html",sep="")
|
|
17
|
|
18 ##Read XML summary
|
|
19 cat("Reading XML file ... ")
|
|
20 xmltag<-function(line){
|
|
21 return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
|
|
22 }
|
|
23 xml<-readLines(xmlfile)
|
|
24 numreads<-numeric(0)
|
|
25 nextfold=0
|
|
26 for(line in xml){
|
|
27 if(length(grep("ReadLength",line)>0)){
|
|
28 readlength<-as.numeric(xmltag(line))
|
|
29 }
|
|
30 if(length(grep("NumberReads",line)>0)){
|
|
31 numreads<-c(numreads,as.numeric(xmltag(line)))
|
|
32 }
|
|
33 if(length(grep("AvTargetCoverage",line)>0)){
|
|
34 averagecov=as.numeric(xmltag(line))
|
|
35 }
|
|
36 if(details==1){
|
|
37 sdetails="<a href=\"chromosomes/chromosomes.html\">[Show Details]</a><br/>"
|
|
38 }
|
|
39 if(length(grep("SDTargetCoverage",line)>0)){
|
|
40 stddevcov=as.numeric(xmltag(line))
|
|
41 }
|
|
42 if(length(grep("TargetSize",line)>0)){
|
|
43 targetsize=as.numeric(xmltag(line))
|
|
44 }
|
|
45 if(length(grep("<from1x>",line)>0)){nextfold=1}
|
|
46 else{
|
|
47 if((nextfold==1) && length(grep("PercBases",line))>0){
|
|
48 sample1<-as.numeric(xmltag(line))
|
|
49 nextfold=0
|
|
50 }
|
|
51 }
|
|
52 if(length(grep("<from5x>",line)>0)){nextfold=5}
|
|
53 else{
|
|
54 if((nextfold==5) && length(grep("PercBases",line))>0){
|
|
55 sample5<-as.numeric(xmltag(line))
|
|
56 nextfold=0
|
|
57 }
|
|
58 }
|
|
59 if(length(grep("<from10x>",line)>0)){nextfold=10}
|
|
60 else{
|
|
61 if((nextfold==10) && length(grep("PercBases",line))>0){
|
|
62 sample10<-as.numeric(xmltag(line))
|
|
63 nextfold=0
|
|
64 }
|
|
65 }
|
|
66 if(length(grep("<from20x>",line)>0)){nextfold=20}
|
|
67 else{
|
|
68 if((nextfold==20) && length(grep("PercBases",line))>0){
|
|
69 sample20<-as.numeric(xmltag(line))
|
|
70 nextfold=0
|
|
71 }
|
|
72 }
|
|
73 if(length(grep("<from30x>",line)>0)){nextfold=30}
|
|
74 else{
|
|
75 if((nextfold==30) && length(grep("PercBases",line))>0){
|
|
76
|
|
77 sample30<-as.numeric(xmltag(line))
|
|
78 nextfold=0
|
|
79 }
|
|
80 }
|
|
81 }
|
|
82
|
|
83 numreads_total<-numreads[1]
|
|
84 numreads_target<-numreads[3]
|
|
85 tpkm=round(numreads_target/((targetsize/1000)*(numreads_total/1000000)),2)
|
|
86
|
|
87 cat("ready.\n")
|
|
88
|
|
89
|
|
90 ##Read BED enrichment file and summarize output
|
|
91 cat("Reading BED file ... ")
|
|
92 bed<-read.table(bedfile,stringsAsFactors=FALSE)
|
|
93 area0_2<-sum(bed$V5<2)
|
|
94 area2_10<-sum((bed$V5>=2) & (bed$V5<10))
|
|
95 area10_20<-sum((bed$V5>=10) & (bed$V5<20))
|
|
96 area20_30<-sum((bed$V5>=20) & (bed$V5<30))
|
|
97 area30_50<-sum((bed$V5>=30) & (bed$V5<50))
|
|
98 area50_100<-sum((bed$V5>=50) & (bed$V5<100))
|
|
99 areagr100<-sum(bed$V5>100)
|
|
100 cat("ready.\n")
|
|
101
|
|
102
|
|
103 ##Create pieplot
|
|
104 cat("Creating coverage pieplot ... ")
|
|
105 png(file=paste(outdir,"/plots/",samplename,"_pieplot.png",sep=""),width=580)
|
|
106 par(mar=c(1,7,1,7))
|
|
107 pie(c(area0_2,area2_10,area10_20,area20_30,area30_50,area50_100,areagr100),
|
|
108 labels=c(paste("0x to 2x (",area0_2,")",sep=""),
|
|
109 paste("2x to 10x (",area2_10,")",sep=""),
|
|
110 paste("10x to 20x (",area10_20,")",sep=""),
|
|
111 paste("20x to 30x (",area20_30,")",sep=""),
|
|
112 paste("30x to 50x (",area30_50,")",sep=""),
|
|
113 paste("50x to 100x (",area50_100,")",sep=""),
|
|
114 paste("above 100x (",areagr100,")",sep="")),
|
|
115 col=c("gray30","gray40","gray50","gray60","gray70","gray80","gray90"),main="") ##Mean Coverage of Target Regions")
|
|
116 garbage<-dev.off()
|
|
117 cat("ready.\n")
|
|
118
|
|
119
|
|
120 cat("Preparing coverage barplots ... ")
|
|
121 maxgenemean=0
|
|
122 genes<-levels(as.factor(bed$V4))
|
|
123 genes<-genes[genes!="unknown"]
|
|
124 ngenes<-length(genes)
|
|
125
|
|
126 for(i in 1:ngenes){
|
|
127 genemean<-mean(bed[bed$V4==genes[i],5])
|
|
128 if(maxgenemean<genemean){maxgenemean=genemean}
|
|
129 }
|
|
130
|
|
131 maxgenemean=maxgenemean-(maxgenemean%%100)+100
|
|
132 if((cutoff>0) && (maxgenemean>cutoff)){
|
|
133 maxgenemean=cutoff
|
|
134 }
|
|
135 cat("ready.\n")
|
|
136
|
|
137 fwidth<-function(genes){
|
|
138 fwidth0=10*length(genes)
|
|
139 if(length(genes)<=100){fwidth0=10*length(genes)}
|
|
140 if(length(genes)<=70){fwidth0=10*length(genes)}
|
|
141 if(length(genes)<=40){fwidth0=25*length(genes)}
|
|
142 if(length(genes)<=20){fwidth0=30*length(genes)}
|
|
143 if(length(genes)<=10){fwidth0=35*length(genes)}
|
|
144 if(length(genes)<=5){fwidth0=50*length(genes)}
|
|
145 return(fwidth0)
|
|
146 }
|
|
147
|
|
148
|
|
149 chromosomes<-levels(as.factor(bed$V1))
|
|
150 if(length(genes)>=2000){
|
|
151 cat("Creating coverage barplots ... ")
|
|
152 for(chr in chromosomes){
|
|
153
|
|
154 chrbed<-bed[bed$V1==chr,]
|
|
155 genemean<-numeric(0)
|
|
156 genes<-levels(as.factor(chrbed$V4))
|
|
157 genes<-genes[genes!="unknown"]
|
|
158
|
|
159 for(i in 1:length(genes)){
|
|
160 if(cutoff>0){genemean[i]<-min(mean(chrbed[chrbed$V4==genes[i],5]),cutoff)}
|
|
161 else{genemean[i]<-mean(chrbed[chrbed$V4==genes[i],5])}
|
|
162 }
|
|
163
|
|
164 png(file=paste(outdir,"/plots/",samplename,"_target_coverage_",chr,".png",sep=""),width=fwidth(genes),height=450)
|
|
165 par(mar=c(7,5,1,2))
|
|
166 barplot(as.numeric(genemean),names.arg=genes,las=2,
|
|
167 ylab=paste("Average Coverage at Gene Locus (",chr,")",sep=""),col="tomato3",xlim=c(0.043*length(genemean),1.2*length(genes)-0.043*length(genemean)),ylim=c(0,maxgenemean))
|
|
168 lineh=0
|
|
169 while(lineh<maxgenemean){
|
|
170 lineh=lineh+100
|
|
171 lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray")
|
|
172 }
|
|
173 garbage<-dev.off()
|
|
174 }
|
|
175 cat("ready.\n")
|
|
176 } else{
|
|
177 cat("Creating coverage barplot ... ")
|
|
178 for(i in 1:length(genes)){
|
|
179 if(cutoff>0){genemean[i]<-min(mean(bed[bed$V4==genes[i],5]),cutoff)}
|
|
180 else{genemean[i]<-mean(bed[bed$V4==genes[i],5])}
|
|
181 }
|
|
182
|
|
183
|
|
184 png(file=paste(outdir,"/plots/",samplename,"_target_coverage.png",sep=""),width=fwidth(genes),height=450)
|
|
185 par(mar=c(7,5,1,2))
|
|
186 barplot(as.numeric(genemean),names.arg=genes,las=2,
|
|
187 ylab="Average Coverage at Gene Locus",col="tomato3",xlim=c(0,1.2*ngenes),ylim=c(0,maxgenemean))
|
|
188 lineh=0
|
|
189 while(lineh<maxgenemean){
|
|
190 lineh=lineh+100
|
|
191 lines(c(-100,1000+length(genes)),c(lineh,lineh),col="gray")
|
|
192 }
|
|
193 garbage<-dev.off()
|
|
194 cat("ready.\n")
|
|
195 }
|
|
196
|
|
197
|
|
198 ##Searching for poorly/highly covered genes
|
|
199 cat("Searching for poorly (<",poor,"x) and highly (>",high,"x) covered genes ... ",sep="")
|
|
200 genes<-levels(as.factor(bed$V4))
|
|
201 chr<-start<-end<-gene1<-ave<-stddev<-gr1x<-gr5x<-gr10x<-gr20x<-gr30x<-numeric(0)
|
|
202 for(gene in genes){
|
|
203
|
|
204 bed0<-bed[bed$V4==gene,]
|
|
205 len<-bed0[,3]-bed0[,2]
|
|
206 fold1<-fold5<-fold10<-fold20<-fold30<-numeric(0)
|
|
207 for(i in 1:nrow(bed0)){fold1<-c(fold1,as.numeric(bed0$V6[i]))}
|
|
208 for(i in 1:nrow(bed0)){fold5<-c(fold5,as.numeric(bed0$V7[i]))}
|
|
209 for(i in 1:nrow(bed0)){fold10<-c(fold10,as.numeric(bed0$V8[i]))}
|
|
210 for(i in 1:nrow(bed0)){fold20<-c(fold20,as.numeric(bed0$V9[i]))}
|
|
211
|
|
212 for(i in 1:nrow(bed0)){fold30<-c(fold30,as.numeric(bed0$V10[i]))}
|
|
213 fold1_gene<-paste(round(sum(fold1*len)/sum(len),2),"%",sep="")
|
|
214 fold5_gene<-paste(round(sum(fold5*len)/sum(len),2),"%",sep="")
|
|
215 fold10_gene<-paste(round(sum(fold10*len)/sum(len),2),"%",sep="")
|
|
216 fold20_gene<-paste(round(sum(fold20*len)/sum(len),2),"%",sep="")
|
|
217 fold30_gene<-paste(round(sum(fold30*len)/sum(len),2),"%",sep="")
|
|
218
|
|
219 chr<-c(chr,bed0[1,1])
|
|
220 start<-c(start,min(bed0[,2]))
|
|
221 end<-c(end,max(bed0[,3]))
|
|
222 gene1<-c(gene1,gene)
|
|
223 ave<-c(ave,round(mean(bed0[,5]),2))
|
|
224 if(is.na(sd(bed0[,5]))){
|
|
225 stddev<-c(stddev,0)
|
|
226 }
|
|
227 else{
|
|
228 stddev<-c(stddev,round(sd(bed0[,5]),2))
|
|
229 }
|
|
230
|
|
231 gr1x<-c(gr1x,fold1_gene)
|
|
232 gr5x<-c(gr5x,fold5_gene)
|
|
233 gr10x<-c(gr10x,fold10_gene)
|
|
234 gr20x<-c(gr20x,fold20_gene)
|
|
235 gr30x<-c(gr30x,fold30_gene)
|
|
236
|
|
237 }
|
|
238
|
|
239 bed_gene<-data.frame(chr,start,end,gene1,ave,stddev,gr1x,gr5x,gr10x,gr20x,gr30x)
|
|
240
|
|
241 poorly_covered<-bed_gene[bed_gene$ave<poor,]
|
|
242 poorly_covered<-poorly_covered[order(poorly_covered[,5],decreasing=FALSE),]
|
|
243 highly_covered<-bed_gene[bed_gene$ave>high,]
|
|
244 highly_covered<-highly_covered[order(highly_covered[,5],decreasing=TRUE),]
|
|
245
|
|
246 linkpoor<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=",
|
|
247 poorly_covered$chr,"%3A",poorly_covered$start,"-",
|
|
248 poorly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="")
|
|
249 linkhigh<-paste("http://www.genome.ucsc.edu/cgi-bin/hgTracks?&db=",genome,"&position=",
|
|
250 highly_covered$chr,"%3A",highly_covered$start,"-",
|
|
251 highly_covered$end,"&hgt.suggest=&pix=800&Submit=submit&hgsid=183341879",sep="")
|
|
252 cat("ready.\n",sep="")
|
|
253
|
|
254
|
|
255 ##Output HTML document
|
|
256 cat("Writing HTML output ... ");
|
|
257
|
|
258 cat(file=outfile,paste(
|
|
259 "<html>\n",
|
|
260 "<head>\n",
|
|
261 "<title>Enrichment Performance</title>\n",
|
|
262 "<style type=\"text/css\">\n",
|
|
263 " body{font-family:sans-serif;}\n",
|
|
264 " h2,h3{color: darkblue;}\n",
|
|
265 " a{color:darkblue;}\n",
|
|
266 " table.output td{",
|
|
267 " padding: 4px; background-color: lightskyblue;",
|
|
268 " border: 1px solid #000; border-color: darkblue;",
|
|
269 " }\n",
|
|
270 "</style>\n",
|
|
271 "\n",
|
|
272 "<script language=\"JavaScript\">\n",
|
|
273 " var questionClass=\"chrView\";\n",
|
|
274 " function collapseAll(){\n",
|
|
275 " var allSections = document.getElementsByTagName(\"div\");\n",
|
|
276 " for(i=0; i<allSections.length; i++){\n",
|
|
277 " if(allSections[i].className==questionClass){\n",
|
|
278 " allSections[i].style.display=\"none\";\n",
|
|
279 " }\n",
|
|
280 " }\n",
|
|
281 " }\n",
|
|
282 " function expand(name){\n",
|
|
283 " collapseAll();\n",
|
|
284 " var newStyle=\"\";\n",
|
|
285 " if(document.getElementById(name).style.display!=\"block\"){\n",
|
|
286 " newStyle=\"block\";\n",
|
|
287 " }\n",
|
|
288 " else{\n",
|
|
289 " newStyle=\"none\";\n",
|
|
290 " }\n",
|
|
291 " document.getElementById(name).style.display=newStyle;\n",
|
|
292 " }\n",
|
|
293 "</script>\n",
|
|
294 "</head>\n",
|
|
295 "\n",
|
|
296 "<body onload=\"expand('",chromosomes[1],"')\">",
|
|
297 "<h2>Enrichment Performance of Sample ",samplename,"</h2>\n",
|
|
298 "<table>\n",
|
|
299 "<tr>\n",
|
|
300 "<td>\n",
|
|
301 "<h3>Summary Statistics</h3>\n",
|
|
302 "<table class=\"output\">\n",
|
|
303 " <tr><td><b># Reads</b></td><td>",numreads_total,"</td></tr>\n",
|
|
304 " <tr><td><b># On Target ± 100 bp</b></td><td>",numreads_target,"</td></tr>\n",
|
|
305 " <tr><td><b>Target Size (bp)</b><td>",targetsize,"</td></tr>\n",
|
|
306 " <tr><td><b># Target Regions</b><td>",nrow(bed),"</td></tr>\n",
|
|
307 " <tr><td><b>Coverage Mean</b></td><td>",averagecov,"</td></tr>\n",
|
|
308 " <tr><td><b>Coverage Std Dev</b></td><td>",stddevcov,"</td></tr>\n",
|
|
309 " <tr><td><b>Covered 1x</b></td><td>",sample1,"</td></tr>\n",
|
|
310 " <tr><td><b>Covered 5x</b></td><td>",sample5,"</td></tr>\n",
|
|
311 " <tr><td><b>Covered 10x</b></td><td>",sample10,"</td></tr>\n",
|
|
312 " <tr><td><b>Covered 20x</b></td><td>",sample20,"</td></tr>\n",
|
|
313 " <tr><td><b>Covered 30x</b></td><td>",sample30,"</td></tr>\n",
|
|
314 " <tr><td><b>TPKM</b></td><td>",tpkm,"</td></tr>\n",
|
|
315 "</table>\n",
|
|
316 "</td>\n",
|
|
317 "<td width=\"10%\"></td>",
|
|
318 "<td>\n",
|
|
319 "<img src=\"plots/",samplename,"_pieplot.png\"></img>\n",
|
|
320 "</td>\n",
|
|
321 "</tr>\n",
|
|
322 "</table>\n",
|
|
323 "<br/>\n",
|
|
324 "<h2>Genewise Target Coverage</h2>",sdetails,"<br/>\n",sep=""))
|
|
325
|
|
326
|
|
327 if(ngenes>=2000){
|
|
328 for(chromosome in chromosomes){
|
|
329 cat(file=outfile,paste("<a href=\"javascript:expand('",chromosome,"')\">",chromosome,"</a>\n",sep=""),append=TRUE)
|
|
330 }
|
|
331 for(chromosome in chromosomes){
|
|
332 cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\" class=\"chrView\" id=\"",chromosome,"\"><img src=\"plots/",samplename,"_target_coverage_",chromosome,".png\"></img></div>\n",sep=""),append=TRUE)
|
|
333 }
|
|
334 } else{
|
|
335 cat(file=outfile,paste("<div style=\"height:480px; overflow:auto;\"><img src=\"plots/",samplename,"_target_coverage.png\"></img></div>\n",sep=""),append=TRUE)
|
|
336 }
|
|
337
|
|
338 cat(file=outfile,paste("<br/><br/>\n",
|
|
339 "<h2>Poorly Covered Genes (Cutoff: ",poor,"x)</h2>\n",sep=""),append=TRUE)
|
|
340 if(nrow(poorly_covered)==0){
|
|
341 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
|
|
342 } else{
|
|
343 cat(file=outfile,paste(
|
|
344 "<table class=\"output\">\n",
|
|
345 " <tr>\n",
|
|
346 " <td><b>Region</b></td><td><b>Gene</b></td>\n",
|
|
347 " <td><b>Coverage Mean</b></td>\n",
|
|
348 " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td>\n",
|
|
349 " <td><b>Covered 10x</b></td><td><b>Covered 20x</b></td>\n",
|
|
350 " <td><b>Covered 30x</b></td><td><b>External Link</b></td>\n",
|
|
351 " </tr>\n",sep=""),append=TRUE)
|
|
352
|
|
353 for(i in 1:nrow(poorly_covered)){
|
|
354 cat(file=outfile,paste(
|
|
355 " <tr>\n",
|
|
356 " <td>",poorly_covered[i,1],":",poorly_covered[i,2],"-",poorly_covered[i,3],"</td>\n",
|
|
357 " <td>",poorly_covered[i,4],"</td>\n",
|
|
358 " <td>",poorly_covered[i,5],"</td>\n",
|
|
359 " <td>",poorly_covered[i,7],"</td>\n",
|
|
360 " <td>",poorly_covered[i,8],"</td>\n",
|
|
361 " <td>",poorly_covered[i,9],"</td>\n",
|
|
362 " <td>",poorly_covered[i,10],"</td>\n",
|
|
363 " <td>",poorly_covered[i,11],"</td>\n",
|
|
364 " <td><a href=\"",linkpoor[i],"\">Show in Genome Browser</a></td>\n",
|
|
365 " </tr>\n",sep=""),append=TRUE)
|
|
366 }
|
|
367
|
|
368 cat(file=outfile,"</table><br/>\n",append=TRUE)
|
|
369 }
|
|
370
|
|
371 cat(file=outfile,paste("<h2>Highly Covered Genes (Cutoff: ",high,"x)</h2>\n",sep=""),append=TRUE)
|
|
372 if(nrow(highly_covered)==0){
|
|
373 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
|
|
374 } else{
|
|
375 cat(file=outfile,paste(
|
|
376 "<table class=\"output\">\n",
|
|
377 " <tr>\n",
|
|
378 " <td><b>Target</b></td><td><b>Gene</b></td><td><b>Coverage Mean</b></td>",
|
|
379 " <td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td>",
|
|
380 " <td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>External Link</b></td>\n",
|
|
381 " </tr>\n",sep=""),append=TRUE)
|
|
382 for(i in 1:nrow(highly_covered)){
|
|
383 cat(file=outfile,paste(" <tr>\n",
|
|
384 " <td>",highly_covered[i,1],":",highly_covered[i,2],"-",highly_covered[i,3],"</td>\n",
|
|
385 " <td>",highly_covered[i,4],"</td>\n",
|
|
386 " <td>",highly_covered[i,5],"</td>\n",
|
|
387 " <td>",highly_covered[i,7],"</td>\n",
|
|
388 " <td>",highly_covered[i,8],"</td>\n",
|
|
389 " <td>",highly_covered[i,9],"</td>\n",
|
|
390 " <td>",highly_covered[i,10],"</td>\n",
|
|
391 " <td>",highly_covered[i,11],"</td>\n",
|
|
392 " <td><a href=\"",linkhigh[i],"\">Show in Genome Browser</a></td>\n",
|
|
393 " </tr>",sep=""),append=TRUE)
|
|
394 }
|
|
395
|
|
396 cat(file=outfile,"</table>\n",append=TRUE)
|
|
397 }
|
|
398 cat(file=outfile,"<br/>BED file used for specification of target regions:<br/>",targetfile,append=TRUE)
|
|
399 cat(file=outfile,"</body>\n</html>\n",append=TRUE)
|
|
400
|
|
401 cat("ready.\n")
|