comparison NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip

Uploaded
author pfrommolt
date Mon, 21 Nov 2011 08:12:19 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:89ad0a9cca52
1 #!/usr/bin/Rscript
2
3 infile=as.character(commandArgs()[6])
4 outdir=as.character(commandArgs()[7])
5 poor=as.numeric(commandArgs()[8])
6 high=as.numeric(commandArgs()[9])
7
8
9 outfile=paste(outdir,"/summary.html",sep="")
10 cutoff=c(poor,high)
11
12 xmlfiles<-bedfiles<-character(0)
13 indirs<-as.character(readLines(infile))
14 for(thisdir in indirs){
15 thisdir=paste(thisdir,"/data",sep="")
16 xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE))
17 bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE))
18 }
19 nsamples<-length(indirs)
20
21 samplenames<-character(0)
22 diffs<-matrix(nrow=5,ncol=nsamples)
23 averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0)
24 sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0)
25
26
27 ##Read XML input files
28 xmltag<-function(line){
29 return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1])
30 }
31 i=1
32 nextfold=0
33 for(file in xmlfiles){
34 numreads<-numeric(0)
35 inlines<-readLines(file)
36 for(line in inlines){
37 if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))}
38 if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))}
39 if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))}
40 if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))}
41 if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))}
42 if(length(grep("<from1x>",line)>0)){nextfold=1}
43 else{
44 if((nextfold==1) && length(grep("PercBases",line))>0){
45 sample1<-c(sample1,as.numeric(xmltag(line)))
46 nextfold=0
47 }
48 }
49 if(length(grep("<from5x>",line)>0)){nextfold=5}
50 else{
51 if((nextfold==5) && length(grep("PercBases",line))>0){
52 sample5<-c(sample5,as.numeric(xmltag(line)))
53 nextfold=0
54 }
55 }
56 if(length(grep("<from10x>",line)>0)){nextfold=10}
57 else{
58 if((nextfold==10) && length(grep("PercBases",line))>0){
59 sample10<-c(sample10,as.numeric(xmltag(line)))
60 nextfold=0
61 }
62 }
63 if(length(grep("<from20x>",line)>0)){nextfold=20}
64 else{
65 if((nextfold==20) && length(grep("PercBases",line))>0){
66 sample20<-c(sample20,as.numeric(xmltag(line)))
67 nextfold=0
68 }
69 }
70 if(length(grep("<from30x>",line)>0)){nextfold=30}
71 else{
72 if((nextfold==30) && length(grep("PercBases",line))>0){
73 sample30<-c(sample30,as.numeric(xmltag(line)))
74 nextfold=0
75 }
76 }
77 }
78 numreads_total<-c(numreads_total,numreads[1])
79 numreads_target<-c(numreads_target,numreads[2])
80 diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i])
81 i=i+1
82 }
83
84
85 ##Create barplot
86 rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x")
87 plot_samplenames<-samplenames
88 s=0
89 for(sname in plot_samplenames){
90 s=s+1
91 if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")}
92 }
93 colnames(diffs)<-plot_samplenames
94 png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400)
95 par(oma=c(1,0,0,1))
96 barplot(diffs,
97 legend=TRUE,
98 col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"),
99 xlim=c(0,4+nsamples),ylim=c(0,100),
100 ylab="Percentage",las=2 ) #,
101 # args.legend=list(x=nsamples+4,y=80))
102 dev.off()
103
104
105
106 ##Read BED files
107 sametarget=TRUE
108 thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE)
109 genes<-levels(as.factor(thisbed$V4))
110 meancov<-matrix(nrow=length(genes),ncol=length(bedfiles))
111 chr<-start<-end<-character(0)
112
113 i=1
114 for(file in bedfiles){
115 thisbed<-read.table(file,stringsAsFactors=FALSE)
116 genes_thissample<-levels(as.factor(thisbed$V4))
117 if(length(genes_thissample)!=length(genes)){sametarget=FALSE}
118 for(j in 1:length(genes)){
119 thisgene<-thisbed[thisbed$V4==genes[j],]
120 chr[j]<-thisgene[1,1]
121 start[j]<-thisgene[1,2]
122 end[j]<-thisgene[1,3]
123 meancov[j,i]<-as.numeric(mean(thisgene[,5]))
124 }
125 i=i+1
126 }
127
128 meancov<-data.frame(chr,start,end,genes,meancov)
129 for(i in 1:nrow(meancov)){
130 print(i)
131 meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4)
132 meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4)
133 meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4)
134 meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4)
135 }
136 meancov_sorted<-meancov[order(meancov$ave),]
137 poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],]
138
139
140 ##Write HTML output
141 cat(file=outfile,paste(
142 "<html>\n",
143 "<head>\n",
144 "<title>Enrichment Performance</title>\n",
145 "<style type=\"text/css\">\n",
146 "body{font-family:sans-serif;}\n",
147 "h2,h3{color: darkblue;}\n",
148 "a{color: darkblue;}\n",
149 " table.output td{\n",
150 " padding: 4px; background-color: lightskyblue;\n",
151 " border: 1px solid #000; border-color: darkblue;\n",
152 "}\n",
153 "</Style>\n",
154 "</head>\n",
155 "<body>\n",
156 "<h2>Target Enrichment Performance</h2><br/>\n",
157 "<table class=\"output\">\n",
158 " <tr>\n",
159 " <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n",
160 "</tr>\n",sep=""))
161 for(i in 1:nsamples){
162 cat(file=outfile,paste(
163 "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>",
164 "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>",
165 "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>",
166 "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>",
167 "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n",
168 sep=""),append=TRUE)
169 }
170 cat(file=outfile,paste(
171 "</table>\n",
172 "<br/><br/>\n",
173 "<h2>Target Base Percentage Covered</h2>\n",
174 "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE)
175 if(!sametarget){
176 warning("Target regions are not the same for all samples.")
177 } else{
178 cat(file=outfile,paste(
179 "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE)
180 if(nrow(poorly_all)==0){
181 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
182 } else{
183 cat(file=outfile,paste(
184 "<table class=\"output\">\n",
185 "<tr>\n",
186 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
187 for(i in 1:nrow(poorly_all)){
188 cat(file=outfile,paste(
189 "<tr>\n",
190 "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n",
191 "<td>",poorly_all$genes[i],"</td>\n",
192 "<td>",poorly_all$ave[i],"</td>\n",
193 "<td>",poorly_all$stddev[i],"</td>\n",
194 "<tr>\n",sep=""),append=TRUE)
195 }
196 cat(file=outfile,"</table>\n",append=TRUE)
197 }
198
199 meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered
200 highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],]
201 cat(file=outfile,paste(
202 "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE)
203 if(nrow(highly_all)==0){
204 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE)
205 } else{
206 cat(file=outfile,paste("<table class=\"output\">\n",
207 "<tr>\n",
208 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE)
209 for(i in 1:nrow(highly_all)){
210 cat(file=outfile,paste(
211 "<tr>\n",
212 "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n",
213 "<td>",highly_all$genes[i],"</td>\n",
214 "<td>",highly_all$ave[i],"</td>\n",
215 "<td>",highly_all$stddev[i],"</td>\n",
216 "<tr>\n",sep=""),append=TRUE)
217 }
218 cat(file=outfile,"</table>\n",append=TRUE)
219 }
220
221 }
222
223 cat(file=outfile,"</body>\n</html>\n",append=TRUE)