Mercurial > repos > pfrommolt > ngsrich
comparison NGSrich_0.5.5/R/summarize_enrichment.R @ 0:89ad0a9cca52 default tip
Uploaded
author | pfrommolt |
---|---|
date | Mon, 21 Nov 2011 08:12:19 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:89ad0a9cca52 |
---|---|
1 #!/usr/bin/Rscript | |
2 | |
3 infile=as.character(commandArgs()[6]) | |
4 outdir=as.character(commandArgs()[7]) | |
5 poor=as.numeric(commandArgs()[8]) | |
6 high=as.numeric(commandArgs()[9]) | |
7 | |
8 | |
9 outfile=paste(outdir,"/summary.html",sep="") | |
10 cutoff=c(poor,high) | |
11 | |
12 xmlfiles<-bedfiles<-character(0) | |
13 indirs<-as.character(readLines(infile)) | |
14 for(thisdir in indirs){ | |
15 thisdir=paste(thisdir,"/data",sep="") | |
16 xmlfiles=c(xmlfiles,list.files(thisdir,pattern=".xml",full.names=TRUE)) | |
17 bedfiles=c(bedfiles,list.files(thisdir,pattern=".bed",full.names=TRUE)) | |
18 } | |
19 nsamples<-length(indirs) | |
20 | |
21 samplenames<-character(0) | |
22 diffs<-matrix(nrow=5,ncol=nsamples) | |
23 averagecov<-stddevcov<-targetsize<-numreads_total<-numreads_target<-numeric(0) | |
24 sample1<-sample5<-sample10<-sample20<-sample30<-numeric(0) | |
25 | |
26 | |
27 ##Read XML input files | |
28 xmltag<-function(line){ | |
29 return(strsplit(strsplit(line,">")[[1]][2],"</")[[1]][1]) | |
30 } | |
31 i=1 | |
32 nextfold=0 | |
33 for(file in xmlfiles){ | |
34 numreads<-numeric(0) | |
35 inlines<-readLines(file) | |
36 for(line in inlines){ | |
37 if(length(grep("SampleName",line)>0)){samplenames<-c(samplenames,xmltag(line))} | |
38 if(length(grep("NumberReads",line)>0)){numreads<-c(numreads,as.numeric(xmltag(line)))} | |
39 if(length(grep("AvTargetCoverage",line)>0)){averagecov=c(averagecov,as.numeric(xmltag(line)))} | |
40 if(length(grep("SDTargetCoverage",line)>0)){stddevcov=c(stddevcov,as.numeric(xmltag(line)))} | |
41 if(length(grep("TargetSize",line)>0)){targetsize=as.numeric(xmltag(line))} | |
42 if(length(grep("<from1x>",line)>0)){nextfold=1} | |
43 else{ | |
44 if((nextfold==1) && length(grep("PercBases",line))>0){ | |
45 sample1<-c(sample1,as.numeric(xmltag(line))) | |
46 nextfold=0 | |
47 } | |
48 } | |
49 if(length(grep("<from5x>",line)>0)){nextfold=5} | |
50 else{ | |
51 if((nextfold==5) && length(grep("PercBases",line))>0){ | |
52 sample5<-c(sample5,as.numeric(xmltag(line))) | |
53 nextfold=0 | |
54 } | |
55 } | |
56 if(length(grep("<from10x>",line)>0)){nextfold=10} | |
57 else{ | |
58 if((nextfold==10) && length(grep("PercBases",line))>0){ | |
59 sample10<-c(sample10,as.numeric(xmltag(line))) | |
60 nextfold=0 | |
61 } | |
62 } | |
63 if(length(grep("<from20x>",line)>0)){nextfold=20} | |
64 else{ | |
65 if((nextfold==20) && length(grep("PercBases",line))>0){ | |
66 sample20<-c(sample20,as.numeric(xmltag(line))) | |
67 nextfold=0 | |
68 } | |
69 } | |
70 if(length(grep("<from30x>",line)>0)){nextfold=30} | |
71 else{ | |
72 if((nextfold==30) && length(grep("PercBases",line))>0){ | |
73 sample30<-c(sample30,as.numeric(xmltag(line))) | |
74 nextfold=0 | |
75 } | |
76 } | |
77 } | |
78 numreads_total<-c(numreads_total,numreads[1]) | |
79 numreads_target<-c(numreads_target,numreads[2]) | |
80 diffs[,i]<-c(sample30[i],sample20[i]-sample30[i],sample10[i]-sample20[i],sample5[i]-sample10[i],sample1[i]-sample5[i]) | |
81 i=i+1 | |
82 } | |
83 | |
84 | |
85 ##Create barplot | |
86 rownames(diffs)<-c("Covered 30x","Covered 20x","Covered 10x","Covered 5x","Covered 1x") | |
87 plot_samplenames<-samplenames | |
88 s=0 | |
89 for(sname in plot_samplenames){ | |
90 s=s+1 | |
91 if(nchar(sname)>10){plot_samplenames[s]<-paste("Sample ",s,"\n(table above)",sep="")} | |
92 } | |
93 colnames(diffs)<-plot_samplenames | |
94 png(file=paste(outdir,"/summary_plot.png",sep=""),width=66*(4+nsamples),height=400) | |
95 par(oma=c(1,0,0,1)) | |
96 barplot(diffs, | |
97 legend=TRUE, | |
98 col=c("darkblue","steelblue1","tomato1","tomato3","tomato4"), | |
99 xlim=c(0,4+nsamples),ylim=c(0,100), | |
100 ylab="Percentage",las=2 ) #, | |
101 # args.legend=list(x=nsamples+4,y=80)) | |
102 dev.off() | |
103 | |
104 | |
105 | |
106 ##Read BED files | |
107 sametarget=TRUE | |
108 thisbed<-read.table(bedfiles[1],stringsAsFactors=FALSE) | |
109 genes<-levels(as.factor(thisbed$V4)) | |
110 meancov<-matrix(nrow=length(genes),ncol=length(bedfiles)) | |
111 chr<-start<-end<-character(0) | |
112 | |
113 i=1 | |
114 for(file in bedfiles){ | |
115 thisbed<-read.table(file,stringsAsFactors=FALSE) | |
116 genes_thissample<-levels(as.factor(thisbed$V4)) | |
117 if(length(genes_thissample)!=length(genes)){sametarget=FALSE} | |
118 for(j in 1:length(genes)){ | |
119 thisgene<-thisbed[thisbed$V4==genes[j],] | |
120 chr[j]<-thisgene[1,1] | |
121 start[j]<-thisgene[1,2] | |
122 end[j]<-thisgene[1,3] | |
123 meancov[j,i]<-as.numeric(mean(thisgene[,5])) | |
124 } | |
125 i=i+1 | |
126 } | |
127 | |
128 meancov<-data.frame(chr,start,end,genes,meancov) | |
129 for(i in 1:nrow(meancov)){ | |
130 print(i) | |
131 meancov$ave[i]<-round(mean(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
132 meancov$stddev[i]<-round(sd(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
133 meancov$mini[i]<-round(min(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
134 meancov$maxi[i]<-round(max(as.numeric(meancov[i,4+(1:nsamples)])),4) | |
135 } | |
136 meancov_sorted<-meancov[order(meancov$ave),] | |
137 poorly_all<-meancov_sorted[meancov_sorted$max<cutoff[1],] | |
138 | |
139 | |
140 ##Write HTML output | |
141 cat(file=outfile,paste( | |
142 "<html>\n", | |
143 "<head>\n", | |
144 "<title>Enrichment Performance</title>\n", | |
145 "<style type=\"text/css\">\n", | |
146 "body{font-family:sans-serif;}\n", | |
147 "h2,h3{color: darkblue;}\n", | |
148 "a{color: darkblue;}\n", | |
149 " table.output td{\n", | |
150 " padding: 4px; background-color: lightskyblue;\n", | |
151 " border: 1px solid #000; border-color: darkblue;\n", | |
152 "}\n", | |
153 "</Style>\n", | |
154 "</head>\n", | |
155 "<body>\n", | |
156 "<h2>Target Enrichment Performance</h2><br/>\n", | |
157 "<table class=\"output\">\n", | |
158 " <tr>\n", | |
159 " <td><b>Sample</b></td><td><b># Reads</b></td><td><b># On Target</b></td><td><b>Coverage Mean</b></td><td><b>Coverage Std Dev</b></td><td><b>Covered 1x</b></td><td><b>Covered 5x</b></td><td><b>Covered 10x</b></td><td><b>Covered 20x</b></td><td><b>Covered 30x</b></td><td><b>Details</b></td>\n", | |
160 "</tr>\n",sep="")) | |
161 for(i in 1:nsamples){ | |
162 cat(file=outfile,paste( | |
163 "<tr><td>",samplenames[i],"</td><td>",numreads_total[i],"</td>", | |
164 "<td>",numreads_target[i],"</td><td>",averagecov[i],"</td>", | |
165 "<td>",stddevcov[i],"</td><td>",sample1[i],"</td><td>",sample5[i],"</td>", | |
166 "<td>",sample10[i],"</td><td>",sample20[i],"</td><td>",sample30[i],"</td>", | |
167 "<td><a href=\"",rev(strsplit(indirs[i],"/")[[1]])[1],"/",samplenames[i],"_enrichment.html\">Show</a></td></tr>\n", | |
168 sep=""),append=TRUE) | |
169 } | |
170 cat(file=outfile,paste( | |
171 "</table>\n", | |
172 "<br/><br/>\n", | |
173 "<h2>Target Base Percentage Covered</h2>\n", | |
174 "<img src=\"summary_plot.png\"></img>\n",sep=""),append=TRUE) | |
175 if(!sametarget){ | |
176 warning("Target regions are not the same for all samples.") | |
177 } else{ | |
178 cat(file=outfile,paste( | |
179 "<h2>Poorly Covered in All Samples (Cutoff: ",cutoff[1],"x)</h2>\n",sep=""),append=TRUE) | |
180 if(nrow(poorly_all)==0){ | |
181 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
182 } else{ | |
183 cat(file=outfile,paste( | |
184 "<table class=\"output\">\n", | |
185 "<tr>\n", | |
186 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Samples</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) | |
187 for(i in 1:nrow(poorly_all)){ | |
188 cat(file=outfile,paste( | |
189 "<tr>\n", | |
190 "<td>",poorly_all$chr[i],":",poorly_all$start[i],"-",poorly_all$end[i],"</td>\n", | |
191 "<td>",poorly_all$genes[i],"</td>\n", | |
192 "<td>",poorly_all$ave[i],"</td>\n", | |
193 "<td>",poorly_all$stddev[i],"</td>\n", | |
194 "<tr>\n",sep=""),append=TRUE) | |
195 } | |
196 cat(file=outfile,"</table>\n",append=TRUE) | |
197 } | |
198 | |
199 meancov_sorted<-meancov[order(meancov$ave,decreasing=TRUE),] ##Highly Covered | |
200 highly_all<-meancov_sorted[meancov_sorted$min>cutoff[2],] | |
201 cat(file=outfile,paste( | |
202 "<h2>Highly Covered in All Samples (Cutoff: ",cutoff[2],"x)</h2>\n",sep=""),append=TRUE) | |
203 if(nrow(highly_all)==0){ | |
204 cat(file=outfile,"<p>Nothing found for this cutoff.</p>",append=TRUE) | |
205 } else{ | |
206 cat(file=outfile,paste("<table class=\"output\">\n", | |
207 "<tr>\n", | |
208 "<td><b>Target Region</b></td><td><b>Gene</b></td><td><b>Mean Across Sample</b></td><td><b>Std Dev Across Samples</b></td>\n",sep=""),append=TRUE) | |
209 for(i in 1:nrow(highly_all)){ | |
210 cat(file=outfile,paste( | |
211 "<tr>\n", | |
212 "<td>",highly_all$chr[i],":",highly_all$start[i],"-",highly_all$end[i],"</td>\n", | |
213 "<td>",highly_all$genes[i],"</td>\n", | |
214 "<td>",highly_all$ave[i],"</td>\n", | |
215 "<td>",highly_all$stddev[i],"</td>\n", | |
216 "<tr>\n",sep=""),append=TRUE) | |
217 } | |
218 cat(file=outfile,"</table>\n",append=TRUE) | |
219 } | |
220 | |
221 } | |
222 | |
223 cat(file=outfile,"</body>\n</html>\n",append=TRUE) |