Mercurial > repos > geert-vandeweyer > coverage_report
annotate CoverageReport.pl @ 12:86df3f847a72 draft
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
author | geert-vandeweyer |
---|---|
date | Thu, 20 Feb 2014 08:57:09 -0500 |
parents | 2936bcb2a378 |
children | 95062840f80f |
rev | line source |
---|---|
1 | 1 #!/usr/bin/perl |
2 | |
3 # load modules | |
4 use Getopt::Std; | |
5 use File::Basename; | |
6 use Number::Format; | |
7 | |
8 # number format | |
9 my $de = new Number::Format(-thousands_sep =>',',-decimal_point => '.'); | |
10 | |
11 ########## | |
12 ## opts ## | |
13 ########## | |
14 ## input files | |
15 # b : path to input (b)am file | |
16 # t : path to input (t)arget regions in BED format | |
17 ## output files | |
18 # o : report pdf (o)utput file | |
19 # z : all plots and tables in tar.g(z) format | |
20 ## entries in the report | |
21 # r : Coverage per (r)egion (boolean) | |
22 # s : (s)ubregion coverage if average < specified (plots for positions along target region) (boolean) | |
23 # S : (S)ubregion coverage for ALL failed exons => use either s OR S or you will have double plots. | |
24 # A : (A)ll exons will be plotted. | |
25 # L : (L)ist failed exons instead of plotting | |
26 # m : (m)inimal Coverage threshold | |
27 # f : fraction of average as threshold | |
28 # n : sample (n)ame. | |
29 | |
30 | |
31 getopts('b:t:o:z:rsSALm:n:f:', \%opts) ; | |
32 | |
33 # make output directory in (tmp) working dir | |
34 our $wd = "/tmp/Coverage.".int(rand(1000)); | |
35 while (-d $wd) { | |
36 $wd = "/tmp/Coverage.".int(rand(1000)); | |
37 } | |
38 system("mkdir $wd"); | |
39 | |
40 ## variables | |
41 our %commandsrun = (); | |
42 | |
43 if (!exists($opts{'b'}) || !-e $opts{'b'}) { | |
44 die('Bam File not found'); | |
45 } | |
46 if (!exists($opts{'t'}) || !-e $opts{'t'}) { | |
47 die('Target File (BED) not found'); | |
48 } | |
49 | |
50 if (exists($opts{'m'})) { | |
51 $thresh = $opts{'m'}; | |
52 } | |
53 else { | |
54 $thresh = 40; | |
55 } | |
56 | |
57 if (exists($opts{'f'})) { | |
58 $frac = $opts{'f'}; | |
59 } | |
60 else { | |
61 $frac = 0.2; | |
62 } | |
63 | |
64 if (exists($opts{'o'})) { | |
65 $pdffile = $opts{'o'}; | |
66 } | |
67 else { | |
68 $pdffile = "$wd/CoverageReport.pdf"; | |
69 } | |
70 | |
71 if (exists($opts{'z'})) { | |
72 $tarfile = $opts{'z'}; | |
73 } | |
74 else { | |
75 $tarfile = "$wd/Results.tar.gz"; | |
76 } | |
77 | |
78 # 1. Global Summary => default | |
79 &GlobalSummary($opts{'b'}, $opts{'t'}); | |
80 | |
81 # 2. Coverage per position | |
82 &SubRegionCoverage($opts{'b'}, $opts{'t'}); | |
83 our %filehash; | |
84 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { | |
85 system("mkdir $wd/SplitFiles"); | |
86 ## get position coverages | |
87 ## split input files | |
88 open IN, "$wd/Targets.Position.Coverage"; | |
89 my $fileidx = 0; | |
90 my $currreg = ''; | |
91 while (<IN>) { | |
92 my $line = $_; | |
93 chomp($line); | |
94 my @p = split(/\t/,$line); | |
95 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3]; | |
96 my $ex = $p[3]; | |
97 if ($reg ne $currreg) { | |
98 ## new exon open new outfile | |
99 if ($currreg ne '') { | |
100 ## filehandle is open. close it | |
101 close OUT; | |
102 } | |
103 if (!exists($filehash{$reg})) { | |
104 $fileidx++; | |
105 $filehash{$reg}{'idx'} = $fileidx; | |
106 $filehash{$reg}{'exon'} = $ex; | |
107 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; | |
108 $currreg = $reg; | |
109 } | |
110 else { | |
111 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; | |
112 $currreg = $reg; | |
113 } | |
114 } | |
115 ## print the line to the open filehandle. | |
116 print OUT "$line\n"; | |
117 } | |
118 close OUT; | |
119 close IN; | |
120 | |
121 } | |
122 | |
123 ## sort output files according to targets file | |
124 if (exists($opts{'r'}) ) { | |
125 my %hash = (); | |
126 open IN, "$wd/Targets.Global.Coverage"; | |
127 while (<IN>) { | |
128 my @p = split(/\t/,$_) ; | |
129 $hash{$p[3]} = $_; | |
130 } | |
131 close IN; | |
132 open OUT, ">$wd/Targets.Global.Coverage"; | |
133 open IN, $opts{'t'}; | |
134 while (<IN>) { | |
135 my @p = split(/\t/,$_) ; | |
136 print OUT $hash{$p[3]}; | |
137 } | |
138 close IN; | |
139 close OUT; | |
140 } | |
141 | |
142 | |
143 #################################### | |
144 ## PROCESS RESULTS & CREATE PLOTS ## | |
145 #################################### | |
146 system("mkdir $wd/Report"); | |
147 | |
148 system("mkdir $wd/Rout"); | |
149 system("mkdir $wd/Plots"); | |
150 | |
151 $samplename = $opts{'n'}; | |
152 $samplename =~ s/_/\\_/g; | |
153 | |
154 # 0. Preamble | |
155 ## compose preamble | |
156 open OUT, ">$wd/Report/Report.tex"; | |
157 print OUT '\documentclass[a4paper,10pt]{article}'."\n"; | |
158 print OUT '\usepackage[left=2cm,top=1.5cm,right=1.5cm,bottom=2.5cm,nohead]{geometry}'."\n"; | |
159 print OUT '\usepackage{longtable}'."\n"; | |
160 print OUT '\usepackage[T1]{fontenc}'."\n"; | |
161 print OUT '\usepackage{fancyhdr}'."\n"; | |
162 print OUT '\usepackage[latin9]{inputenc}'."\n"; | |
163 print OUT '\usepackage{color}'."\n"; | |
164 print OUT '\usepackage[pdftex]{graphicx}'."\n"; | |
165 print OUT '\definecolor{grey}{RGB}{160,160,160}'."\n"; | |
166 print OUT '\definecolor{darkgrey}{RGB}{100,100,100}'."\n"; | |
167 print OUT '\definecolor{red}{RGB}{255,0,0}'."\n"; | |
168 print OUT '\definecolor{orange}{RGB}{238,118,0}'."\n"; | |
169 print OUT '\setlength\LTleft{0pt}'."\n"; | |
170 print OUT '\setlength\LTright{0pt}'."\n"; | |
171 print OUT '\begin{document}'."\n"; | |
172 print OUT '\pagestyle{fancy}'."\n"; | |
173 print OUT '\fancyhead{}'."\n"; | |
174 print OUT '\renewcommand{\footrulewidth}{0.4pt}'."\n"; | |
175 print OUT '\renewcommand{\headrulewidth}{0pt}'."\n"; | |
176 print OUT '\fancyfoot[R]{\today\hspace{2cm}\thepage\ of \pageref{endofdoc}}'."\n"; | |
177 print OUT '\fancyfoot[C]{}'."\n"; | |
178 print OUT '\fancyfoot[L]{Coverage Report for ``'.$samplename.'"}'."\n"; | |
179 print OUT '\let\oldsubsubsection=\subsubsection'."\n"; | |
180 print OUT '\renewcommand{\subsubsection}{%'."\n"; | |
181 print OUT ' \filbreak'."\n"; | |
182 print OUT ' \oldsubsubsection'."\n"; | |
183 print OUT '}'."\n"; | |
184 # main title | |
185 print OUT '\section*{Coverage Report for ``'.$samplename.'"}'."\n"; | |
186 close OUT; | |
187 | |
188 # 1. Summary Report | |
189 # Get samtools flagstat summary of BAM file | |
190 my $flagstat = `samtools flagstat $opts{'b'}`; | |
191 my @s = split(/\n/,$flagstat); | |
192 # Get number of reads mapped in total | |
193 ## updated on 2012-10-1 !! | |
194 $totalmapped = $s[2]; | |
195 $totalmapped =~ s/^(\d+)(\s.+)/$1/; | |
196 # count columns | |
197 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | |
198 chomp($head); | |
199 my @cols = split(/\t/,$head); | |
200 my $nrcols = scalar(@cols); | |
201 my $covcol = $nrcols - 3; | |
202 # get min/max/median/average coverage => values | |
203 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`; | |
204 my @coverages = split(/\n/,$covs); | |
205 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); | |
206 my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); | |
207 # get min/max/median/average coverage => boxplot in R | |
208 open OUT, ">$wd/Rout/boxplot.R"; | |
209 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | |
210 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; | |
211 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480)'."\n"; | |
212 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; | |
213 print OUT 'graphics.off()'."\n"; | |
214 close OUT; | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
215 system("cd $wd/Rout && Rscript boxplot.R"); |
1 | 216 |
217 ## global nt coverage plot | |
218 ## use perl to make histogram (lower memory) | |
219 open IN, "$wd/Targets.Position.Coverage"; | |
220 my %dens; | |
221 my $counter = 0; | |
222 my $sum = 0; | |
223 while (<IN>) { | |
224 chomp(); | |
225 my @p = split(/\t/); | |
226 $sum += $p[-1]; | |
227 $counter++; | |
228 if (defined($dens{$p[-1]})) { | |
229 $dens{$p[-1]}++; | |
230 } | |
231 else { | |
232 $dens{$p[-1]} = 1; | |
233 } | |
234 } | |
235 $avg = $sum/$counter; | |
236 close IN; | |
237 open OUT, ">$wd/Rout/hist.txt"; | |
3 | 238 if (!defined($dens{'0'})) { |
239 $dens{'0'} = 0; | |
240 } | |
1 | 241 foreach (keys(%dens)) { |
242 print OUT "$_;$dens{$_}\n"; | |
243 } | |
244 close OUT; | |
245 open OUT, ">$wd/Rout/ntplot.R"; | |
246 # read coverage hist in R to plot | |
247 print OUT 'coverage <- read.table("hist.txt" , as.is = TRUE, header=FALSE,sep=";")'."\n"; | |
248 print OUT 'mincov <- '."$thresh \n"; | |
249 print OUT "avg <- round($avg)\n"; | |
250 print OUT "colnames(coverage) <- c('cov','count')\n"; | |
251 print OUT 'coverage$cov <- coverage$cov / avg'."\n"; | |
252 print OUT 'rep <- which(coverage$cov > 1)'."\n"; | |
253 print OUT 'coverage[coverage$cov > 1,1] <- 1'."\n"; | |
254 print OUT 'values <- coverage[coverage$cov < 1,]'."\n"; | |
255 print OUT 'values <- rbind(values,c(1,sum(coverage[coverage$cov == 1,"count"])))'."\n"; | |
256 print OUT 'values <- values[order(values$cov),]'."\n"; | |
257 print OUT 'prevcount <- 0'."\n"; | |
258 # make cumulative count data frame | |
259 print OUT 'for (i in rev(values$cov)) {'."\n"; | |
260 print OUT ' values[values$cov == i,"count"] <- prevcount + values[values$cov == i,"count"]'."\n"; | |
261 print OUT ' prevcount <- values[values$cov == i,"count"]'."\n"; | |
262 print OUT '}'."\n"; | |
263 print OUT 'values$count <- values$count / (values[values$cov == 0,"count"] / 100)'."\n"; | |
264 # get some values to plot lines. | |
265 print OUT 'mincov.x <- mincov/avg'."\n"; | |
266 print OUT 'if (mincov/avg <= 1) {'."\n"; | |
267 print OUT ' ii <- which(values$cov == mincov.x)'."\n"; | |
268 print OUT ' if (length(ii) == 1) {'."\n"; | |
269 print OUT ' mincov.y <- values[ii[1],"count"]'."\n"; | |
270 print OUT ' } else {'."\n"; | |
271 print OUT ' i1 <- max(which(values$cov < mincov.x))'."\n"; | |
272 print OUT ' i2 <- min(which(values$cov > mincov.x))'."\n"; | |
273 print OUT ' mincov.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(mincov.x - values[i1,"cov"]) + values[i1,"count"]'."\n"; | |
274 print OUT ' }'."\n"; | |
275 print OUT '}'."\n"; | |
276 # open output image and create plot | |
277 print OUT 'png(file="../Plots/CoverageNtPlot.png", bg="white", width=540, height=480)'."\n"; | |
278 print OUT 'par(xaxs="i",yaxs="i")'."\n"; | |
279 print OUT 'plot(values$cov,values$count,ylim=c(0,100),pch=".",main="Cumulative Normalised Base-Coverage Plot",xlab="Normalizalised Coverage",ylab="Cumulative Nr. Of Bases")'."\n"; | |
280 print OUT 'lines(values$cov,values$count)'."\n"; | |
281 print OUT 'if (mincov.x <= 1) {'."\n"; | |
282 print OUT ' lines(c(mincov.x,mincov.x),c(0,mincov.y),lty=2,col="darkgreen")'."\n"; | |
283 print OUT ' lines(c(0,mincov.x),c(mincov.y,mincov.y),lty=2,col="darkgreen")'."\n"; | |
284 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold: '.$thresh.'x")'."\n"; | |
285 print OUT ' text(1,(91),pos=2,col="darkgreen",labels=paste("%Bases: ",round(mincov.y,2),"%",sep=""))'."\n"; | |
286 print OUT '} else {'."\n"; | |
287 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold ('.$thresh.'x) > Average")'."\n"; | |
288 print OUT ' text(1,(91),pos=2,col="darkgreen",labels="Plotting impossible")'."\n"; | |
289 print OUT '}'."\n"; | |
290 print OUT 'frac.x <- '."$frac\n"; | |
291 print OUT 'ii <- which(values$cov == frac.x)'."\n"; | |
292 print OUT 'if (length(ii) == 1) {'."\n"; | |
293 print OUT ' frac.y <- values[ii[1],"count"]'."\n"; | |
294 print OUT '} else {'."\n"; | |
295 print OUT ' i1 <- max(which(values$cov < frac.x))'."\n"; | |
296 print OUT ' i2 <- min(which(values$cov > frac.x))'."\n"; | |
297 print OUT ' frac.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(frac.x - values[i1,"cov"]) + values[i1,"count"]'."\n"; | |
298 print OUT '}'."\n"; | |
299 print OUT 'lines(c(frac.x,frac.x),c(0,frac.y),lty=2,col="red")'."\n"; | |
300 print OUT 'lines(c(0,frac.x),c(frac.y,frac.y),lty=2,col="red")'."\n"; | |
301 #iprint OUT 'text((frac.x+0.05),(frac.y - 2),pos=4,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n"; | |
302 #print OUT 'text((frac.x+0.05),(frac.y-5),pos=4,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; | |
303 print OUT 'text(1,86,pos=2,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n"; | |
304 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; | |
305 | |
306 print OUT 'graphics.off()'."\n"; | |
307 | |
308 close OUT; | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
309 system("cd $wd/Rout && Rscript ntplot.R"); |
1 | 310 ## PRINT TO .TEX FILE |
311 open OUT, ">>$wd/Report/Report.tex"; | |
312 # average coverage overviews | |
313 print OUT '\subsection*{Overall Summary}'."\n"; | |
314 print OUT '{\small '; | |
315 # left : boxplot | |
316 print OUT '\begin{minipage}{0.3\linewidth}\centering'."\n"; | |
317 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageBoxPlot.png}'."\n"; | |
318 print OUT '\end{minipage}'."\n"; | |
319 # right : cum.cov.plot | |
320 print OUT '\hspace{0.6cm}'."\n"; | |
321 print OUT '\begin{minipage}{0.65\linewidth}\centering'."\n"; | |
322 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageNtPlot.png}'."\n"; | |
323 print OUT '\end{minipage} \\\\'."\n"; | |
324 ## next line | |
325 print OUT '\begin{minipage}{0.48\linewidth}'."\n"; | |
326 print OUT '\vspace{-1.2em}'."\n"; | |
327 print OUT '\begin{tabular}{ll}'."\n"; | |
328 # bam statistics | |
329 print OUT '\multicolumn{2}{l}{\textbf{\underline{Samtools Flagstat Summary}}} \\\\'."\n"; | |
330 foreach (@s) { | |
331 $_ =~ m/^(\d+)\s(.+)$/; | |
332 my $one = $1; | |
333 my $two = $2; | |
334 $two =~ s/\s\+\s0\s//; | |
335 $two = ucfirst($two); | |
336 $one =~ s/%/\\%/g; | |
337 # remove '+ 0 ' from front | |
338 $two =~ s/\+\s0\s//; | |
339 # remove trailing from end | |
340 $two =~ s/(\s\+.*)|(:.*)/\)/; | |
341 $two =~ s/%/\\%/g; | |
342 $two =~ s/>=/\$\\ge\$/g; | |
343 $two = ucfirst($two); | |
344 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; | |
345 } | |
346 print OUT '\end{tabular}\end{minipage}'."\n"; | |
347 print OUT '\hspace{1.5cm}'."\n"; | |
348 # target coverage statistics | |
349 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; | |
350 #print OUT '\vspace{-4.8em}'."\n"; | |
351 print OUT '\begin{tabular}{ll}'."\n"; | |
352 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Region Coverage}}} \\\\'."\n"; | |
353 print OUT '\textbf{Number of Target Regions} & '.scalar(@coverages).' \\\\'."\n"; | |
354 print OUT '\textbf{Minimal Region Coverage} & '.$min.' \\\\'."\n"; | |
355 print OUT '\textbf{25\% Region Coverage} & '.$first.' \\\\'. "\n"; | |
356 print OUT '\textbf{50\% (Median) Region Coverage} & '.$med.' \\\\'. "\n"; | |
357 print OUT '\textbf{75\% Region Coverage} & '.$third.' \\\\'. "\n"; | |
358 print OUT '\textbf{Maximal Region Coverage} & '.$max.' \\\\'. "\n"; | |
359 print OUT '\textbf{Average Region Coverage} & '.int($eavg).' \\\\'. "\n"; | |
360 print OUT '\textbf{Mapped On Target} & '.$spec.' \\\\'."\n"; | |
361 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Base Coverage }}} \\\\'."\n"; | |
362 print OUT '\textbf{Number of Target Bases} & '.$counter.' \\\\'."\n"; | |
363 print OUT '\textbf{Average Base Coverage} & '.int($avg).' \\\\'. "\n"; | |
364 print OUT '\textbf{Non-Covered Bases} & '.$dens{'0'}.' \\\\'."\n"; | |
365 #print OUT '\textbf{Bases Covered $ge$ '.$frac.'xAvg.Cov} & '. | |
366 print OUT '\end{tabular}\end{minipage}}'."\n"; | |
367 close OUT; | |
368 | |
369 # 2. GLOBAL COVERAGE OVERVIEW PER GENE | |
370 @failedexons; | |
371 @allexons; | |
372 @allregions; | |
373 @failedregions; | |
374 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) { | |
375 # count columns | |
376 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | |
377 chomp($head); | |
378 my @cols = split(/\t/,$head); | |
379 my $nrcols = scalar(@cols); | |
380 my $covcol = $nrcols - 3; | |
381 # Coverage Plots for each gene => barplots in R, table here. | |
382 open IN, "$wd/Targets.Global.Coverage"; | |
383 my $currgroup = ''; | |
384 my $startline = 0; | |
385 my $stopline = 0; | |
386 $linecounter = 0; | |
387 while (<IN>) { | |
388 $linecounter++; | |
389 chomp($_); | |
390 my @c = split(/\t/,$_); | |
391 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]); | |
392 my $group = $c[3]; | |
393 ## coverage failure? | |
394 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { | |
395 push(@failedexons,$group); | |
396 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); | |
397 } | |
398 ## store exon | |
399 push(@allexons,$group); | |
400 ## extract and check gene | |
401 $group =~ s/^(\S+)[\|\s](.+)/$1/; | |
402 if ($group ne $currgroup ) { | |
403 if ($currgroup ne '') { | |
404 # new gene, make plot. | |
405 open OUT, ">$wd/Rout/barplot.R"; | |
406 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | |
407 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | |
408 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | |
409 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | |
410 print OUT 'coverage[coverage < 1] <- 1'."\n"; | |
411 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | |
412 # coverage not whole target region => orange | |
413 print OUT 'covperc <- coveragetable[c('.$startline.':'.$stopline.'),'.$nrcols.']'."\n"; | |
414 print OUT 'colors[covperc<1] <- "orange"'."\n"; | |
415 # coverage below threshold => red | |
416 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n"; | |
417 | |
418 if ($stopline - $startline > 20) { | |
419 $scale = 2; | |
420 } | |
421 else { | |
422 $scale = 1; | |
423 } | |
424 my $width = 480 * $scale; | |
425 my $height = 240 * $scale; | |
426 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n"; | |
427 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n"; | |
428 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)",ylim=ylim)'."\n"; | |
429 print OUT 'text(mp, log10(coverage) + '.(0.4/$scale).',format(coverage),xpd = TRUE,srt=90)'."\n"; | |
430 print OUT 'text(mp,par("usr")[3]-0.05,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n"; | |
431 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n"; | |
432 print OUT 'graphics.off()'."\n"; | |
433 close OUT; | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
434 system("cd $wd/Rout && Rscript barplot.R"); |
1 | 435 if ($scale == 1) { |
436 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | |
437 } | |
438 else { | |
439 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | |
440 } | |
441 | |
442 } | |
443 $currgroup = $group; | |
444 $startline = $linecounter; | |
445 } | |
446 $stopline = $linecounter; | |
447 } | |
448 close IN; | |
449 if ($currgroup ne '') { | |
450 # last gene, make plot. | |
451 open OUT, ">$wd/Rout/barplot.R"; | |
452 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | |
453 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | |
454 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | |
455 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | |
456 print OUT 'coverage[coverage < 1] <- 1'."\n"; | |
457 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | |
458 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n"; | |
459 | |
460 if ($stopline - $startline > 20) { | |
461 $scale = 2; | |
462 } | |
463 else { | |
464 $scale = 1; | |
465 } | |
466 my $width = 480 * $scale; | |
467 my $height = 240 * $scale; | |
468 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n"; | |
469 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n"; | |
470 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)", ylim=ylim)'."\n"; | |
471 print OUT 'text(mp, log10(coverage) + log10(2),format(coverage),xpd = TRUE,srt=90)'."\n"; | |
472 print OUT 'text(mp,par("usr")[3]-0.1,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n"; | |
473 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n"; | |
474 print OUT 'graphics.off()'."\n"; | |
475 close OUT; | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
476 system("cd $wd/Rout && Rscript barplot.R"); |
1 | 477 if ($scale == 1) { |
478 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | |
479 } | |
480 else { | |
481 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | |
482 } | |
483 } | |
484 ## print to TEX | |
485 open OUT, ">>$wd/Report/Report.tex"; | |
486 print OUT '\subsection*{Gene Summaries}'."\n"; | |
487 print OUT '\underline{Legend:} \\\\'."\n"; | |
488 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; | |
489 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n"; | |
490 $col = 1; | |
491 foreach (@small) { | |
492 if ($col > 2) { | |
493 $col = 1; | |
494 print OUT "\n"; | |
495 } | |
496 print OUT '\begin{minipage}{0.5\linewidth}\centering'."\n"; | |
497 print OUT $_."\n"; | |
498 print OUT '\end{minipage}'."\n"; | |
499 $col++; | |
500 } | |
501 ## new line | |
502 if ($col == 2) { | |
503 print OUT '\\\\'." \n"; | |
504 } | |
505 foreach(@large) { | |
506 print OUT $_."\n"; | |
507 } | |
508 close OUT; | |
509 | |
510 } | |
511 | |
512 # 3. Detailed overview of failed exons (globally failed) | |
513 if (exists($opts{'s'})) { | |
514 # count columns | |
515 my $head = `head -n 1 $wd/Targets.Position.Coverage`; | |
516 chomp($head); | |
517 my @cols = split(/\t/,$head); | |
518 my $nrcols = scalar(@cols); | |
519 my $covcol = $nrcols; | |
520 my $poscol = $nrcols -1; | |
521 # tex section header | |
522 open TEX, ">>$wd/Report/Report.tex"; | |
523 print TEX '\subsection*{Failed Exon Plots}'."\n"; | |
524 $col = 1; | |
525 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; | |
526 foreach(@failedregions) { | |
527 if ($col > 2) { | |
528 $col = 1; | |
529 print TEX "\n"; | |
530 } | |
531 # which exon | |
532 my $region = $_; | |
533 my $exon = $filehash{$region}{'exon'}; | |
534 # link exon to tmp file | |
535 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | |
536 ## determine transcript orientation and location | |
537 my $firstline = `head -n 1 $exonfile`; | |
538 my @firstcols = split(/\t/,$firstline); | |
539 my $orient = $firstcols[5]; | |
540 my $genomicchr = $firstcols[0]; | |
541 my $genomicstart = $firstcols[1]; | |
542 my $genomicstop = $firstcols[2]; | |
543 if ($orient eq '+') { | |
544 $bps = $genomicstop - $genomicstart + 1; | |
545 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); | |
546 } | |
547 else { | |
548 $bps = $genomicstop - $genomicstart + 1; | |
549 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); | |
550 } | |
551 # print Rscript | |
552 open OUT, ">$wd/Rout/exonplot.R"; | |
553 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | |
554 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n"; | |
555 print OUT 'coverage[coverage < 1] <- 1'."\n"; | |
556 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; | |
557 | |
558 my $width = 480 ; | |
559 my $height = 240 ; | |
560 my $exonstr = $exon; | |
561 $exonstr =~ s/\s/_/g; | |
562 $exon =~ s/_/ /g; | |
563 $exon =~ s/\|/ /g; | |
564 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n"; | |
565 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | |
566 if ($orient eq '-') { | |
567 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | |
568 print OUT 'mtext("'.$subtitle.'")'."\n"; | |
569 } | |
570 else { | |
571 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n"; | |
572 print OUT 'mtext("'.$subtitle.'")'."\n"; | |
573 } | |
574 print OUT 'lines(positions,log10(coverage))'."\n"; | |
575 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n"; | |
576 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n"; | |
577 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n"; | |
578 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n"; | |
579 print OUT 'graphics.off()'."\n"; | |
580 close OUT; | |
581 # run R script | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
582 system("cd $wd/Rout && Rscript exonplot.R"); |
1 | 583 # Add to .TEX |
584 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n"; | |
585 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n"; | |
586 print TEX '\end{minipage}'."\n"; | |
587 $col++; | |
588 } | |
589 } | |
590 | |
591 ## plot failed (subregion) or all exons | |
592 if (exists($opts{'S'}) || exists($opts{'A'})) { | |
593 # count columns | |
594 my $head = `head -n 1 $wd/Targets.Position.Coverage`; | |
595 chomp($head); | |
596 my @cols = split(/\t/,$head); | |
597 my $nrcols = scalar(@cols); | |
598 my $covcol = $nrcols; | |
599 my $poscol = $nrcols -1; | |
600 # tex section header | |
601 open TEX, ">>$wd/Report/Report.tex"; | |
602 print TEX '\subsection*{Failed Exon Plots}'."\n"; | |
603 if (exists($opts{'S'})) { | |
604 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n"; | |
605 } | |
606 elsif (exists($opts{'A'})) { | |
607 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; | |
608 } | |
609 $col = 1; | |
610 foreach(@allregions) { | |
611 if ($col > 2) { | |
612 $col = 1; | |
613 print TEX "\n"; | |
614 } | |
615 # which exon | |
616 my $region = $_; | |
617 my $exon = $filehash{$region}{'exon'}; | |
618 # grep exon to tmp file | |
619 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | |
620 ## determine transcript orientation. | |
621 my $firstline = `head -n 1 $exonfile`; | |
622 my @firstcols = split(/\t/,$firstline); | |
623 my $orient = $firstcols[5]; | |
624 my $genomicchr = $firstcols[0]; | |
625 my $genomicstart = $firstcols[1]; | |
626 my $genomicstop = $firstcols[2]; | |
627 | |
628 if ($orient eq '+') { | |
629 $bps = $genomicstop - $genomicstart + 1; | |
630 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); | |
631 | |
632 } | |
633 else { | |
634 $bps = $genomicstop - $genomicstart + 1; | |
635 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); | |
636 | |
637 } | |
638 | |
639 # check if failed | |
640 if (exists($opts{'S'})) { | |
641 my $cs = `cut -f $covcol '$exonfile' `; | |
642 my @c = split(/\n/,$cs); | |
643 @c = sort { $a <=> $b } @c; | |
644 if ($c[0] >= $thresh) { | |
645 # lowest coverage > threshold => skip | |
646 next; | |
647 } | |
648 } | |
649 # print Rscript | |
650 open OUT, ">$wd/Rout/exonplot.R"; | |
651 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | |
652 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n"; | |
653 print OUT 'coverage[coverage < 1] <- 1'."\n"; | |
654 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; | |
655 my $width = 480 ; | |
656 my $height = 240 ; | |
657 my $exonstr = $exon; | |
658 $exonstr =~ s/\s/_/g; | |
659 $exon =~ s/_/ /g; | |
660 $exon =~ s/\|/ /g; | |
661 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n"; | |
662 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | |
663 if ($orient eq '-') { | |
664 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | |
665 print OUT 'mtext("'.$subtitle.'")'."\n"; | |
666 } | |
667 else { | |
668 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n"; | |
669 print OUT 'mtext("'.$subtitle.'")'."\n"; | |
670 } | |
671 | |
672 print OUT 'lines(positions,log10(coverage))'."\n"; | |
673 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n"; | |
674 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n"; | |
675 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n"; | |
676 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n"; | |
677 print OUT 'graphics.off()'."\n"; | |
678 close OUT; | |
679 # run R script | |
12
86df3f847a72
Switched to R 3.0.2 from iuc, and moved bedtools to seperate tool_definition
geert-vandeweyer
parents:
11
diff
changeset
|
680 system("cd $wd/Rout && Rscript exonplot.R"); |
1 | 681 # Add to .TEX |
682 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n"; | |
683 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n"; | |
684 print TEX '\end{minipage}'."\n"; | |
685 $col++; | |
686 } | |
687 } | |
688 ## list failed exons | |
689 if (exists($opts{'L'})) { | |
690 # count columns | |
691 my $head = `head -n 1 $wd/Targets.Position.Coverage`; | |
692 chomp($head); | |
693 my @cols = split(/\t/,$head); | |
694 my $nrcols = scalar(@cols); | |
695 my $covcol = $nrcols; | |
696 my $poscol = $nrcols -1; | |
697 ## hash to print | |
698 # tex section header | |
699 open TEX, ">>$wd/Report/Report.tex"; | |
700 print TEX '\subsection*{List of Failed Exons}'."\n"; | |
701 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n"; | |
702 print TEX '{\footnotesize\begin{longtable}[l]{@{\extracolsep{\fill}}llll}'."\n".'\hline'."\n"; | |
703 print TEX '\textbf{Target Name} & \textbf{Genomic Position} & \textbf{Avg.Coverage} & \textbf{Min.Coverage} \\\\'."\n".'\hline'."\n"; | |
704 print TEX '\endhead'."\n"; | |
705 print TEX '\hline '."\n".'\multicolumn{4}{r}{{\textsl{\footnotesize Continued on next page}}} \\\\ '."\n".'\hline' ."\n". '\endfoot' . "\n". '\endlastfoot' . "\n"; | |
706 | |
707 $col = 1; | |
708 open IN, "$wd/Targets.Global.Coverage"; | |
709 while (<IN>) { | |
710 chomp($_); | |
711 my @p = split(/\t/,$_); | |
712 my $region = $p[0].'-'.$p[1].'-'.$p[2]; | |
713 my $exon = $filehash{$region}{'exon'}; | |
714 # grep exon to tmp file | |
715 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | |
716 ## determine transcript orientation. | |
717 my $firstline = `head -n 1 $exonfile`; | |
718 my @firstcols = split(/\t/,$firstline); | |
719 my $orient = $firstcols[5]; | |
720 my $genomicchr = $firstcols[0]; | |
721 my $genomicstart = $firstcols[1]; | |
722 my $genomicstop = $firstcols[2]; | |
723 | |
724 if ($orient eq '+') { | |
725 $bps = $genomicstop - $genomicstart + 1; | |
726 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); | |
727 | |
728 } | |
729 else { | |
730 $bps = $genomicstop - $genomicstart + 1; | |
731 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); | |
732 } | |
733 | |
734 # check if failed | |
735 my $cs = `cut -f $covcol '$exonfile' `; | |
736 my @c = split(/\n/,$cs); | |
737 my ($avg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@c); | |
738 | |
739 if ($min >= $thresh) { | |
740 # lowest coverage > threshold => skip | |
741 next; | |
742 } | |
743 | |
744 # print to .tex table | |
745 if (length($exon) > 30) { | |
746 $exon = substr($exon,0,27) . '...'; | |
747 } | |
748 $exon =~ s/_/ /g; | |
749 $exon =~ s/\|/ /g; | |
750 | |
751 print TEX "$exon & $subtitle & ".int($avg)." & $min ".'\\\\'."\n"; | |
752 } | |
753 close IN; | |
754 print TEX '\hline'."\n"; | |
755 print TEX '\end{longtable}}'."\n"; | |
756 close TEX; | |
757 } | |
758 | |
759 | |
760 ## Close document | |
761 open OUT, ">>$wd/Report/Report.tex"; | |
762 print OUT '\label{endofdoc}'."\n"; | |
763 print OUT '\end{document}'."\n"; | |
764 close OUT; | |
765 system("cd $wd/Report && pdflatex Report.tex > /dev/null 2>&1 && pdflatex Report.tex > /dev/null 2>&1 "); | |
766 | |
767 ## mv report to output file | |
768 system("cp -f $wd/Report/Report.pdf '$pdffile'"); | |
769 ##create tar.gz file | |
770 system("mkdir $wd/Results"); | |
771 system("cp -Rf $wd/Plots $wd/Results/"); | |
772 system("cp -Rf $wd/Report/ $wd/Results/"); | |
773 if (-e "$wd/Targets.Global.Coverage") { | |
774 system("cp -Rf $wd/Targets.Global.Coverage $wd/Results/"); | |
775 } | |
776 if (-e "$wd/Targets.Position.Coverage") { | |
777 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); | |
778 } | |
779 | |
780 system("cd $wd && tar czf '$tarfile' Results/"); | |
781 ## clean up (galaxy stores outside wd) | |
782 system("rm -Rf $wd"); | |
783 ############### | |
784 ## FUNCTIONS ## | |
785 ############### | |
786 sub arraystats{ | |
787 my @array = @_; | |
788 my $count = scalar(@array); | |
789 @array = sort { $a <=> $b } @array; | |
790 # median | |
791 my $median = 0; | |
792 if ($count % 2) { | |
793 $median = $array[int($count/2)]; | |
794 } else { | |
795 $median = ($array[$count/2] + $array[$count/2 - 1]) / 2; | |
796 } | |
797 # average | |
798 my $sum = 0; | |
799 foreach (@array) { $sum += $_; } | |
800 my $average = $sum / $count; | |
801 # quantiles (rounded) | |
802 my $quart = int($count/4) ; | |
803 my $first = $array[$quart]; | |
804 my $third = $array[($quart*3)]; | |
805 my $min = $array[0]; | |
806 my $max = $array[($count-1)]; | |
807 return ($average,$median,$min,$max,$first,$third,$sum); | |
808 } | |
809 | |
810 sub GlobalSummary { | |
811 my ($bam,$targets) = @_; | |
812 | |
813 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | |
814 if (exists($commandsrun{$command})) { | |
815 return; | |
816 } | |
817 system($command); | |
818 $commandsrun{$command} = 1; | |
819 } | |
820 | |
821 sub CoveragePerRegion { | |
822 my ($bam,$targets) = @_; | |
823 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | |
824 if (exists($commandsrun{$command})) { | |
825 return; | |
826 } | |
827 system($command); | |
828 $commandsrun{$command} = 1; | |
829 } | |
830 | |
831 sub SubRegionCoverage { | |
832 my ($bam,$targets) = @_; | |
833 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; | |
834 system($command); | |
835 $commandsrun{$command} = 1; | |
836 } | |
837 |