comparison CoverageReport.pl @ 1:864d0ccfbe6f draft

Initial Uploaded
author geert-vandeweyer
date Thu, 05 Sep 2013 02:37:03 -0400
parents
children 39c374d4cba7
comparison
equal deleted inserted replaced
0:ea32a329aced 1:864d0ccfbe6f
1 #!/usr/bin/perl
2
3 # load modules
4 use Getopt::Std;
5 use File::Basename;
6 use Number::Format;
7
8 # number format
9 my $de = new Number::Format(-thousands_sep =>',',-decimal_point => '.');
10
11 ##########
12 ## opts ##
13 ##########
14 ## input files
15 # b : path to input (b)am file
16 # t : path to input (t)arget regions in BED format
17 ## output files
18 # o : report pdf (o)utput file
19 # z : all plots and tables in tar.g(z) format
20 ## entries in the report
21 # r : Coverage per (r)egion (boolean)
22 # s : (s)ubregion coverage if average < specified (plots for positions along target region) (boolean)
23 # S : (S)ubregion coverage for ALL failed exons => use either s OR S or you will have double plots.
24 # A : (A)ll exons will be plotted.
25 # L : (L)ist failed exons instead of plotting
26 # m : (m)inimal Coverage threshold
27 # f : fraction of average as threshold
28 # n : sample (n)ame.
29
30
31 getopts('b:t:o:z:rsSALm:n:f:', \%opts) ;
32
33 # make output directory in (tmp) working dir
34 our $wd = "/tmp/Coverage.".int(rand(1000));
35 while (-d $wd) {
36 $wd = "/tmp/Coverage.".int(rand(1000));
37 }
38 system("mkdir $wd");
39
40 ## variables
41 our %commandsrun = ();
42
43 if (!exists($opts{'b'}) || !-e $opts{'b'}) {
44 die('Bam File not found');
45 }
46 if (!exists($opts{'t'}) || !-e $opts{'t'}) {
47 die('Target File (BED) not found');
48 }
49
50 if (exists($opts{'m'})) {
51 $thresh = $opts{'m'};
52 }
53 else {
54 $thresh = 40;
55 }
56
57 if (exists($opts{'f'})) {
58 $frac = $opts{'f'};
59 }
60 else {
61 $frac = 0.2;
62 }
63
64 if (exists($opts{'o'})) {
65 $pdffile = $opts{'o'};
66 }
67 else {
68 $pdffile = "$wd/CoverageReport.pdf";
69 }
70
71 if (exists($opts{'z'})) {
72 $tarfile = $opts{'z'};
73 }
74 else {
75 $tarfile = "$wd/Results.tar.gz";
76 }
77
78 # 1. Global Summary => default
79 &GlobalSummary($opts{'b'}, $opts{'t'});
80
81 # 2. Coverage per position
82 &SubRegionCoverage($opts{'b'}, $opts{'t'});
83 our %filehash;
84 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
85 system("mkdir $wd/SplitFiles");
86 ## get position coverages
87 ## split input files
88 open IN, "$wd/Targets.Position.Coverage";
89 my $fileidx = 0;
90 my $currreg = '';
91 while (<IN>) {
92 my $line = $_;
93 chomp($line);
94 my @p = split(/\t/,$line);
95 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3];
96 my $ex = $p[3];
97 if ($reg ne $currreg) {
98 ## new exon open new outfile
99 if ($currreg ne '') {
100 ## filehandle is open. close it
101 close OUT;
102 }
103 if (!exists($filehash{$reg})) {
104 $fileidx++;
105 $filehash{$reg}{'idx'} = $fileidx;
106 $filehash{$reg}{'exon'} = $ex;
107 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
108 $currreg = $reg;
109 }
110 else {
111 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt";
112 $currreg = $reg;
113 }
114 }
115 ## print the line to the open filehandle.
116 print OUT "$line\n";
117 }
118 close OUT;
119 close IN;
120
121 }
122
123 ## sort output files according to targets file
124 if (exists($opts{'r'}) ) {
125 my %hash = ();
126 open IN, "$wd/Targets.Global.Coverage";
127 while (<IN>) {
128 my @p = split(/\t/,$_) ;
129 $hash{$p[3]} = $_;
130 }
131 close IN;
132 open OUT, ">$wd/Targets.Global.Coverage";
133 open IN, $opts{'t'};
134 while (<IN>) {
135 my @p = split(/\t/,$_) ;
136 print OUT $hash{$p[3]};
137 }
138 close IN;
139 close OUT;
140 }
141
142
143 ####################################
144 ## PROCESS RESULTS & CREATE PLOTS ##
145 ####################################
146 system("mkdir $wd/Report");
147
148 system("mkdir $wd/Rout");
149 system("mkdir $wd/Plots");
150
151 $samplename = $opts{'n'};
152 $samplename =~ s/_/\\_/g;
153
154 # 0. Preamble
155 ## compose preamble
156 open OUT, ">$wd/Report/Report.tex";
157 print OUT '\documentclass[a4paper,10pt]{article}'."\n";
158 print OUT '\usepackage[left=2cm,top=1.5cm,right=1.5cm,bottom=2.5cm,nohead]{geometry}'."\n";
159 print OUT '\usepackage{longtable}'."\n";
160 print OUT '\usepackage[T1]{fontenc}'."\n";
161 print OUT '\usepackage{fancyhdr}'."\n";
162 print OUT '\usepackage[latin9]{inputenc}'."\n";
163 print OUT '\usepackage{color}'."\n";
164 print OUT '\usepackage[pdftex]{graphicx}'."\n";
165 print OUT '\definecolor{grey}{RGB}{160,160,160}'."\n";
166 print OUT '\definecolor{darkgrey}{RGB}{100,100,100}'."\n";
167 print OUT '\definecolor{red}{RGB}{255,0,0}'."\n";
168 print OUT '\definecolor{orange}{RGB}{238,118,0}'."\n";
169 print OUT '\setlength\LTleft{0pt}'."\n";
170 print OUT '\setlength\LTright{0pt}'."\n";
171 print OUT '\begin{document}'."\n";
172 print OUT '\pagestyle{fancy}'."\n";
173 print OUT '\fancyhead{}'."\n";
174 print OUT '\renewcommand{\footrulewidth}{0.4pt}'."\n";
175 print OUT '\renewcommand{\headrulewidth}{0pt}'."\n";
176 print OUT '\fancyfoot[R]{\today\hspace{2cm}\thepage\ of \pageref{endofdoc}}'."\n";
177 print OUT '\fancyfoot[C]{}'."\n";
178 print OUT '\fancyfoot[L]{Coverage Report for ``'.$samplename.'"}'."\n";
179 print OUT '\let\oldsubsubsection=\subsubsection'."\n";
180 print OUT '\renewcommand{\subsubsection}{%'."\n";
181 print OUT ' \filbreak'."\n";
182 print OUT ' \oldsubsubsection'."\n";
183 print OUT '}'."\n";
184 # main title
185 print OUT '\section*{Coverage Report for ``'.$samplename.'"}'."\n";
186 close OUT;
187
188 # 1. Summary Report
189 # Get samtools flagstat summary of BAM file
190 my $flagstat = `samtools flagstat $opts{'b'}`;
191 my @s = split(/\n/,$flagstat);
192 # Get number of reads mapped in total
193 ## updated on 2012-10-1 !!
194 $totalmapped = $s[2];
195 $totalmapped =~ s/^(\d+)(\s.+)/$1/;
196 # count columns
197 my $head = `head -n 1 $wd/Targets.Global.Coverage`;
198 chomp($head);
199 my @cols = split(/\t/,$head);
200 my $nrcols = scalar(@cols);
201 my $covcol = $nrcols - 3;
202 # get min/max/median/average coverage => values
203 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`;
204 my @coverages = split(/\n/,$covs);
205 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
206 my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
207 # get min/max/median/average coverage => boxplot in R
208 open OUT, ">$wd/Rout/boxplot.R";
209 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
210 print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
211 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480)'."\n";
212 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
213 print OUT 'graphics.off()'."\n";
214 close OUT;
215 system("cd $wd/Rout && Rscript boxplot.R");
216
217 ## global nt coverage plot
218 ## use perl to make histogram (lower memory)
219 open IN, "$wd/Targets.Position.Coverage";
220 my %dens;
221 my $counter = 0;
222 my $sum = 0;
223 while (<IN>) {
224 chomp();
225 my @p = split(/\t/);
226 $sum += $p[-1];
227 $counter++;
228 if (defined($dens{$p[-1]})) {
229 $dens{$p[-1]}++;
230 }
231 else {
232 $dens{$p[-1]} = 1;
233 }
234 }
235 $avg = $sum/$counter;
236 close IN;
237 open OUT, ">$wd/Rout/hist.txt";
238 foreach (keys(%dens)) {
239 print OUT "$_;$dens{$_}\n";
240 }
241 close OUT;
242 if (!defined($dens{'0'})) {
243 $dens{'0'} = 0;
244 }
245 open OUT, ">$wd/Rout/ntplot.R";
246 # read coverage hist in R to plot
247 print OUT 'coverage <- read.table("hist.txt" , as.is = TRUE, header=FALSE,sep=";")'."\n";
248 print OUT 'mincov <- '."$thresh \n";
249 print OUT "avg <- round($avg)\n";
250 print OUT "colnames(coverage) <- c('cov','count')\n";
251 print OUT 'coverage$cov <- coverage$cov / avg'."\n";
252 print OUT 'rep <- which(coverage$cov > 1)'."\n";
253 print OUT 'coverage[coverage$cov > 1,1] <- 1'."\n";
254 print OUT 'values <- coverage[coverage$cov < 1,]'."\n";
255 print OUT 'values <- rbind(values,c(1,sum(coverage[coverage$cov == 1,"count"])))'."\n";
256 print OUT 'values <- values[order(values$cov),]'."\n";
257 print OUT 'prevcount <- 0'."\n";
258 # make cumulative count data frame
259 print OUT 'for (i in rev(values$cov)) {'."\n";
260 print OUT ' values[values$cov == i,"count"] <- prevcount + values[values$cov == i,"count"]'."\n";
261 print OUT ' prevcount <- values[values$cov == i,"count"]'."\n";
262 print OUT '}'."\n";
263 print OUT 'values$count <- values$count / (values[values$cov == 0,"count"] / 100)'."\n";
264 # get some values to plot lines.
265 print OUT 'mincov.x <- mincov/avg'."\n";
266 print OUT 'if (mincov/avg <= 1) {'."\n";
267 print OUT ' ii <- which(values$cov == mincov.x)'."\n";
268 print OUT ' if (length(ii) == 1) {'."\n";
269 print OUT ' mincov.y <- values[ii[1],"count"]'."\n";
270 print OUT ' } else {'."\n";
271 print OUT ' i1 <- max(which(values$cov < mincov.x))'."\n";
272 print OUT ' i2 <- min(which(values$cov > mincov.x))'."\n";
273 print OUT ' mincov.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(mincov.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
274 print OUT ' }'."\n";
275 print OUT '}'."\n";
276 # open output image and create plot
277 print OUT 'png(file="../Plots/CoverageNtPlot.png", bg="white", width=540, height=480)'."\n";
278 print OUT 'par(xaxs="i",yaxs="i")'."\n";
279 print OUT 'plot(values$cov,values$count,ylim=c(0,100),pch=".",main="Cumulative Normalised Base-Coverage Plot",xlab="Normalizalised Coverage",ylab="Cumulative Nr. Of Bases")'."\n";
280 print OUT 'lines(values$cov,values$count)'."\n";
281 print OUT 'if (mincov.x <= 1) {'."\n";
282 print OUT ' lines(c(mincov.x,mincov.x),c(0,mincov.y),lty=2,col="darkgreen")'."\n";
283 print OUT ' lines(c(0,mincov.x),c(mincov.y,mincov.y),lty=2,col="darkgreen")'."\n";
284 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold: '.$thresh.'x")'."\n";
285 print OUT ' text(1,(91),pos=2,col="darkgreen",labels=paste("%Bases: ",round(mincov.y,2),"%",sep=""))'."\n";
286 print OUT '} else {'."\n";
287 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold ('.$thresh.'x) > Average")'."\n";
288 print OUT ' text(1,(91),pos=2,col="darkgreen",labels="Plotting impossible")'."\n";
289 print OUT '}'."\n";
290 print OUT 'frac.x <- '."$frac\n";
291 print OUT 'ii <- which(values$cov == frac.x)'."\n";
292 print OUT 'if (length(ii) == 1) {'."\n";
293 print OUT ' frac.y <- values[ii[1],"count"]'."\n";
294 print OUT '} else {'."\n";
295 print OUT ' i1 <- max(which(values$cov < frac.x))'."\n";
296 print OUT ' i2 <- min(which(values$cov > frac.x))'."\n";
297 print OUT ' frac.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(frac.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
298 print OUT '}'."\n";
299 print OUT 'lines(c(frac.x,frac.x),c(0,frac.y),lty=2,col="red")'."\n";
300 print OUT 'lines(c(0,frac.x),c(frac.y,frac.y),lty=2,col="red")'."\n";
301 #iprint OUT 'text((frac.x+0.05),(frac.y - 2),pos=4,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
302 #print OUT 'text((frac.x+0.05),(frac.y-5),pos=4,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
303 print OUT 'text(1,86,pos=2,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
304 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
305
306 print OUT 'graphics.off()'."\n";
307
308 close OUT;
309 system("cd $wd/Rout && Rscript ntplot.R");
310 ## PRINT TO .TEX FILE
311 open OUT, ">>$wd/Report/Report.tex";
312 # average coverage overviews
313 print OUT '\subsection*{Overall Summary}'."\n";
314 print OUT '{\small ';
315 # left : boxplot
316 print OUT '\begin{minipage}{0.3\linewidth}\centering'."\n";
317 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageBoxPlot.png}'."\n";
318 print OUT '\end{minipage}'."\n";
319 # right : cum.cov.plot
320 print OUT '\hspace{0.6cm}'."\n";
321 print OUT '\begin{minipage}{0.65\linewidth}\centering'."\n";
322 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageNtPlot.png}'."\n";
323 print OUT '\end{minipage} \\\\'."\n";
324 ## next line
325 print OUT '\begin{minipage}{0.48\linewidth}'."\n";
326 print OUT '\vspace{-1.2em}'."\n";
327 print OUT '\begin{tabular}{ll}'."\n";
328 # bam statistics
329 print OUT '\multicolumn{2}{l}{\textbf{\underline{Samtools Flagstat Summary}}} \\\\'."\n";
330 foreach (@s) {
331 $_ =~ m/^(\d+)\s(.+)$/;
332 my $one = $1;
333 my $two = $2;
334 $two =~ s/\s\+\s0\s//;
335 $two = ucfirst($two);
336 $one =~ s/%/\\%/g;
337 # remove '+ 0 ' from front
338 $two =~ s/\+\s0\s//;
339 # remove trailing from end
340 $two =~ s/(\s\+.*)|(:.*)/\)/;
341 $two =~ s/%/\\%/g;
342 $two =~ s/>=/\$\\ge\$/g;
343 $two = ucfirst($two);
344 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
345 }
346 print OUT '\end{tabular}\end{minipage}'."\n";
347 print OUT '\hspace{1.5cm}'."\n";
348 # target coverage statistics
349 print OUT '\begin{minipage}{0.4\linewidth}'."\n";
350 #print OUT '\vspace{-4.8em}'."\n";
351 print OUT '\begin{tabular}{ll}'."\n";
352 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Region Coverage}}} \\\\'."\n";
353 print OUT '\textbf{Number of Target Regions} & '.scalar(@coverages).' \\\\'."\n";
354 print OUT '\textbf{Minimal Region Coverage} & '.$min.' \\\\'."\n";
355 print OUT '\textbf{25\% Region Coverage} & '.$first.' \\\\'. "\n";
356 print OUT '\textbf{50\% (Median) Region Coverage} & '.$med.' \\\\'. "\n";
357 print OUT '\textbf{75\% Region Coverage} & '.$third.' \\\\'. "\n";
358 print OUT '\textbf{Maximal Region Coverage} & '.$max.' \\\\'. "\n";
359 print OUT '\textbf{Average Region Coverage} & '.int($eavg).' \\\\'. "\n";
360 print OUT '\textbf{Mapped On Target} & '.$spec.' \\\\'."\n";
361 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Base Coverage }}} \\\\'."\n";
362 print OUT '\textbf{Number of Target Bases} & '.$counter.' \\\\'."\n";
363 print OUT '\textbf{Average Base Coverage} & '.int($avg).' \\\\'. "\n";
364 print OUT '\textbf{Non-Covered Bases} & '.$dens{'0'}.' \\\\'."\n";
365 #print OUT '\textbf{Bases Covered $ge$ '.$frac.'xAvg.Cov} & '.
366 print OUT '\end{tabular}\end{minipage}}'."\n";
367 close OUT;
368
369 # 2. GLOBAL COVERAGE OVERVIEW PER GENE
370 @failedexons;
371 @allexons;
372 @allregions;
373 @failedregions;
374 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) {
375 # count columns
376 my $head = `head -n 1 $wd/Targets.Global.Coverage`;
377 chomp($head);
378 my @cols = split(/\t/,$head);
379 my $nrcols = scalar(@cols);
380 my $covcol = $nrcols - 3;
381 # Coverage Plots for each gene => barplots in R, table here.
382 open IN, "$wd/Targets.Global.Coverage";
383 my $currgroup = '';
384 my $startline = 0;
385 my $stopline = 0;
386 $linecounter = 0;
387 while (<IN>) {
388 $linecounter++;
389 chomp($_);
390 my @c = split(/\t/,$_);
391 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]);
392 my $group = $c[3];
393 ## coverage failure?
394 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
395 push(@failedexons,$group);
396 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
397 }
398 ## store exon
399 push(@allexons,$group);
400 ## extract and check gene
401 $group =~ s/^(\S+)[\|\s](.+)/$1/;
402 if ($group ne $currgroup ) {
403 if ($currgroup ne '') {
404 # new gene, make plot.
405 open OUT, ">$wd/Rout/barplot.R";
406 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
407 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
408 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
409 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
410 print OUT 'coverage[coverage < 1] <- 1'."\n";
411 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
412 # coverage not whole target region => orange
413 print OUT 'covperc <- coveragetable[c('.$startline.':'.$stopline.'),'.$nrcols.']'."\n";
414 print OUT 'colors[covperc<1] <- "orange"'."\n";
415 # coverage below threshold => red
416 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";
417
418 if ($stopline - $startline > 20) {
419 $scale = 2;
420 }
421 else {
422 $scale = 1;
423 }
424 my $width = 480 * $scale;
425 my $height = 240 * $scale;
426 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
427 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
428 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)",ylim=ylim)'."\n";
429 print OUT 'text(mp, log10(coverage) + '.(0.4/$scale).',format(coverage),xpd = TRUE,srt=90)'."\n";
430 print OUT 'text(mp,par("usr")[3]-0.05,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
431 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
432 print OUT 'graphics.off()'."\n";
433 close OUT;
434 system("cd $wd/Rout && Rscript barplot.R");
435 if ($scale == 1) {
436 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
437 }
438 else {
439 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
440 }
441
442 }
443 $currgroup = $group;
444 $startline = $linecounter;
445 }
446 $stopline = $linecounter;
447 }
448 close IN;
449 if ($currgroup ne '') {
450 # last gene, make plot.
451 open OUT, ">$wd/Rout/barplot.R";
452 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
453 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
454 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
455 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
456 print OUT 'coverage[coverage < 1] <- 1'."\n";
457 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
458 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";
459
460 if ($stopline - $startline > 20) {
461 $scale = 2;
462 }
463 else {
464 $scale = 1;
465 }
466 my $width = 480 * $scale;
467 my $height = 240 * $scale;
468 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
469 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
470 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)", ylim=ylim)'."\n";
471 print OUT 'text(mp, log10(coverage) + log10(2),format(coverage),xpd = TRUE,srt=90)'."\n";
472 print OUT 'text(mp,par("usr")[3]-0.1,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
473 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
474 print OUT 'graphics.off()'."\n";
475 close OUT;
476 system("cd $wd/Rout && Rscript barplot.R");
477 if ($scale == 1) {
478 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
479 }
480 else {
481 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
482 }
483 }
484 ## print to TEX
485 open OUT, ">>$wd/Report/Report.tex";
486 print OUT '\subsection*{Gene Summaries}'."\n";
487 print OUT '\underline{Legend:} \\\\'."\n";
488 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
489 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n";
490 $col = 1;
491 foreach (@small) {
492 if ($col > 2) {
493 $col = 1;
494 print OUT "\n";
495 }
496 print OUT '\begin{minipage}{0.5\linewidth}\centering'."\n";
497 print OUT $_."\n";
498 print OUT '\end{minipage}'."\n";
499 $col++;
500 }
501 ## new line
502 if ($col == 2) {
503 print OUT '\\\\'." \n";
504 }
505 foreach(@large) {
506 print OUT $_."\n";
507 }
508 close OUT;
509
510 }
511
512 # 3. Detailed overview of failed exons (globally failed)
513 if (exists($opts{'s'})) {
514 # count columns
515 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
516 chomp($head);
517 my @cols = split(/\t/,$head);
518 my $nrcols = scalar(@cols);
519 my $covcol = $nrcols;
520 my $poscol = $nrcols -1;
521 # tex section header
522 open TEX, ">>$wd/Report/Report.tex";
523 print TEX '\subsection*{Failed Exon Plots}'."\n";
524 $col = 1;
525 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
526 foreach(@failedregions) {
527 if ($col > 2) {
528 $col = 1;
529 print TEX "\n";
530 }
531 # which exon
532 my $region = $_;
533 my $exon = $filehash{$region}{'exon'};
534 # link exon to tmp file
535 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
536 ## determine transcript orientation and location
537 my $firstline = `head -n 1 $exonfile`;
538 my @firstcols = split(/\t/,$firstline);
539 my $orient = $firstcols[5];
540 my $genomicchr = $firstcols[0];
541 my $genomicstart = $firstcols[1];
542 my $genomicstop = $firstcols[2];
543 if ($orient eq '+') {
544 $bps = $genomicstop - $genomicstart + 1;
545 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
546 }
547 else {
548 $bps = $genomicstop - $genomicstart + 1;
549 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
550 }
551 # print Rscript
552 open OUT, ">$wd/Rout/exonplot.R";
553 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
554 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
555 print OUT 'coverage[coverage < 1] <- 1'."\n";
556 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
557
558 my $width = 480 ;
559 my $height = 240 ;
560 my $exonstr = $exon;
561 $exonstr =~ s/\s/_/g;
562 $exon =~ s/_/ /g;
563 $exon =~ s/\|/ /g;
564 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
565 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
566 if ($orient eq '-') {
567 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
568 print OUT 'mtext("'.$subtitle.'")'."\n";
569 }
570 else {
571 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
572 print OUT 'mtext("'.$subtitle.'")'."\n";
573 }
574 print OUT 'lines(positions,log10(coverage))'."\n";
575 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
576 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
577 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
578 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
579 print OUT 'graphics.off()'."\n";
580 close OUT;
581 # run R script
582 system("cd $wd/Rout && Rscript exonplot.R");
583 # Add to .TEX
584 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
585 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
586 print TEX '\end{minipage}'."\n";
587 $col++;
588 }
589 }
590
591 ## plot failed (subregion) or all exons
592 if (exists($opts{'S'}) || exists($opts{'A'})) {
593 # count columns
594 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
595 chomp($head);
596 my @cols = split(/\t/,$head);
597 my $nrcols = scalar(@cols);
598 my $covcol = $nrcols;
599 my $poscol = $nrcols -1;
600 # tex section header
601 open TEX, ">>$wd/Report/Report.tex";
602 print TEX '\subsection*{Failed Exon Plots}'."\n";
603 if (exists($opts{'S'})) {
604 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
605 }
606 elsif (exists($opts{'A'})) {
607 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
608 }
609 $col = 1;
610 foreach(@allregions) {
611 if ($col > 2) {
612 $col = 1;
613 print TEX "\n";
614 }
615 # which exon
616 my $region = $_;
617 my $exon = $filehash{$region}{'exon'};
618 # grep exon to tmp file
619 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
620 ## determine transcript orientation.
621 my $firstline = `head -n 1 $exonfile`;
622 my @firstcols = split(/\t/,$firstline);
623 my $orient = $firstcols[5];
624 my $genomicchr = $firstcols[0];
625 my $genomicstart = $firstcols[1];
626 my $genomicstop = $firstcols[2];
627
628 if ($orient eq '+') {
629 $bps = $genomicstop - $genomicstart + 1;
630 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
631
632 }
633 else {
634 $bps = $genomicstop - $genomicstart + 1;
635 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
636
637 }
638
639 # check if failed
640 if (exists($opts{'S'})) {
641 my $cs = `cut -f $covcol '$exonfile' `;
642 my @c = split(/\n/,$cs);
643 @c = sort { $a <=> $b } @c;
644 if ($c[0] >= $thresh) {
645 # lowest coverage > threshold => skip
646 next;
647 }
648 }
649 # print Rscript
650 open OUT, ">$wd/Rout/exonplot.R";
651 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
652 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
653 print OUT 'coverage[coverage < 1] <- 1'."\n";
654 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
655 my $width = 480 ;
656 my $height = 240 ;
657 my $exonstr = $exon;
658 $exonstr =~ s/\s/_/g;
659 $exon =~ s/_/ /g;
660 $exon =~ s/\|/ /g;
661 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
662 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
663 if ($orient eq '-') {
664 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
665 print OUT 'mtext("'.$subtitle.'")'."\n";
666 }
667 else {
668 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
669 print OUT 'mtext("'.$subtitle.'")'."\n";
670 }
671
672 print OUT 'lines(positions,log10(coverage))'."\n";
673 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
674 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
675 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
676 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
677 print OUT 'graphics.off()'."\n";
678 close OUT;
679 # run R script
680 system("cd $wd/Rout && Rscript exonplot.R");
681 # Add to .TEX
682 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
683 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
684 print TEX '\end{minipage}'."\n";
685 $col++;
686 }
687 }
688 ## list failed exons
689 if (exists($opts{'L'})) {
690 # count columns
691 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
692 chomp($head);
693 my @cols = split(/\t/,$head);
694 my $nrcols = scalar(@cols);
695 my $covcol = $nrcols;
696 my $poscol = $nrcols -1;
697 ## hash to print
698 # tex section header
699 open TEX, ">>$wd/Report/Report.tex";
700 print TEX '\subsection*{List of Failed Exons}'."\n";
701 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
702 print TEX '{\footnotesize\begin{longtable}[l]{@{\extracolsep{\fill}}llll}'."\n".'\hline'."\n";
703 print TEX '\textbf{Target Name} & \textbf{Genomic Position} & \textbf{Avg.Coverage} & \textbf{Min.Coverage} \\\\'."\n".'\hline'."\n";
704 print TEX '\endhead'."\n";
705 print TEX '\hline '."\n".'\multicolumn{4}{r}{{\textsl{\footnotesize Continued on next page}}} \\\\ '."\n".'\hline' ."\n". '\endfoot' . "\n". '\endlastfoot' . "\n";
706
707 $col = 1;
708 open IN, "$wd/Targets.Global.Coverage";
709 while (<IN>) {
710 chomp($_);
711 my @p = split(/\t/,$_);
712 my $region = $p[0].'-'.$p[1].'-'.$p[2];
713 my $exon = $filehash{$region}{'exon'};
714 # grep exon to tmp file
715 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
716 ## determine transcript orientation.
717 my $firstline = `head -n 1 $exonfile`;
718 my @firstcols = split(/\t/,$firstline);
719 my $orient = $firstcols[5];
720 my $genomicchr = $firstcols[0];
721 my $genomicstart = $firstcols[1];
722 my $genomicstop = $firstcols[2];
723
724 if ($orient eq '+') {
725 $bps = $genomicstop - $genomicstart + 1;
726 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
727
728 }
729 else {
730 $bps = $genomicstop - $genomicstart + 1;
731 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
732 }
733
734 # check if failed
735 my $cs = `cut -f $covcol '$exonfile' `;
736 my @c = split(/\n/,$cs);
737 my ($avg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@c);
738
739 if ($min >= $thresh) {
740 # lowest coverage > threshold => skip
741 next;
742 }
743
744 # print to .tex table
745 if (length($exon) > 30) {
746 $exon = substr($exon,0,27) . '...';
747 }
748 $exon =~ s/_/ /g;
749 $exon =~ s/\|/ /g;
750
751 print TEX "$exon & $subtitle & ".int($avg)." & $min ".'\\\\'."\n";
752 }
753 close IN;
754 print TEX '\hline'."\n";
755 print TEX '\end{longtable}}'."\n";
756 close TEX;
757 }
758
759
760 ## Close document
761 open OUT, ">>$wd/Report/Report.tex";
762 print OUT '\label{endofdoc}'."\n";
763 print OUT '\end{document}'."\n";
764 close OUT;
765 system("cd $wd/Report && pdflatex Report.tex > /dev/null 2>&1 && pdflatex Report.tex > /dev/null 2>&1 ");
766
767 ## mv report to output file
768 system("cp -f $wd/Report/Report.pdf '$pdffile'");
769 ##create tar.gz file
770 system("mkdir $wd/Results");
771 system("cp -Rf $wd/Plots $wd/Results/");
772 system("cp -Rf $wd/Report/ $wd/Results/");
773 if (-e "$wd/Targets.Global.Coverage") {
774 system("cp -Rf $wd/Targets.Global.Coverage $wd/Results/");
775 }
776 if (-e "$wd/Targets.Position.Coverage") {
777 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");
778 }
779
780 system("cd $wd && tar czf '$tarfile' Results/");
781 ## clean up (galaxy stores outside wd)
782 system("rm -Rf $wd");
783 ###############
784 ## FUNCTIONS ##
785 ###############
786 sub arraystats{
787 my @array = @_;
788 my $count = scalar(@array);
789 @array = sort { $a <=> $b } @array;
790 # median
791 my $median = 0;
792 if ($count % 2) {
793 $median = $array[int($count/2)];
794 } else {
795 $median = ($array[$count/2] + $array[$count/2 - 1]) / 2;
796 }
797 # average
798 my $sum = 0;
799 foreach (@array) { $sum += $_; }
800 my $average = $sum / $count;
801 # quantiles (rounded)
802 my $quart = int($count/4) ;
803 my $first = $array[$quart];
804 my $third = $array[($quart*3)];
805 my $min = $array[0];
806 my $max = $array[($count-1)];
807 return ($average,$median,$min,$max,$first,$third,$sum);
808 }
809
810 sub GlobalSummary {
811 my ($bam,$targets) = @_;
812
813 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
814 if (exists($commandsrun{$command})) {
815 return;
816 }
817 system($command);
818 $commandsrun{$command} = 1;
819 }
820
821 sub CoveragePerRegion {
822 my ($bam,$targets) = @_;
823 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
824 if (exists($commandsrun{$command})) {
825 return;
826 }
827 system($command);
828 $commandsrun{$command} = 1;
829 }
830
831 sub SubRegionCoverage {
832 my ($bam,$targets) = @_;
833 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";
834 system($command);
835 $commandsrun{$command} = 1;
836 }
837