comparison CoverageReport.pl @ 0:30f8f85e3f98 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/coverage_report commit 1b647bb088f62c1369d963f030caecaa9ee003c7
author iuc
date Wed, 25 Oct 2017 12:37:35 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:30f8f85e3f98
1 #!/usr/bin/env perl
2 # load modules
3 use warnings;
4 use Getopt::Std;
5 use File::Basename;
6 use Number::Format;
7 use Cwd;
8
9 my %opts;
10
11 # number format
12 my $de = new Number::Format(-thousands_sep =>',',-decimal_point => '.');
13
14 ##########
15 ## opts ##
16 ##########
17 ## input files
18 # b : path to input (b)am file
19 # t : path to input (t)arget regions in BED format
20 ## output files
21 # o : report pdf (o)utput file
22 ## entries in the report
23 # r : Coverage per (r)egion (boolean)
24 # s : (s)ubregion coverage if average < specified (plots for positions along target region) (boolean)
25 # S : (S)ubregion coverage for ALL failed exons => use either s OR S or you will have double plots.
26 # A : (A)ll exons will be plotted.
27 # L : (L)ist failed exons instead of plotting
28 # m : (m)inimal Coverage threshold
29 # f : fraction of average as threshold
30 # n : sample (n)ame.
31
32
33 getopts('b:t:o:rsSALm:n:f:', \%opts) ;
34
35 my $tmp = getcwd();
36 # make output directory in (tmp) working dir
37 our $wd = "$tmp/Coverage.".int(rand(1000));
38 while (-d $wd) {
39 $wd = "$tmp/Coverage.".int(rand(1000));
40 }
41 system("mkdir $wd");
42
43 ## variables
44 our %commandsrun = ();
45 my ($thresh,$frac,$pdffile,$samplename,$totalmapped);
46
47
48 if (!exists($opts{'b'}) || !-e $opts{'b'}) {
49 die('Bam File not found');
50 }
51 if (!exists($opts{'t'}) || !-e $opts{'t'}) {
52 die('Target File (BED) not found');
53 }
54
55 if (exists($opts{'m'})) {
56 $thresh = $opts{'m'};
57 }
58 else {
59 $thresh = 40;
60 }
61
62 if (exists($opts{'f'})) {
63 $frac = $opts{'f'};
64 }
65 else {
66 $frac = 0.2;
67 }
68
69 if (exists($opts{'o'})) {
70 $pdffile = $opts{'o'};
71 }
72 else {
73 $pdffile = "$wd/CoverageReport.pdf";
74 }
75
76
77 # 1. Global Summary => default
78 &GlobalSummary($opts{'b'}, $opts{'t'});
79
80 # 2. Coverage per position
81 &SubRegionCoverage($opts{'b'}, $opts{'t'});
82 our %filehash;
83 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
84 system("mkdir $wd/SplitFiles");
85 ## get position coverages
86 ## split input files
87 open IN, "$wd/Targets.Position.Coverage";
88 my $fileidx = 0;
89 my $currreg = '';
90 while (<IN>) {
91 my $line = $_;
92 chomp($line);
93 my @p = split(/\t/,$line);
94 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3];
95 my $ex = $p[3];
96 if ($reg ne $currreg) {
97 ## new exon open new outfile
98 if ($currreg ne '') {
99 ## filehandle is open. close it
100 close OUT;
101 }
102 if (!exists($filehash{$reg})) {
103 $fileidx++;
104 $filehash{$reg}{'idx'} = $fileidx;
105 $filehash{$reg}{'exon'} = $ex;
106 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
107 $currreg = $reg;
108 }
109 else {
110 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt";
111 $currreg = $reg;
112 }
113 }
114 ## print the line to the open filehandle.
115 print OUT "$line\n";
116 }
117 close OUT;
118 close IN;
119
120 }
121
122 ## sort output files according to targets file
123 if (exists($opts{'r'}) ) {
124 my %hash = ();
125 open IN, "$wd/Targets.Global.Coverage";
126 while (<IN>) {
127 chomp;
128 my @p = split(/\t/,$_) ;
129 $hash{$p[3]} = $_;
130 }
131 close IN;
132 open OUT, ">$wd/Targets.Global.Coverage";
133 open IN, $opts{'t'};
134 while (<IN>) {
135 chomp;
136 my @p = split(/\t/,$_) ;
137 print OUT $hash{$p[3]} . "\n";
138 }
139 close IN;
140 close OUT;
141 }
142
143
144 ####################################
145 ## PROCESS RESULTS & CREATE PLOTS ##
146 ####################################
147 system("mkdir $wd/Report");
148
149 system("mkdir $wd/Rout");
150 system("mkdir $wd/Plots");
151
152 $samplename = $opts{'n'};
153 $samplename =~ s/_/\\_/g;
154 $samplename =~ s/\..+$//g;
155
156 # 0. Preamble
157 ## compose preamble
158 open OUT, ">$wd/Report/Report.tex";
159 print OUT '\documentclass[a4paper,10pt]{article}'."\n";
160 print OUT '\usepackage[left=2cm,top=1.5cm,right=1.5cm,bottom=2.5cm,nohead]{geometry}'."\n";
161 print OUT '\usepackage{longtable}'."\n";
162 print OUT '\usepackage{fancyhdr}'."\n";
163 print OUT '\usepackage{fontspec}' . "\n";
164 print OUT '\usepackage{color}'."\n";
165 print OUT '\definecolor{grey}{RGB}{160,160,160}'."\n";
166 print OUT '\definecolor{darkgrey}{RGB}{100,100,100}'."\n";
167 print OUT '\definecolor{red}{RGB}{255,0,0}'."\n";
168 print OUT '\definecolor{orange}{RGB}{238,118,0}'."\n";
169 print OUT '\setlength\LTleft{0pt}'."\n";
170 print OUT '\setlength\LTright{0pt}'."\n";
171 print OUT '\begin{document}'."\n";
172 print OUT '\pagestyle{fancy}'."\n";
173 print OUT '\fancyhead{}'."\n";
174 print OUT '\renewcommand{\footrulewidth}{0.4pt}'."\n";
175 print OUT '\renewcommand{\headrulewidth}{0pt}'."\n";
176 print OUT '\fancyfoot[R]{\today\hspace{2cm}\thepage\ of \pageref{endofdoc}}'."\n";
177 print OUT '\fancyfoot[C]{}'."\n";
178 print OUT '\fancyfoot[L]{Coverage Report for ``'.$samplename.'"}'."\n";
179 print OUT '\let\oldsubsubsection=\subsubsection'."\n";
180 print OUT '\renewcommand{\subsubsection}{%'."\n";
181 print OUT ' \filbreak'."\n";
182 print OUT ' \oldsubsubsection'."\n";
183 print OUT '}'."\n";
184 # main title
185 print OUT '\section*{Coverage Report for ``'.$samplename.'"}'."\n";
186 close OUT;
187
188 # 1. Summary Report
189 # Get samtools flagstat summary of BAM file
190 my $flagstat = `samtools flagstat $opts{'b'}`;
191 my @s = split(/\n/,$flagstat);
192 # Get number of reads mapped in total
193 ## updated on 2012-10-1 !!
194 $totalmapped = $s[2];
195 $totalmapped =~ s/^(\d+)(\s.+)/$1/;
196
197 if ( not $totalmapped){
198 exit;
199 }
200
201 # count columns
202 my $head = `head -n 1 $wd/Targets.Global.Coverage`;
203 chomp($head);
204 my @cols = split(/\t/,$head);
205 my $nrcols = scalar(@cols);
206 my $covcol = $nrcols - 3;
207 # get min/max/median/average coverage => values
208 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`;
209 my @coverages = split(/\n/,$covs);
210 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
211 my $spec = 0;
212 $spec=sprintf("%.1f",($ontarget / $totalmapped)*100);
213 # get min/max/median/average coverage => boxplot in R
214 open OUT, ">$wd/Rout/boxplot.R";
215 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
216 print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
217 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480)'."\n";
218 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
219 print OUT 'graphics.off()'."\n";
220 close OUT;
221 system("cd $wd/Rout && Rscript boxplot.R") == 0 || die "Could not run boxplot.R with following error '$!'\n";
222
223 ## global nt coverage plot
224 ## use perl to make histogram (lower memory)
225 open IN, "$wd/Targets.Position.Coverage";
226 my %dens;
227 my $counter = 0;
228 my $sum = 0;
229 my $avg = 0;
230
231 while (<IN>) {
232 chomp();
233 my @p = split(/\t/);
234 $sum += $p[-1];
235 $counter++;
236 if (defined($dens{$p[-1]})) {
237 $dens{$p[-1]}++;
238 }
239 else {
240 $dens{$p[-1]} = 1;
241 }
242 }
243 $avg = $sum/$counter;
244 close IN;
245 open OUT, ">$wd/Rout/hist.txt";
246 if (!defined($dens{'0'})) {
247 $dens{'0'} = 0;
248 }
249 foreach (keys(%dens)) {
250 print OUT "$_;$dens{$_}\n";
251 }
252 close OUT;
253 open OUT, ">$wd/Rout/ntplot.R";
254 # read coverage hist in R to plot
255 print OUT 'coverage <- read.table("hist.txt" , as.is = TRUE, header=FALSE,sep=";")'."\n";
256 print OUT 'mincov <- '."$thresh \n";
257 print OUT "avg <- round($avg,digits=8)\n";
258 print OUT "colnames(coverage) <- c('cov','count')\n";
259 print OUT 'coverage$cov <- coverage$cov / avg'."\n";
260 print OUT 'rep <- which(coverage$cov > 1)'."\n";
261 print OUT 'coverage[coverage$cov > 1,1] <- 1'."\n";
262 print OUT 'values <- coverage[coverage$cov < 1,]'."\n";
263 print OUT 'values <- rbind(values,c(1,sum(coverage[coverage$cov == 1,"count"])))'."\n";
264 print OUT 'values <- values[order(values$cov),]'."\n";
265 print OUT 'prevcount <- 0'."\n";
266 # make cumulative count data frame
267 print OUT 'for (i in rev(values$cov)) {'."\n";
268 print OUT ' values[values$cov == i,"count"] <- prevcount + values[values$cov == i,"count"]'."\n";
269 print OUT ' prevcount <- values[values$cov == i,"count"]'."\n";
270 print OUT '}'."\n";
271 print OUT 'values$count <- values$count / (values[values$cov == 0,"count"] / 100)'."\n";
272 # get some values to plot lines.
273 print OUT 'mincov.x <- mincov/avg'."\n";
274 print OUT 'if (mincov/avg <= 1) {'."\n";
275 print OUT ' ii <- which(values$cov == mincov.x)'."\n";
276 print OUT ' if (length(ii) == 1) {'."\n";
277 print OUT ' mincov.y <- values[ii[1],"count"]'."\n";
278 print OUT ' } else {'."\n";
279 print OUT ' i1 <- max(which(values$cov < mincov.x))'."\n";
280 print OUT ' i2 <- min(which(values$cov > mincov.x))'."\n";
281 print OUT ' mincov.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(mincov.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
282 print OUT ' }'."\n";
283 print OUT '}'."\n";
284 # open output image and create plot
285 print OUT 'png(file="../Plots/CoverageNtPlot.png", bg="white", width=540, height=480)'."\n";
286 print OUT 'par(xaxs="i",yaxs="i")'."\n";
287 print OUT 'plot(values$cov,values$count,ylim=c(0,100),pch=".",main="Cumulative Normalised Base-Coverage Plot",xlab="Normalizalised Coverage",ylab="Cumulative Nr. Of Bases")'."\n";
288 print OUT 'lines(values$cov,values$count)'."\n";
289 print OUT 'if (mincov.x <= 1) {'."\n";
290 print OUT ' lines(c(mincov.x,mincov.x),c(0,mincov.y),lty=2,col="darkgreen")'."\n";
291 print OUT ' lines(c(0,mincov.x),c(mincov.y,mincov.y),lty=2,col="darkgreen")'."\n";
292 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold: '.$thresh.'x")'."\n";
293 print OUT ' text(1,(91),pos=2,col="darkgreen",labels=paste("%Bases: ",round(mincov.y,2),"%",sep=""))'."\n";
294 print OUT '} else {'."\n";
295 print OUT ' text(1,(95),pos=2,col="darkgreen",labels="Threshold ('.$thresh.'x) > Average")'."\n";
296 print OUT ' text(1,(91),pos=2,col="darkgreen",labels="Plotting impossible")'."\n";
297 print OUT '}'."\n";
298 print OUT 'frac.x <- '."$frac\n";
299 print OUT 'ii <- which(values$cov == frac.x)'."\n";
300 print OUT 'if (length(ii) == 1) {'."\n";
301 print OUT ' frac.y <- values[ii[1],"count"]'."\n";
302 print OUT '} else {'."\n";
303 print OUT ' i1 <- max(which(values$cov < frac.x))'."\n";
304 print OUT ' i2 <- min(which(values$cov > frac.x))'."\n";
305 print OUT ' frac.y <- ((values[i2,"count"] - values[i1,"count"])/(values[i2,"cov"] - values[i1,"cov"]))*(frac.x - values[i1,"cov"]) + values[i1,"count"]'."\n";
306 print OUT '}'."\n";
307 print OUT 'lines(c(frac.x,frac.x),c(0,frac.y),lty=2,col="red")'."\n";
308 print OUT 'lines(c(0,frac.x),c(frac.y,frac.y),lty=2,col="red")'."\n";
309 #iprint OUT 'text((frac.x+0.05),(frac.y - 2),pos=4,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
310 #print OUT 'text((frac.x+0.05),(frac.y-5),pos=4,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
311 print OUT 'text(1,86,pos=2,col="red",labels=paste(frac.x," x Avg.Cov : ",round(frac.x * avg,2),"x",sep="" ))'."\n";
312 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
313
314 print OUT 'graphics.off()'."\n";
315
316 close OUT;
317 system("cd $wd/Rout && Rscript ntplot.R") == 0 || die "Could not run ntplot.R with following error '$!'\n";
318 ## PRINT TO .TEX FILE
319 open OUT, ">>$wd/Report/Report.tex";
320 # average coverage overviews
321 print OUT '\subsection*{Overall Summary}'."\n";
322 print OUT '{\small ';
323 # left : boxplot
324 print OUT '\begin{minipage}{0.3\linewidth}\centering'."\n";
325 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageBoxPlot.png}'."\n";
326 print OUT '\end{minipage}'."\n";
327 # right : cum.cov.plot
328 print OUT '\hspace{0.6cm}'."\n";
329 print OUT '\begin{minipage}{0.65\linewidth}\centering'."\n";
330 print OUT '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/CoverageNtPlot.png}'."\n";
331 print OUT '\end{minipage} \\\\'."\n";
332 ## next line
333 print OUT '\begin{minipage}{0.48\linewidth}'."\n";
334 print OUT '\vspace{-1.2em}'."\n";
335 print OUT '\begin{tabular}{ll}'."\n";
336 # bam statistics
337 print OUT '\multicolumn{2}{l}{\textbf{\underline{Samtools Flagstat Summary}}} \\\\'."\n";
338 foreach (@s) {
339 $_ =~ m/^(\d+)\s(.+)$/;
340 my $one = $1;
341 my $two = $2;
342 $two =~ s/\s\+\s0\s//;
343 $two = ucfirst($two);
344 $one =~ s/%/\\%/g;
345 # remove '+ 0 ' from front
346 $two =~ s/\+\s0\s//;
347 # remove trailing from end
348 $two =~ s/(\s\+.*)|(:.*)/\)/;
349 $two =~ s/%/\\%/g;
350 $two =~ s/>=/\$\\ge\$/g;
351 $two = ucfirst($two);
352 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
353 }
354 print OUT '\end{tabular}\end{minipage}'."\n";
355 print OUT '\hspace{1.5cm}'."\n";
356 # target coverage statistics
357 print OUT '\begin{minipage}{0.4\linewidth}'."\n";
358 #print OUT '\vspace{-4.8em}'."\n";
359 print OUT '\begin{tabular}{ll}'."\n";
360 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Region Coverage}}} \\\\'."\n";
361 print OUT '\textbf{Number of Target Regions} & '.scalar(@coverages).' \\\\'."\n";
362 print OUT '\textbf{Minimal Region Coverage} & '.$min.' \\\\'."\n";
363 print OUT '\textbf{25\% Region Coverage} & '.$first.' \\\\'. "\n";
364 print OUT '\textbf{50\% (Median) Region Coverage} & '.$med.' \\\\'. "\n";
365 print OUT '\textbf{75\% Region Coverage} & '.$third.' \\\\'. "\n";
366 print OUT '\textbf{Maximal Region Coverage} & '.$max.' \\\\'. "\n";
367 print OUT '\textbf{Average Region Coverage} & '.int($eavg).' \\\\'. "\n";
368 print OUT '\textbf{Mapped On Target} & '.$spec.' \\\\'."\n";
369 print OUT '\multicolumn{2}{l}{\textbf{\underline{Target Base Coverage }}} \\\\'."\n";
370 print OUT '\textbf{Number of Target Bases} & '.$counter.' \\\\'."\n";
371 print OUT '\textbf{Average Base Coverage} & '.int($avg).' \\\\'. "\n";
372 print OUT '\textbf{Non-Covered Bases} & '.$dens{'0'}.' \\\\'."\n";
373 #print OUT '\textbf{Bases Covered $ge$ '.$frac.'xAvg.Cov} & '.
374 print OUT '\end{tabular}\end{minipage}}'."\n";
375 close OUT;
376
377 # 2. GLOBAL COVERAGE OVERVIEW PER GENE
378 my @failedexons;
379 my @allexons;
380 my @allregions;
381 my @failedregions;
382 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) {
383 # count columns
384 my $head = `head -n 1 $wd/Targets.Global.Coverage`;
385 chomp($head);
386 my @cols = split(/\t/,$head);
387 my $nrcols = scalar(@cols);
388 my $covcol = $nrcols - 3;
389 # Coverage Plots for each gene => barplots in R, table here.
390 open IN, "$wd/Targets.Global.Coverage";
391 my $currgroup = '';
392 my $startline = 0;
393 my $stopline = 0;
394 my $linecounter = 0;
395 my $nrcol=0;
396 while (<IN>) {
397 $linecounter++;
398 chomp($_);
399 my @c = split(/\t/,$_);
400 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]);
401 my $group = $c[3];
402 ## coverage failure?
403 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
404 push(@failedexons,$group);
405 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
406 }
407 ## store exon
408 push(@allexons,$group);
409 ## extract and check gene
410 $group =~ s/^(\S+)[\|\s](.+)/$1/;
411
412 my $scale;
413 if ($group ne $currgroup ) {
414 if ($currgroup ne '') {
415 # new gene, make plot.
416 open OUT, ">$wd/Rout/barplot.R";
417 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
418 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
419 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
420 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
421 print OUT 'coverage[coverage < 1] <- 1'."\n";
422 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
423 # coverage not whole target region => orange
424 print OUT 'covperc <- coveragetable[c('.$startline.':'.$stopline.'),'.$nrcols.']'."\n";
425 print OUT 'colors[covperc<1] <- "orange"'."\n";
426 # coverage below threshold => red
427 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";
428
429 if ($stopline - $startline > 20) {
430 $scale = 2;
431 }
432 else {
433 $scale = 1;
434 }
435 my $width = 480 * $scale;
436 my $height = 240 * $scale;
437 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
438 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
439 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)",ylim=ylim)'."\n";
440 print OUT 'text(mp, log10(coverage) + '.(0.4/$scale).',format(coverage),xpd = TRUE,srt=90)'."\n";
441 print OUT 'text(mp,par("usr")[3]-0.05,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
442 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
443 print OUT 'graphics.off()'."\n";
444 close OUT;
445 system("cd $wd/Rout && Rscript barplot.R") == 0 || die "Could not run barplot.R with following error '$!'\n";
446 if ($scale == 1) {
447 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
448 }
449 else {
450 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
451 }
452
453 }
454 $currgroup = $group;
455 $startline = $linecounter;
456 }
457 $stopline = $linecounter;
458 }
459 close IN;
460 if ($currgroup ne '') {
461 # last gene, make plot.
462 open OUT, ">$wd/Rout/barplot.R";
463 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n";
464 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
465 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
466 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
467 print OUT 'coverage[coverage < 1] <- 1'."\n";
468 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
469 print OUT 'colors[coverage<'.$thresh.'] <- "red"'."\n";
470
471 if ($stopline - $startline > 20) {
472 $scale = 2;
473 }
474 else {
475 $scale = 1;
476 }
477 my $width = 480 * $scale;
478 my $height = 240 * $scale;
479 print OUT 'png(file="../Plots/Coverage_'.$currgroup.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
480 print OUT 'ylim = c(0,max(max(log10(coverage),log10('.($thresh+20).'))))'."\n";
481 print OUT 'mp <- barplot(log10(coverage),col=colors,main="Exon Coverage for '.$currgroup.'",ylab="Log10(Coverage)", ylim=ylim)'."\n";
482 print OUT 'text(mp, log10(coverage) + log10(2),format(coverage),xpd = TRUE,srt=90)'."\n";
483 print OUT 'text(mp,par("usr")[3]-0.1,labels=entries,srt=45,adj=1,xpd=TRUE)'."\n";
484 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
485 print OUT 'graphics.off()'."\n";
486 close OUT;
487 system("cd $wd/Rout && Rscript barplot.R") == 0 || die "Could not run barplot.R with following error '$!'\n";
488 if ($scale == 1) {
489 push(@small,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
490 }
491 else {
492 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
493 }
494 }
495 ## print to TEX
496 open OUT, ">>$wd/Report/Report.tex";
497 print OUT '\subsection*{Gene Summaries}'."\n";
498 print OUT '\underline{Legend:} \\\\'."\n";
499 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
500 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n";
501 my $col = 1;
502 foreach (@small) {
503 if ($col > 2) {
504 $col = 1;
505 print OUT "\n";
506 }
507 print OUT '\begin{minipage}{0.5\linewidth}\centering'."\n";
508 print OUT $_."\n";
509 print OUT '\end{minipage}'."\n";
510 $col++;
511 }
512 ## new line
513 if ($col == 2) {
514 print OUT '\\\\'." \n";
515 }
516 foreach(@large) {
517 print OUT $_."\n";
518 }
519 close OUT;
520
521 }
522
523 # 3. Detailed overview of failed exons (globally failed)
524 if (exists($opts{'s'})) {
525 # count columns
526 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
527 chomp($head);
528 my $subtitle;
529 my @cols = split(/\t/,$head);
530 my $nrcols = scalar(@cols);
531 my $covcol = $nrcols;
532 my $poscol = $nrcols -1;
533 # tex section header
534 open TEX, ">>$wd/Report/Report.tex";
535 print TEX '\subsection*{Failed Exon Plots}'."\n";
536 $col = 1;
537 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
538 foreach(@failedregions) {
539 if ($col > 2) {
540 $col = 1;
541 print TEX "\n";
542 }
543 # which exon
544 my $region = $_;
545 my $exon = $filehash{$region}{'exon'};
546 # link exon to tmp file
547 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
548 ## determine transcript orientation and location
549 my $firstline = `head -n 1 $exonfile`;
550 my @firstcols = split(/\t/,$firstline);
551 my $orient = $firstcols[5];
552 my $genomicchr = $firstcols[0];
553 my $genomicstart = $firstcols[1];
554 my $genomicstop = $firstcols[2];
555 if ($orient eq '+') {
556 $bps = $genomicstop - $genomicstart + 1;
557 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
558 }
559 else {
560 $bps = $genomicstop - $genomicstart + 1;
561 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
562 }
563 # print Rscript
564 open OUT, ">$wd/Rout/exonplot.R";
565 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
566 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
567 print OUT 'coverage[coverage < 1] <- 1'."\n";
568 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
569
570 my $width = 480 ;
571 my $height = 240 ;
572 my $exonstr = $exon;
573 $exonstr =~ s/\s/_/g;
574 $exon =~ s/_/ /g;
575 $exon =~ s/\|/ /g;
576 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
577 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
578 if ($orient eq '-') {
579 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
580 print OUT 'mtext("'.$subtitle.'")'."\n";
581 }
582 else {
583 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
584 print OUT 'mtext("'.$subtitle.'")'."\n";
585 }
586 print OUT 'lines(positions,log10(coverage))'."\n";
587 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
588 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
589 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
590 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
591 print OUT 'graphics.off()'."\n";
592 close OUT;
593 # run R script
594 system("cd $wd/Rout && Rscript exonplot.R")== 0 || die "Could not run exonplot.R with following error '$!'\n";
595 # Add to .TEX
596 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
597 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
598 print TEX '\end{minipage}'."\n";
599 $col++;
600 }
601 }
602
603 ## plot failed (subregion) or all exons
604 if (exists($opts{'S'}) || exists($opts{'A'})) {
605 # count columns
606 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
607 chomp($head);
608 my @cols = split(/\t/,$head);
609 my $nrcols = scalar(@cols);
610 my $covcol = $nrcols;
611 my $poscol = $nrcols -1;
612 # tex section header
613 open TEX, ">>$wd/Report/Report.tex";
614 print TEX '\subsection*{Failed Exon Plots}'."\n";
615 if (exists($opts{'S'})) {
616 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
617 }
618 elsif (exists($opts{'A'})) {
619 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
620 }
621 $col = 1;
622 foreach(@allregions) {
623 if ($col > 2) {
624 $col = 1;
625 print TEX "\n";
626 }
627 # which exon
628 my $region = $_;
629 my $exon = $filehash{$region}{'exon'};
630 # grep exon to tmp file
631 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
632 ## determine transcript orientation.
633 my $firstline = `head -n 1 $exonfile`;
634 my @firstcols = split(/\t/,$firstline);
635 my $orient = $firstcols[5];
636 my $genomicchr = $firstcols[0];
637 my $genomicstart = $firstcols[1];
638 my $genomicstop = $firstcols[2];
639 my $subtitle;
640
641 if ($orient eq '+') {
642 my $bps = $genomicstop - $genomicstart + 1;
643 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
644
645 }
646 else {
647 my $bps = $genomicstop - $genomicstart + 1;
648 $subtitle = "Region 0-$bps: $genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
649
650 }
651
652 # check if failed
653 if (exists($opts{'S'})) {
654 my $cs = `cut -f $covcol '$exonfile' `;
655 my @c = split(/\n/,$cs);
656 @c = sort { $a <=> $b } @c;
657 if ($c[0] >= $thresh) {
658 # lowest coverage > threshold => skip
659 next;
660 }
661 }
662 # print Rscript
663 open OUT, ">$wd/Rout/exonplot.R";
664 print OUT 'coveragetable <- read.table("'.$exonfile.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
665 print OUT 'coverage <- coveragetable[,'.$covcol.']'."\n";
666 print OUT 'coverage[coverage < 1] <- 1'."\n";
667 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
668 my $width = 480 ;
669 my $height = 240 ;
670 my $exonstr = $exon;
671 $exonstr =~ s/\s/_/g;
672 $exon =~ s/_/ /g;
673 $exon =~ s/\|/ /g;
674 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.')'."\n";
675 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
676 if ($orient eq '-') {
677 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
678 print OUT 'mtext("'.$subtitle.'")'."\n";
679 }
680 else {
681 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",sub="(Transcribed from plus strand)")'."\n";
682 print OUT 'mtext("'.$subtitle.'")'."\n";
683 }
684
685 print OUT 'lines(positions,log10(coverage))'."\n";
686 print OUT 'abline(h=log10('.$thresh.'),lwd=4,col=rgb(255,0,0,100,maxColorValue=255))'."\n";
687 print OUT 'failedpos <- positions[coverage<'.$thresh.']'."\n";
688 print OUT 'failedcov <- coverage[coverage<'.$thresh.']'."\n";
689 print OUT 'points(failedpos,log10(failedcov),col="red",pch=19)'."\n";
690 print OUT 'graphics.off()'."\n";
691 close OUT;
692 # run R script
693 system("cd $wd/Rout && Rscript exonplot.R") == 0 || die "Could not run exonplot.R with following error '$!'\n";
694 # Add to .TEX
695 print TEX '\begin{minipage}{0.5\linewidth}\centering'."\n";
696 print TEX '\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$exonstr.'.png}'."\n";
697 print TEX '\end{minipage}'."\n";
698 $col++;
699 }
700 }
701 ## list failed exons
702 if (exists($opts{'L'})) {
703 # count columns
704 my $subtitle;
705 my $head = `head -n 1 $wd/Targets.Position.Coverage`;
706 chomp($head);
707 my @cols = split(/\t/,$head);
708 my $nrcols = scalar(@cols);
709 my $covcol = $nrcols;
710 my $poscol = $nrcols -1;
711 ## hash to print
712 # tex section header
713 open TEX, ">>$wd/Report/Report.tex";
714 print TEX '\subsection*{List of Failed Exons}'."\n";
715 print TEX '\underline{NOTE:} ALL exons were tested for local coverage $<$'.$thresh.' \\\\'."\n";
716 print TEX '{\footnotesize\begin{longtable}[l]{@{\extracolsep{\fill}}llll}'."\n".'\hline'."\n";
717 print TEX '\textbf{Target Name} & \textbf{Genomic Position} & \textbf{Avg.Coverage} & \textbf{Min.Coverage} \\\\'."\n".'\hline'."\n";
718 print TEX '\endhead'."\n";
719 print TEX '\hline '."\n".'\multicolumn{4}{r}{{\textsl{\footnotesize Continued on next page}}} \\\\ '."\n".'\hline' ."\n". '\endfoot' . "\n". '\endlastfoot' . "\n";
720
721 $col = 1;
722 open IN, "$wd/Targets.Global.Coverage";
723 while (<IN>) {
724 chomp($_);
725 my @p = split(/\t/,$_);
726 my $region = $p[0].'-'.$p[1].'-'.$p[2];
727 my $exon = $filehash{$region}{'exon'};
728 # grep exon to tmp file
729 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt";
730 ## determine transcript orientation.
731 my $firstline = `head -n 1 $exonfile`;
732 my @firstcols = split(/\t/,$firstline);
733 my $orient = $firstcols[5];
734 my $genomicchr = $firstcols[0];
735 my $genomicstart = $firstcols[1];
736 my $genomicstop = $firstcols[2];
737
738 if ($orient eq '+') {
739 my $bps = $genomicstop - $genomicstart + 1;
740 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
741
742 }
743 else {
744 my $bps = $genomicstop - $genomicstart + 1;
745 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
746 }
747
748 # check if failed
749 my $cs = `cut -f $covcol '$exonfile' `;
750 my @c = split(/\n/,$cs);
751 my ($avg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@c);
752
753 if ($min >= $thresh) {
754 # lowest coverage > threshold => skip
755 next;
756 }
757
758 # print to .tex table
759 if (length($exon) > 30) {
760 $exon = substr($exon,0,27) . '...';
761 }
762 $exon =~ s/_/ /g;
763 $exon =~ s/\|/ /g;
764
765 print TEX "$exon & $subtitle & ".int($avg)." & $min ".'\\\\'."\n";
766 }
767 close IN;
768 print TEX '\hline'."\n";
769 print TEX '\end{longtable}}'."\n";
770 close TEX;
771 }
772
773
774 ## Close document
775 open OUT, ">>$wd/Report/Report.tex";
776 print OUT '\label{endofdoc}'."\n";
777 print OUT '\end{document}'."\n";
778 close OUT;
779 system("cd $wd/Report && tectonic Report.tex") == 0 || die "Could not run tectonic with following error '$!'\n";
780
781 ## mv report to output file
782 system("cp -f $wd/Report/Report.pdf '$pdffile'");
783 ##create tar.gz file
784 system("mkdir $wd/Results");
785 system("cp -Rf $wd/Plots $wd/Results/");
786 system("cp -Rf $wd/Report/ $wd/Results/");
787 if (-e "$wd/Targets.Global.Coverage") {
788 system("cp -Rf $wd/Targets.Global.Coverage $wd/Results/");
789 }
790 if (-e "$wd/Targets.Position.Coverage") {
791 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");
792 }
793
794
795 exit;
796
797 ###############
798 ## FUNCTIONS ##
799 ###############
800 sub arraystats{
801 my @array = @_;
802 my $count = scalar(@array);
803 @array = sort { $a <=> $b } @array;
804 # median
805 my $median = 0;
806 if ($count % 2) {
807 $median = $array[int($count/2)];
808 } else {
809 $median = ($array[$count/2] + $array[$count/2 - 1]) / 2;
810 }
811 # average
812 my $sum = 0;
813 foreach (@array) { $sum += $_; }
814 my $average = $sum / $count;
815 # quantiles (rounded)
816 my $quart = int($count/4) ;
817 my $first = $array[$quart];
818 my $third = $array[($quart*3)];
819 my $min = $array[0];
820 my $max = $array[($count-1)];
821 return ($average,$median,$min,$max,$first,$third,$sum);
822 }
823
824 sub GlobalSummary {
825 my ($bam,$targets) = @_;
826
827 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
828 if (exists($commandsrun{$command})) {
829 return;
830 }
831 system($command);
832 $commandsrun{$command} = 1;
833 }
834
835 sub CoveragePerRegion {
836 my ($bam,$targets) = @_;
837 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage";
838 if (exists($commandsrun{$command})) {
839 return;
840 }
841 system($command);
842 $commandsrun{$command} = 1;
843 }
844
845 sub SubRegionCoverage {
846 my ($bam,$targets) = @_;
847 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";
848 system($command);
849 $commandsrun{$command} = 1;
850 }
851