comparison CoverageReport.pl @ 26:859999cb135b draft

revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
author geert-vandeweyer
date Wed, 29 Nov 2017 08:12:27 -0500
parents 6cb012c8497a
children 576bfc1586f9
comparison
equal deleted inserted replaced
25:6cb012c8497a 26:859999cb135b
24 # A : (A)ll exons will be plotted. 24 # A : (A)ll exons will be plotted.
25 # L : (L)ist failed exons instead of plotting 25 # L : (L)ist failed exons instead of plotting
26 # m : (m)inimal Coverage threshold 26 # m : (m)inimal Coverage threshold
27 # f : fraction of average as threshold 27 # f : fraction of average as threshold
28 # n : sample (n)ame. 28 # n : sample (n)ame.
29 29 # T : collapse overlapping Target regions.
30 30
31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; 31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ;
32 32
33 # make output directory in (tmp) working dir 33 # make output directory in (tmp) working dir
34 our $wd = "/tmp/Coverage.".int(rand(1000)); 34 our $wd = "/tmp/Coverage.".int(rand(1000));
35
35 while (-d $wd) { 36 while (-d $wd) {
36 $wd = "/tmp/Coverage.".int(rand(1000)); 37 $wd = "/tmp/Coverage.".int(rand(1000));
37 38
38 } 39 }
39 system("mkdir $wd"); 40 system("mkdir $wd");
40 41 $wd = "/tmp/Coverage.993";
42 print "wd : $wd\n";
41 ## variables 43 ## variables
42 our %commandsrun = (); 44 our %commandsrun = ();
43 45
44 if (!exists($opts{'b'}) || !-e $opts{'b'}) { 46 if (!exists($opts{'b'}) || !-e $opts{'b'}) {
45 die('Bam File not found'); 47 die('Bam File not found');
87 } 89 }
88 my $targets = $opts{'t'}; 90 my $targets = $opts{'t'};
89 my $tmptargets = "$wd/collapsedtargets.bed"; 91 my $tmptargets = "$wd/collapsedtargets.bed";
90 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); 92 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed");
91 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); 93 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets");
92 $opts{'t'} = $tmptargets; 94 open IN, $tmptargets;
93 } 95 open OUT, ">$wd/collapsed.targets.renamed.bed";
94 96 # we assume that overlapping fragments come from isoforms, not from different genes.
95 # 1. Global Summary => default 97 my %counters = ();
96 &GlobalSummary($opts{'b'}, $opts{'t'}); 98 my @genes = ();
99 while (<IN>) {
100 chomp;
101 my @p = split(/\t/,$_);
102 my @g = split(/,/,$p[3]);
103 $g[0] =~ m/(\S+)\(.*/;
104 my $gene = $1;
105 if (!defined($counters{$gene})) {
106 push(@genes,$gene);
107 $counters{$gene}{'lines'} = ();
108 $counters{$gene}{'orient'} = $p[5];
109 }
110 $p[3] = $gene."(COLLAPSED)";
111 push(@{$counters{$gene}{'lines'}},\@p);
112 }
113 close IN;
114 foreach my $gene (@genes) {
115 if ($counters{$gene}{'orient'} eq '-') {
116 my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1;
117 foreach my $line (@{$counters{$gene}{'lines'}}) {
118 $idx--;
119 $line->[3] .= "|Region_$idx";
120 print OUT join("\t",@$line)."\n";
121 }
122 }
123 else {
124 my $idx = 0;
125 foreach my $line (@{$counters{$gene}{'lines'}}) {
126 $idx++;
127 $line->[3] .= "|Region_$idx";
128 print OUT join("\t",@$line)."\n";
129 }
130 }
131 }
132 close OUT;
133 $opts{'t'} = "$wd/collapsed.targets.renamed.bed";
134 }
135 # 1. Coverage per exon
136 # included in 2.
97 137
98 # 2. Coverage per position 138 # 2. Coverage per position
99 &SubRegionCoverage($opts{'b'}, $opts{'t'}); 139 &SubRegionCoverage($opts{'b'}, $opts{'t'});
100 our %filehash; 140 our %filehash;
141 our $tcov;
101 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { 142 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) {
102 system("mkdir $wd/SplitFiles"); 143 system("mkdir -p $wd/SplitFiles");
144 system("rm $wd/SplitFiles/*");
103 ## get position coverages 145 ## get position coverages
104 ## split input files 146 ## split input files
105 open IN, "$wd/Targets.Position.Coverage"; 147 open IN, "$wd/Targets.Position.Coverage";
148 open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt";
106 my $fileidx = 0; 149 my $fileidx = 0;
107 my $currreg = ''; 150 my $currreg = '';
151 my $elength = 0;
152 my $esum = 0;
153 my $eline = "";
154 my %out = ();
108 while (<IN>) { 155 while (<IN>) {
109 my $line = $_; 156 my $line = $_;
110 chomp($line); 157 chomp($line);
111 my @p = split(/\t/,$line); 158 my @p = split(/\t/,$line);
112 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3]; 159 my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]";
113 my $ex = $p[3]; 160 my $ex = $p[3];
161 my $epos = $p[1];
162 # average exon coverage calculation.
163 if (!defined($out{$ex})) {
164 $out{$ex} = ();
165 }
166 if (!defined($out{$ex}{$p[0]})) {
167 $out{$ex}{$p[0]} = ();
168 }
169 # needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations.
170 if (!defined($out{$ex}{$p[0]}{$epos})) {
171 $out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]";
172 $out{$ex}{$p[0]}{$epos}{'c'} = ();
173
174 }
175 push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]);
176 # splitting files
114 if ($reg ne $currreg) { 177 if ($reg ne $currreg) {
115 ## new exon open new outfile 178 ## new exon open new outfile
116 if ($currreg ne '') { 179 if ($currreg ne '') {
180 print BCOVSUM "$eline\t".($esum/$elength)."\n";
117 ## filehandle is open. close it 181 ## filehandle is open. close it
118 close OUT; 182 close OUT;
119 } 183 }
184 $eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]";
185 $esum = 0;
186 $elength = 0;
120 if (!exists($filehash{$reg})) { 187 if (!exists($filehash{$reg})) {
121 $fileidx++; 188 $fileidx++;
122 $filehash{$reg}{'idx'} = $fileidx; 189 $filehash{$reg}{'idx'} = $fileidx;
123 $filehash{$reg}{'exon'} = $ex; 190 $filehash{$reg}{'exon'} = $reg;
124 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; 191 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt";
125 $currreg = $reg; 192 $currreg = $reg;
126 } 193 }
127 else { 194 else {
128 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; 195 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt";
129 $currreg = $reg; 196 $currreg = $reg;
130 } 197 }
131 } 198 }
132 ## print the line to the open filehandle. 199 ## print the line to the open filehandle.
133 print OUT "$line\n"; 200 print OUT "$line\n";
201 $esum += $p[-1];
202 $elength++;
134 } 203 }
135 close OUT; 204 close OUT;
136 close IN; 205 close IN;
137 206 if ($esum > 0) {
138 } 207 print BCOVSUM "$eline\t".($esum/$elength)."\n";
139 208 }
209 close BCOVSUM;
210 open OUT, ">$wd/avg.tcov.txt";
211
212 foreach my $tr_ex (sort {$a cmp $b} keys(%out)) {
213 foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) {
214 foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) {
215 my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}});
216 my $frac = 0;
217 if ($nr > 0) {
218 $frac = ($nrcov / $nr);
219 }
220 print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n";
221 }
222 }
223 }
224 close OUT;
225 $tcov = "$wd/avg.tcov.txt";
226
227 }
140 ## sort output files according to targets file 228 ## sort output files according to targets file
141 if (exists($opts{'r'}) ) { 229 my %hash = ();
142 my %hash = (); 230 open IN, $tcov;
143 open IN, "$wd/Targets.Global.Coverage"; 231 while (<IN>) {
144 while (<IN>) { 232 my @p = split(/\t/,$_) ;
145 my @p = split(/\t/,$_) ; 233 $hash{$p[3].':'.$p[1]} = $_;
146 $hash{$p[3]} = $_; 234 }
147 } 235 close IN;
148 close IN; 236 open OUT, ">$tcov";
149 open OUT, ">$wd/Targets.Global.Coverage"; 237 open IN, $opts{'t'};
150 open IN, $opts{'t'}; 238 while (<IN>) {
151 while (<IN>) { 239 my @p = split(/\t/,$_) ;
152 my @p = split(/\t/,$_) ; 240 print OUT $hash{$p[3].':'.$p[1]};
153 print OUT $hash{$p[3]}; 241 }
154 } 242 close IN;
155 close IN; 243 close OUT;
156 close OUT;
157 }
158 244
159 245
160 #################################### 246 ####################################
161 ## PROCESS RESULTS & CREATE PLOTS ## 247 ## PROCESS RESULTS & CREATE PLOTS ##
162 #################################### 248 ####################################
209 # Get number of reads mapped in total 295 # Get number of reads mapped in total
210 ## updated on 2012-10-1 !! 296 ## updated on 2012-10-1 !!
211 $totalmapped = $s[2]; 297 $totalmapped = $s[2];
212 $totalmapped =~ s/^(\d+)(\s.+)/$1/; 298 $totalmapped =~ s/^(\d+)(\s.+)/$1/;
213 # count columns 299 # count columns
214 my $head = `head -n 1 $wd/Targets.Global.Coverage`; 300 my $head = `head -n 1 $tcov`;
215 chomp($head); 301 chomp($head);
216 my @cols = split(/\t/,$head); 302 my @cols = split(/\t/,$head);
217 my $nrcols = scalar(@cols); 303 my $nrcols = scalar(@cols);
218 my $covcol = $nrcols - 3; 304 my $covcol = $nrcols - 3;
219 # get min/max/median/average coverage => values 305 # get min/max/median/average coverage => values
220 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`; 306 my $covs = `cut -f $covcol $tcov`;
221 my @coverages = split(/\n/,$covs); 307 my @coverages = split(/\n/,$covs);
222 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); 308 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages);
223 my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); 309 my $spec = '';
310 if ($totalmapped != 0 && $totalmapped ne '') {
311 $spec = sprintf("%.1f",($ontarget / $totalmapped)*100);
312 }
224 # get min/max/median/average coverage => boxplot in R 313 # get min/max/median/average coverage => boxplot in R
225 open OUT, ">$wd/Rout/boxplot.R"; 314 open OUT, ">$wd/Rout/boxplot.R";
226 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; 315 print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
227 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; 316 print OUT 'coverage <- coverage[,'.$covcol.']'."\n";
228 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; 317 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n";
229 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; 318 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n";
230 print OUT 'graphics.off()'."\n"; 319 print OUT 'graphics.off()'."\n";
231 close OUT; 320 close OUT;
321 print "Running boxplot.R : \n";
232 system("cd $wd/Rout && Rscript boxplot.R"); 322 system("cd $wd/Rout && Rscript boxplot.R");
233 323
234 ## global nt coverage plot 324 ## global nt coverage plot
235 ## use perl to make histogram (lower memory) 325 ## use perl to make histogram (lower memory)
236 open IN, "$wd/Targets.Position.Coverage"; 326 open IN, "$wd/Targets.Position.Coverage";
321 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; 411 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n";
322 412
323 print OUT 'graphics.off()'."\n"; 413 print OUT 'graphics.off()'."\n";
324 414
325 close OUT; 415 close OUT;
416 print "Running ntplot.r\n";
326 system("cd $wd/Rout && Rscript ntplot.R"); 417 system("cd $wd/Rout && Rscript ntplot.R");
327 ## PRINT TO .TEX FILE 418 ## PRINT TO .TEX FILE
328 open OUT, ">>$wd/Report/Report.tex"; 419 open OUT, ">>$wd/Report/Report.tex";
329 # average coverage overviews 420 # average coverage overviews
330 print OUT '\subsection*{Overall Summary}'."\n"; 421 print OUT '\subsection*{Overall Summary}'."\n";
357 $two =~ s/(\s\+.*)|(:.*)/\)/; 448 $two =~ s/(\s\+.*)|(:.*)/\)/;
358 $two =~ s/%/\\%/g; 449 $two =~ s/%/\\%/g;
359 $two =~ s/>=/\$\\ge\$/g; 450 $two =~ s/>=/\$\\ge\$/g;
360 $two = ucfirst($two); 451 $two = ucfirst($two);
361 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; 452 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n";
453
454
362 } 455 }
363 print OUT '\end{tabular}\end{minipage}'."\n"; 456 print OUT '\end{tabular}\end{minipage}'."\n";
364 print OUT '\hspace{1.5cm}'."\n"; 457 print OUT '\hspace{1.5cm}'."\n";
365 # target coverage statistics 458 # target coverage statistics
366 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; 459 print OUT '\begin{minipage}{0.4\linewidth}'."\n";
386 # 2. GLOBAL COVERAGE OVERVIEW PER GENE 479 # 2. GLOBAL COVERAGE OVERVIEW PER GENE
387 @failedexons; 480 @failedexons;
388 @allexons; 481 @allexons;
389 @allregions; 482 @allregions;
390 @failedregions; 483 @failedregions;
391 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) { 484 %failednames;
485 %allnames;
486
487 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) {
392 # count columns 488 # count columns
393 my $head = `head -n 1 $wd/Targets.Global.Coverage`; 489 my $head = `head -n 1 '$tcov'`;
394 chomp($head); 490 chomp($head);
395 my @cols = split(/\t/,$head); 491 my @cols = split(/\t/,$head);
396 my $nrcols = scalar(@cols); 492 my $nrcols = scalar(@cols);
397 my $covcol = $nrcols - 3; 493 my $covcol = $nrcols - 3;
398 # Coverage Plots for each gene => barplots in R, table here. 494 # Coverage Plots for each gene => barplots in R, table here.
399 open IN, "$wd/Targets.Global.Coverage"; 495 open IN, "$tcov";
400 my $currgroup = ''; 496 my $currgroup = '';
401 my $startline = 0; 497 my $startline = 0;
402 my $stopline = 0; 498 my $stopline = 0;
403 $linecounter = 0; 499 $linecounter = 0;
404 while (<IN>) { 500 while (<IN>) {
405 $linecounter++; 501 $linecounter++;
406 chomp($_); 502 chomp($_);
407 my @c = split(/\t/,$_); 503 my @c = split(/\t/,$_);
408 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]); 504 my $reg = $c[0].'-'.$c[1].'-'.$c[2];
409 my $group = $c[3]; 505 push(@allregions,$reg);
506 my $group = $reg .": ".$c[3];
507 #my $gene = $c[3];
410 ## coverage failure? 508 ## coverage failure?
411 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { 509 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) {
412 push(@failedexons,$group); 510 push(@failedexons,$group);
413 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); 511 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]);
512 $failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
414 } 513 }
415 ## store exon 514 ## store exon
416 push(@allexons,$group); 515 push(@allexons,$group);
516 $allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2];
517 if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) {
518 ## no need for barplots
519 next;
520 }
417 ## extract and check gene 521 ## extract and check gene
418 $group =~ s/^(\S+)[\|\s](.+)/$1/; 522 my $gene = $group;
419 if ($group ne $currgroup ) { 523 $gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/;
524 if ($gene ne $currgroup ) {
420 if ($currgroup ne '') { 525 if ($currgroup ne '') {
421 # new gene, make plot. 526 # new gene, make plot.
422 open OUT, ">$wd/Rout/barplot.R"; 527 open OUT, ">$wd/Rout/barplot.R";
423 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; 528 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
424 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; 529 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
425 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; 530 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
426 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; 531 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
427 print OUT 'coverage[coverage < 1] <- 1'."\n"; 532 print OUT 'coverage[coverage < 1] <- 1'."\n";
428 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; 533 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
455 else { 560 else {
456 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); 561 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}');
457 } 562 }
458 563
459 } 564 }
460 $currgroup = $group; 565 $currgroup = $gene;
461 $startline = $linecounter; 566 $startline = $linecounter;
462 } 567 }
463 $stopline = $linecounter; 568 $stopline = $linecounter;
464 } 569 }
465 close IN; 570 close IN;
466 if ($currgroup ne '') { 571 if ($currgroup ne '') {
467 # last gene, make plot. 572 # last gene, make plot.
468 open OUT, ">$wd/Rout/barplot.R"; 573 open OUT, ">$wd/Rout/barplot.R";
469 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; 574 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n";
470 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; 575 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n";
471 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; 576 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n";
472 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; 577 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n";
473 print OUT 'coverage[coverage < 1] <- 1'."\n"; 578 print OUT 'coverage[coverage < 1] <- 1'."\n";
474 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; 579 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n";
500 } 605 }
501 ## print to TEX 606 ## print to TEX
502 open OUT, ">>$wd/Report/Report.tex"; 607 open OUT, ">>$wd/Report/Report.tex";
503 print OUT '\subsection*{Gene Summaries}'."\n"; 608 print OUT '\subsection*{Gene Summaries}'."\n";
504 print OUT '\underline{Legend:} \\\\'."\n"; 609 print OUT '\underline{Legend:} \\\\'."\n";
505 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; 610 print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n";
506 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n"; 611 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n";
507 $col = 1; 612 $col = 1;
508 foreach (@small) { 613 foreach (@small) {
509 if ($col > 2) { 614 if ($col > 2) {
510 $col = 1; 615 $col = 1;
511 print OUT "\n"; 616 print OUT "\n";
538 # tex section header 643 # tex section header
539 open TEX, ">>$wd/Report/Report.tex"; 644 open TEX, ">>$wd/Report/Report.tex";
540 print TEX '\subsection*{Failed Exon Plots}'."\n"; 645 print TEX '\subsection*{Failed Exon Plots}'."\n";
541 $col = 1; 646 $col = 1;
542 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; 647 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n";
543 foreach(@failedregions) { 648 foreach(sort(keys(%failednames)) ) {
649 #foreach(@failedregions) {
544 if ($col > 2) { 650 if ($col > 2) {
545 $col = 1; 651 $col = 1;
546 print TEX "\n"; 652 print TEX "\n";
547 } 653 }
548 # which exon 654 # which exon
549 my $region = $_; 655 my $group = $_;
550 my $exon = $filehash{$region}{'exon'}; 656 my ($region,$name) = split(/: /,$group);
657 #my $region = $failednames{$_};
658 my $exon = $filehash{$group}{'exon'};
551 # link exon to tmp file 659 # link exon to tmp file
552 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; 660 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
553 ## determine transcript orientation and location 661 ## determine transcript orientation and location
554 my $firstline = `head -n 1 $exonfile`; 662 my $firstline = `head -n 1 $exonfile`;
555 my @firstcols = split(/\t/,$firstline); 663 my @firstcols = split(/\t/,$firstline);
556 my $orient = $firstcols[5]; 664 my $orient = $firstcols[5];
557 my $genomicchr = $firstcols[0]; 665 my $genomicchr = $firstcols[0];
574 682
575 my $width = 480 ; 683 my $width = 480 ;
576 my $height = 240 ; 684 my $height = 240 ;
577 my $exonstr = $exon; 685 my $exonstr = $exon;
578 $exonstr =~ s/\s/_/g; 686 $exonstr =~ s/\s/_/g;
687 $exonstr =~ s/:/_/g;
579 $exon =~ s/_/ /g; 688 $exon =~ s/_/ /g;
580 $exon =~ s/\|/ /g; 689 $exon =~ s/\|/ /g;
690 $exon =~ s/chr.*: (.*)$/$1/;
581 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; 691 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
582 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; 692 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
583 if ($orient eq '-') { 693 if ($orient eq '-') {
584 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; 694 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
585 print OUT 'mtext("'.$subtitle.'")'."\n"; 695 print OUT 'mtext("'.$subtitle.'")'."\n";
622 } 732 }
623 elsif (exists($opts{'A'})) { 733 elsif (exists($opts{'A'})) {
624 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; 734 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n";
625 } 735 }
626 $col = 1; 736 $col = 1;
627 foreach(@allregions) { 737 foreach(sort(keys(%allnames))) {
738
739 #foreach(@allregions) {
628 if ($col > 2) { 740 if ($col > 2) {
629 $col = 1; 741 $col = 1;
630 print TEX "\n"; 742 print TEX "\n";
631 } 743 }
744 my $group = $_;
745 my ($region,$name) = split(/: /,$group);
632 # which exon 746 # which exon
633 my $region = $_; 747 #my $region = $_;
634 my $exon = $filehash{$region}{'exon'}; 748 #my $region = $allnames{$_};
749 my $exon = $filehash{$group}{'exon'};
635 # grep exon to tmp file 750 # grep exon to tmp file
636 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; 751 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt";
637 ## determine transcript orientation. 752 ## determine transcript orientation.
638 my $firstline = `head -n 1 $exonfile`; 753 my $firstline = `head -n 1 $exonfile`;
639 my @firstcols = split(/\t/,$firstline); 754 my @firstcols = split(/\t/,$firstline);
640 my $orient = $firstcols[5]; 755 my $orient = $firstcols[5];
641 my $genomicchr = $firstcols[0]; 756 my $genomicchr = $firstcols[0];
671 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; 786 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n";
672 my $width = 480 ; 787 my $width = 480 ;
673 my $height = 240 ; 788 my $height = 240 ;
674 my $exonstr = $exon; 789 my $exonstr = $exon;
675 $exonstr =~ s/\s/_/g; 790 $exonstr =~ s/\s/_/g;
791 $exonstr =~ s/:/_/g;
676 $exon =~ s/_/ /g; 792 $exon =~ s/_/ /g;
677 $exon =~ s/\|/ /g; 793 $exon =~ s/\|/ /g;
794 $exon =~ s/^chr.*: (.*)$/$1/;
678 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; 795 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n";
679 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; 796 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n";
680 if ($orient eq '-') { 797 if ($orient eq '-') {
681 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; 798 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n";
682 print OUT 'mtext("'.$subtitle.'")'."\n"; 799 print OUT 'mtext("'.$subtitle.'")'."\n";
737 my $genomicchr = $firstcols[0]; 854 my $genomicchr = $firstcols[0];
738 my $genomicstart = $firstcols[1]; 855 my $genomicstart = $firstcols[1];
739 my $genomicstop = $firstcols[2]; 856 my $genomicstop = $firstcols[2];
740 857
741 if ($orient eq '+') { 858 if ($orient eq '+') {
742 $bps = $genomicstop - $genomicstart + 1; 859 #$bps = $genomicstop - $genomicstart + 1;
743 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); 860 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop);
744 861
745 } 862 }
746 else { 863 else {
747 $bps = $genomicstop - $genomicstart + 1; 864 #$bps = $genomicstop - $genomicstart + 1;
748 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); 865 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop);
749 } 866 }
750 867
751 # check if failed 868 # check if failed
752 my $cs = `cut -f $covcol '$exonfile' `; 869 my $cs = `cut -f $covcol '$exonfile' `;
794 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); 911 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/");
795 } 912 }
796 913
797 system("cd $wd && tar czf '$tarfile' Results/"); 914 system("cd $wd && tar czf '$tarfile' Results/");
798 ## clean up (galaxy stores outside wd) 915 ## clean up (galaxy stores outside wd)
799 system("rm -Rf $wd"); 916 #system("rm -Rf $wd");
800 ############### 917 ###############
801 ## FUNCTIONS ## 918 ## FUNCTIONS ##
802 ############### 919 ###############
803 sub arraystats{ 920 sub arraystats{
804 my @array = @_; 921 my @array = @_;
805 my $count = scalar(@array); 922 my $count = scalar(@array);
923 if ($count == 0 ) {
924 return (0,0,0,0,0,0,0);
925 }
806 @array = sort { $a <=> $b } @array; 926 @array = sort { $a <=> $b } @array;
807 # median 927 # median
808 my $median = 0; 928 my $median = 0;
809 if ($count % 2) { 929 if ($count % 2) {
810 $median = $array[int($count/2)]; 930 $median = $array[int($count/2)];
822 my $min = $array[0]; 942 my $min = $array[0];
823 my $max = $array[($count-1)]; 943 my $max = $array[($count-1)];
824 return ($average,$median,$min,$max,$first,$third,$sum); 944 return ($average,$median,$min,$max,$first,$third,$sum);
825 } 945 }
826 946
827 sub GlobalSummary { 947 sub GetStats {
828 my ($bam,$targets) = @_; 948 my ($aref) = @_;
829 949 if (scalar(@$aref) == 0) {
830 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; 950 return qw/0 0/;
831 if (exists($commandsrun{$command})) { 951 }
832 return; 952 # median
833 } 953 my @s = sort {$a <=> $b } @$aref;
834 system($command); 954 my $nrzero = 0;
835 $commandsrun{$command} = 1; 955 my $len = scalar(@s);
836 } 956 for (my $i = 0; $i< $len;$i++) {
837 957 if ($s[$i] == 0) {
838 sub CoveragePerRegion { 958 $nrzero++;
839 my ($bam,$targets) = @_; 959 }
840 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; 960 else {
841 if (exists($commandsrun{$command})) { 961 last;
842 return; 962 }
843 } 963 }
844 system($command); 964 my $nrcov = $len - $nrzero;
845 $commandsrun{$command} = 1; 965 # avg
846 } 966 my $avg = 0;
967 foreach (@s) { $avg += $_ };
968 $avg = sprintf("%.1f",($avg / scalar(@s)));
969 return($avg,$len,$nrcov);
970 }
971
847 972
848 sub SubRegionCoverage { 973 sub SubRegionCoverage {
849 my ($bam,$targets) = @_; 974 my ($bam,$targets) = @_;
850 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; 975 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage";
851 system($command); 976 system($command);