Mercurial > repos > geert-vandeweyer > coverage_report
comparison CoverageReport.pl @ 26:859999cb135b draft
revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
| author | geert-vandeweyer |
|---|---|
| date | Wed, 29 Nov 2017 08:12:27 -0500 |
| parents | 6cb012c8497a |
| children | 576bfc1586f9 |
comparison
equal
deleted
inserted
replaced
| 25:6cb012c8497a | 26:859999cb135b |
|---|---|
| 24 # A : (A)ll exons will be plotted. | 24 # A : (A)ll exons will be plotted. |
| 25 # L : (L)ist failed exons instead of plotting | 25 # L : (L)ist failed exons instead of plotting |
| 26 # m : (m)inimal Coverage threshold | 26 # m : (m)inimal Coverage threshold |
| 27 # f : fraction of average as threshold | 27 # f : fraction of average as threshold |
| 28 # n : sample (n)ame. | 28 # n : sample (n)ame. |
| 29 | 29 # T : collapse overlapping Target regions. |
| 30 | 30 |
| 31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; | 31 getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; |
| 32 | 32 |
| 33 # make output directory in (tmp) working dir | 33 # make output directory in (tmp) working dir |
| 34 our $wd = "/tmp/Coverage.".int(rand(1000)); | 34 our $wd = "/tmp/Coverage.".int(rand(1000)); |
| 35 | |
| 35 while (-d $wd) { | 36 while (-d $wd) { |
| 36 $wd = "/tmp/Coverage.".int(rand(1000)); | 37 $wd = "/tmp/Coverage.".int(rand(1000)); |
| 37 | 38 |
| 38 } | 39 } |
| 39 system("mkdir $wd"); | 40 system("mkdir $wd"); |
| 40 | 41 $wd = "/tmp/Coverage.993"; |
| 42 print "wd : $wd\n"; | |
| 41 ## variables | 43 ## variables |
| 42 our %commandsrun = (); | 44 our %commandsrun = (); |
| 43 | 45 |
| 44 if (!exists($opts{'b'}) || !-e $opts{'b'}) { | 46 if (!exists($opts{'b'}) || !-e $opts{'b'}) { |
| 45 die('Bam File not found'); | 47 die('Bam File not found'); |
| 87 } | 89 } |
| 88 my $targets = $opts{'t'}; | 90 my $targets = $opts{'t'}; |
| 89 my $tmptargets = "$wd/collapsedtargets.bed"; | 91 my $tmptargets = "$wd/collapsedtargets.bed"; |
| 90 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); | 92 system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); |
| 91 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); | 93 system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); |
| 92 $opts{'t'} = $tmptargets; | 94 open IN, $tmptargets; |
| 93 } | 95 open OUT, ">$wd/collapsed.targets.renamed.bed"; |
| 94 | 96 # we assume that overlapping fragments come from isoforms, not from different genes. |
| 95 # 1. Global Summary => default | 97 my %counters = (); |
| 96 &GlobalSummary($opts{'b'}, $opts{'t'}); | 98 my @genes = (); |
| 99 while (<IN>) { | |
| 100 chomp; | |
| 101 my @p = split(/\t/,$_); | |
| 102 my @g = split(/,/,$p[3]); | |
| 103 $g[0] =~ m/(\S+)\(.*/; | |
| 104 my $gene = $1; | |
| 105 if (!defined($counters{$gene})) { | |
| 106 push(@genes,$gene); | |
| 107 $counters{$gene}{'lines'} = (); | |
| 108 $counters{$gene}{'orient'} = $p[5]; | |
| 109 } | |
| 110 $p[3] = $gene."(COLLAPSED)"; | |
| 111 push(@{$counters{$gene}{'lines'}},\@p); | |
| 112 } | |
| 113 close IN; | |
| 114 foreach my $gene (@genes) { | |
| 115 if ($counters{$gene}{'orient'} eq '-') { | |
| 116 my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1; | |
| 117 foreach my $line (@{$counters{$gene}{'lines'}}) { | |
| 118 $idx--; | |
| 119 $line->[3] .= "|Region_$idx"; | |
| 120 print OUT join("\t",@$line)."\n"; | |
| 121 } | |
| 122 } | |
| 123 else { | |
| 124 my $idx = 0; | |
| 125 foreach my $line (@{$counters{$gene}{'lines'}}) { | |
| 126 $idx++; | |
| 127 $line->[3] .= "|Region_$idx"; | |
| 128 print OUT join("\t",@$line)."\n"; | |
| 129 } | |
| 130 } | |
| 131 } | |
| 132 close OUT; | |
| 133 $opts{'t'} = "$wd/collapsed.targets.renamed.bed"; | |
| 134 } | |
| 135 # 1. Coverage per exon | |
| 136 # included in 2. | |
| 97 | 137 |
| 98 # 2. Coverage per position | 138 # 2. Coverage per position |
| 99 &SubRegionCoverage($opts{'b'}, $opts{'t'}); | 139 &SubRegionCoverage($opts{'b'}, $opts{'t'}); |
| 100 our %filehash; | 140 our %filehash; |
| 141 our $tcov; | |
| 101 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { | 142 if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { |
| 102 system("mkdir $wd/SplitFiles"); | 143 system("mkdir -p $wd/SplitFiles"); |
| 144 system("rm $wd/SplitFiles/*"); | |
| 103 ## get position coverages | 145 ## get position coverages |
| 104 ## split input files | 146 ## split input files |
| 105 open IN, "$wd/Targets.Position.Coverage"; | 147 open IN, "$wd/Targets.Position.Coverage"; |
| 148 open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt"; | |
| 106 my $fileidx = 0; | 149 my $fileidx = 0; |
| 107 my $currreg = ''; | 150 my $currreg = ''; |
| 151 my $elength = 0; | |
| 152 my $esum = 0; | |
| 153 my $eline = ""; | |
| 154 my %out = (); | |
| 108 while (<IN>) { | 155 while (<IN>) { |
| 109 my $line = $_; | 156 my $line = $_; |
| 110 chomp($line); | 157 chomp($line); |
| 111 my @p = split(/\t/,$line); | 158 my @p = split(/\t/,$line); |
| 112 my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3]; | 159 my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]"; |
| 113 my $ex = $p[3]; | 160 my $ex = $p[3]; |
| 161 my $epos = $p[1]; | |
| 162 # average exon coverage calculation. | |
| 163 if (!defined($out{$ex})) { | |
| 164 $out{$ex} = (); | |
| 165 } | |
| 166 if (!defined($out{$ex}{$p[0]})) { | |
| 167 $out{$ex}{$p[0]} = (); | |
| 168 } | |
| 169 # needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations. | |
| 170 if (!defined($out{$ex}{$p[0]}{$epos})) { | |
| 171 $out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; | |
| 172 $out{$ex}{$p[0]}{$epos}{'c'} = (); | |
| 173 | |
| 174 } | |
| 175 push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]); | |
| 176 # splitting files | |
| 114 if ($reg ne $currreg) { | 177 if ($reg ne $currreg) { |
| 115 ## new exon open new outfile | 178 ## new exon open new outfile |
| 116 if ($currreg ne '') { | 179 if ($currreg ne '') { |
| 180 print BCOVSUM "$eline\t".($esum/$elength)."\n"; | |
| 117 ## filehandle is open. close it | 181 ## filehandle is open. close it |
| 118 close OUT; | 182 close OUT; |
| 119 } | 183 } |
| 184 $eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]"; | |
| 185 $esum = 0; | |
| 186 $elength = 0; | |
| 120 if (!exists($filehash{$reg})) { | 187 if (!exists($filehash{$reg})) { |
| 121 $fileidx++; | 188 $fileidx++; |
| 122 $filehash{$reg}{'idx'} = $fileidx; | 189 $filehash{$reg}{'idx'} = $fileidx; |
| 123 $filehash{$reg}{'exon'} = $ex; | 190 $filehash{$reg}{'exon'} = $reg; |
| 124 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; | 191 open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; |
| 125 $currreg = $reg; | 192 $currreg = $reg; |
| 126 } | 193 } |
| 127 else { | 194 else { |
| 128 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; | 195 open OUT, ">> $wd/SplitFiles/File_".$filehash{$reg}{'idx'}.".txt"; |
| 129 $currreg = $reg; | 196 $currreg = $reg; |
| 130 } | 197 } |
| 131 } | 198 } |
| 132 ## print the line to the open filehandle. | 199 ## print the line to the open filehandle. |
| 133 print OUT "$line\n"; | 200 print OUT "$line\n"; |
| 201 $esum += $p[-1]; | |
| 202 $elength++; | |
| 134 } | 203 } |
| 135 close OUT; | 204 close OUT; |
| 136 close IN; | 205 close IN; |
| 137 | 206 if ($esum > 0) { |
| 138 } | 207 print BCOVSUM "$eline\t".($esum/$elength)."\n"; |
| 139 | 208 } |
| 209 close BCOVSUM; | |
| 210 open OUT, ">$wd/avg.tcov.txt"; | |
| 211 | |
| 212 foreach my $tr_ex (sort {$a cmp $b} keys(%out)) { | |
| 213 foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) { | |
| 214 foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) { | |
| 215 my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}}); | |
| 216 my $frac = 0; | |
| 217 if ($nr > 0) { | |
| 218 $frac = ($nrcov / $nr); | |
| 219 } | |
| 220 print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n"; | |
| 221 } | |
| 222 } | |
| 223 } | |
| 224 close OUT; | |
| 225 $tcov = "$wd/avg.tcov.txt"; | |
| 226 | |
| 227 } | |
| 140 ## sort output files according to targets file | 228 ## sort output files according to targets file |
| 141 if (exists($opts{'r'}) ) { | 229 my %hash = (); |
| 142 my %hash = (); | 230 open IN, $tcov; |
| 143 open IN, "$wd/Targets.Global.Coverage"; | 231 while (<IN>) { |
| 144 while (<IN>) { | 232 my @p = split(/\t/,$_) ; |
| 145 my @p = split(/\t/,$_) ; | 233 $hash{$p[3].':'.$p[1]} = $_; |
| 146 $hash{$p[3]} = $_; | 234 } |
| 147 } | 235 close IN; |
| 148 close IN; | 236 open OUT, ">$tcov"; |
| 149 open OUT, ">$wd/Targets.Global.Coverage"; | 237 open IN, $opts{'t'}; |
| 150 open IN, $opts{'t'}; | 238 while (<IN>) { |
| 151 while (<IN>) { | 239 my @p = split(/\t/,$_) ; |
| 152 my @p = split(/\t/,$_) ; | 240 print OUT $hash{$p[3].':'.$p[1]}; |
| 153 print OUT $hash{$p[3]}; | 241 } |
| 154 } | 242 close IN; |
| 155 close IN; | 243 close OUT; |
| 156 close OUT; | |
| 157 } | |
| 158 | 244 |
| 159 | 245 |
| 160 #################################### | 246 #################################### |
| 161 ## PROCESS RESULTS & CREATE PLOTS ## | 247 ## PROCESS RESULTS & CREATE PLOTS ## |
| 162 #################################### | 248 #################################### |
| 209 # Get number of reads mapped in total | 295 # Get number of reads mapped in total |
| 210 ## updated on 2012-10-1 !! | 296 ## updated on 2012-10-1 !! |
| 211 $totalmapped = $s[2]; | 297 $totalmapped = $s[2]; |
| 212 $totalmapped =~ s/^(\d+)(\s.+)/$1/; | 298 $totalmapped =~ s/^(\d+)(\s.+)/$1/; |
| 213 # count columns | 299 # count columns |
| 214 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | 300 my $head = `head -n 1 $tcov`; |
| 215 chomp($head); | 301 chomp($head); |
| 216 my @cols = split(/\t/,$head); | 302 my @cols = split(/\t/,$head); |
| 217 my $nrcols = scalar(@cols); | 303 my $nrcols = scalar(@cols); |
| 218 my $covcol = $nrcols - 3; | 304 my $covcol = $nrcols - 3; |
| 219 # get min/max/median/average coverage => values | 305 # get min/max/median/average coverage => values |
| 220 my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`; | 306 my $covs = `cut -f $covcol $tcov`; |
| 221 my @coverages = split(/\n/,$covs); | 307 my @coverages = split(/\n/,$covs); |
| 222 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); | 308 my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); |
| 223 my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); | 309 my $spec = ''; |
| 310 if ($totalmapped != 0 && $totalmapped ne '') { | |
| 311 $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); | |
| 312 } | |
| 224 # get min/max/median/average coverage => boxplot in R | 313 # get min/max/median/average coverage => boxplot in R |
| 225 open OUT, ">$wd/Rout/boxplot.R"; | 314 open OUT, ">$wd/Rout/boxplot.R"; |
| 226 print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 315 print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
| 227 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; | 316 print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; |
| 228 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; | 317 print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; |
| 229 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; | 318 print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; |
| 230 print OUT 'graphics.off()'."\n"; | 319 print OUT 'graphics.off()'."\n"; |
| 231 close OUT; | 320 close OUT; |
| 321 print "Running boxplot.R : \n"; | |
| 232 system("cd $wd/Rout && Rscript boxplot.R"); | 322 system("cd $wd/Rout && Rscript boxplot.R"); |
| 233 | 323 |
| 234 ## global nt coverage plot | 324 ## global nt coverage plot |
| 235 ## use perl to make histogram (lower memory) | 325 ## use perl to make histogram (lower memory) |
| 236 open IN, "$wd/Targets.Position.Coverage"; | 326 open IN, "$wd/Targets.Position.Coverage"; |
| 321 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; | 411 print OUT 'text(1,82,pos=2,col="red",labels=paste("%Bases: ",round(frac.y,2),"%",sep=""))'."\n"; |
| 322 | 412 |
| 323 print OUT 'graphics.off()'."\n"; | 413 print OUT 'graphics.off()'."\n"; |
| 324 | 414 |
| 325 close OUT; | 415 close OUT; |
| 416 print "Running ntplot.r\n"; | |
| 326 system("cd $wd/Rout && Rscript ntplot.R"); | 417 system("cd $wd/Rout && Rscript ntplot.R"); |
| 327 ## PRINT TO .TEX FILE | 418 ## PRINT TO .TEX FILE |
| 328 open OUT, ">>$wd/Report/Report.tex"; | 419 open OUT, ">>$wd/Report/Report.tex"; |
| 329 # average coverage overviews | 420 # average coverage overviews |
| 330 print OUT '\subsection*{Overall Summary}'."\n"; | 421 print OUT '\subsection*{Overall Summary}'."\n"; |
| 357 $two =~ s/(\s\+.*)|(:.*)/\)/; | 448 $two =~ s/(\s\+.*)|(:.*)/\)/; |
| 358 $two =~ s/%/\\%/g; | 449 $two =~ s/%/\\%/g; |
| 359 $two =~ s/>=/\$\\ge\$/g; | 450 $two =~ s/>=/\$\\ge\$/g; |
| 360 $two = ucfirst($two); | 451 $two = ucfirst($two); |
| 361 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; | 452 print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; |
| 453 | |
| 454 | |
| 362 } | 455 } |
| 363 print OUT '\end{tabular}\end{minipage}'."\n"; | 456 print OUT '\end{tabular}\end{minipage}'."\n"; |
| 364 print OUT '\hspace{1.5cm}'."\n"; | 457 print OUT '\hspace{1.5cm}'."\n"; |
| 365 # target coverage statistics | 458 # target coverage statistics |
| 366 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; | 459 print OUT '\begin{minipage}{0.4\linewidth}'."\n"; |
| 386 # 2. GLOBAL COVERAGE OVERVIEW PER GENE | 479 # 2. GLOBAL COVERAGE OVERVIEW PER GENE |
| 387 @failedexons; | 480 @failedexons; |
| 388 @allexons; | 481 @allexons; |
| 389 @allregions; | 482 @allregions; |
| 390 @failedregions; | 483 @failedregions; |
| 391 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) { | 484 %failednames; |
| 485 %allnames; | |
| 486 | |
| 487 if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) { | |
| 392 # count columns | 488 # count columns |
| 393 my $head = `head -n 1 $wd/Targets.Global.Coverage`; | 489 my $head = `head -n 1 '$tcov'`; |
| 394 chomp($head); | 490 chomp($head); |
| 395 my @cols = split(/\t/,$head); | 491 my @cols = split(/\t/,$head); |
| 396 my $nrcols = scalar(@cols); | 492 my $nrcols = scalar(@cols); |
| 397 my $covcol = $nrcols - 3; | 493 my $covcol = $nrcols - 3; |
| 398 # Coverage Plots for each gene => barplots in R, table here. | 494 # Coverage Plots for each gene => barplots in R, table here. |
| 399 open IN, "$wd/Targets.Global.Coverage"; | 495 open IN, "$tcov"; |
| 400 my $currgroup = ''; | 496 my $currgroup = ''; |
| 401 my $startline = 0; | 497 my $startline = 0; |
| 402 my $stopline = 0; | 498 my $stopline = 0; |
| 403 $linecounter = 0; | 499 $linecounter = 0; |
| 404 while (<IN>) { | 500 while (<IN>) { |
| 405 $linecounter++; | 501 $linecounter++; |
| 406 chomp($_); | 502 chomp($_); |
| 407 my @c = split(/\t/,$_); | 503 my @c = split(/\t/,$_); |
| 408 push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]); | 504 my $reg = $c[0].'-'.$c[1].'-'.$c[2]; |
| 409 my $group = $c[3]; | 505 push(@allregions,$reg); |
| 506 my $group = $reg .": ".$c[3]; | |
| 507 #my $gene = $c[3]; | |
| 410 ## coverage failure? | 508 ## coverage failure? |
| 411 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { | 509 if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { |
| 412 push(@failedexons,$group); | 510 push(@failedexons,$group); |
| 413 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); | 511 push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); |
| 512 $failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; | |
| 414 } | 513 } |
| 415 ## store exon | 514 ## store exon |
| 416 push(@allexons,$group); | 515 push(@allexons,$group); |
| 516 $allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; | |
| 517 if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) { | |
| 518 ## no need for barplots | |
| 519 next; | |
| 520 } | |
| 417 ## extract and check gene | 521 ## extract and check gene |
| 418 $group =~ s/^(\S+)[\|\s](.+)/$1/; | 522 my $gene = $group; |
| 419 if ($group ne $currgroup ) { | 523 $gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/; |
| 524 if ($gene ne $currgroup ) { | |
| 420 if ($currgroup ne '') { | 525 if ($currgroup ne '') { |
| 421 # new gene, make plot. | 526 # new gene, make plot. |
| 422 open OUT, ">$wd/Rout/barplot.R"; | 527 open OUT, ">$wd/Rout/barplot.R"; |
| 423 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 528 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
| 424 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | 529 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; |
| 425 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | 530 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; |
| 426 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | 531 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; |
| 427 print OUT 'coverage[coverage < 1] <- 1'."\n"; | 532 print OUT 'coverage[coverage < 1] <- 1'."\n"; |
| 428 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | 533 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; |
| 455 else { | 560 else { |
| 456 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); | 561 push(@large,'\includegraphics[width=\textwidth,keepaspectratio=true]{../Plots/Coverage_'.$currgroup.'.png}'); |
| 457 } | 562 } |
| 458 | 563 |
| 459 } | 564 } |
| 460 $currgroup = $group; | 565 $currgroup = $gene; |
| 461 $startline = $linecounter; | 566 $startline = $linecounter; |
| 462 } | 567 } |
| 463 $stopline = $linecounter; | 568 $stopline = $linecounter; |
| 464 } | 569 } |
| 465 close IN; | 570 close IN; |
| 466 if ($currgroup ne '') { | 571 if ($currgroup ne '') { |
| 467 # last gene, make plot. | 572 # last gene, make plot. |
| 468 open OUT, ">$wd/Rout/barplot.R"; | 573 open OUT, ">$wd/Rout/barplot.R"; |
| 469 print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; | 574 print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; |
| 470 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; | 575 print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; |
| 471 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; | 576 print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; |
| 472 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; | 577 print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; |
| 473 print OUT 'coverage[coverage < 1] <- 1'."\n"; | 578 print OUT 'coverage[coverage < 1] <- 1'."\n"; |
| 474 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; | 579 print OUT 'colors <- c(rep("grey",length(coverage)))'."\n"; |
| 500 } | 605 } |
| 501 ## print to TEX | 606 ## print to TEX |
| 502 open OUT, ">>$wd/Report/Report.tex"; | 607 open OUT, ">>$wd/Report/Report.tex"; |
| 503 print OUT '\subsection*{Gene Summaries}'."\n"; | 608 print OUT '\subsection*{Gene Summaries}'."\n"; |
| 504 print OUT '\underline{Legend:} \\\\'."\n"; | 609 print OUT '\underline{Legend:} \\\\'."\n"; |
| 505 print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; | 610 print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; |
| 506 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n"; | 611 print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n"; |
| 507 $col = 1; | 612 $col = 1; |
| 508 foreach (@small) { | 613 foreach (@small) { |
| 509 if ($col > 2) { | 614 if ($col > 2) { |
| 510 $col = 1; | 615 $col = 1; |
| 511 print OUT "\n"; | 616 print OUT "\n"; |
| 538 # tex section header | 643 # tex section header |
| 539 open TEX, ">>$wd/Report/Report.tex"; | 644 open TEX, ">>$wd/Report/Report.tex"; |
| 540 print TEX '\subsection*{Failed Exon Plots}'."\n"; | 645 print TEX '\subsection*{Failed Exon Plots}'."\n"; |
| 541 $col = 1; | 646 $col = 1; |
| 542 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; | 647 print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; |
| 543 foreach(@failedregions) { | 648 foreach(sort(keys(%failednames)) ) { |
| 649 #foreach(@failedregions) { | |
| 544 if ($col > 2) { | 650 if ($col > 2) { |
| 545 $col = 1; | 651 $col = 1; |
| 546 print TEX "\n"; | 652 print TEX "\n"; |
| 547 } | 653 } |
| 548 # which exon | 654 # which exon |
| 549 my $region = $_; | 655 my $group = $_; |
| 550 my $exon = $filehash{$region}{'exon'}; | 656 my ($region,$name) = split(/: /,$group); |
| 657 #my $region = $failednames{$_}; | |
| 658 my $exon = $filehash{$group}{'exon'}; | |
| 551 # link exon to tmp file | 659 # link exon to tmp file |
| 552 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | 660 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; |
| 553 ## determine transcript orientation and location | 661 ## determine transcript orientation and location |
| 554 my $firstline = `head -n 1 $exonfile`; | 662 my $firstline = `head -n 1 $exonfile`; |
| 555 my @firstcols = split(/\t/,$firstline); | 663 my @firstcols = split(/\t/,$firstline); |
| 556 my $orient = $firstcols[5]; | 664 my $orient = $firstcols[5]; |
| 557 my $genomicchr = $firstcols[0]; | 665 my $genomicchr = $firstcols[0]; |
| 574 | 682 |
| 575 my $width = 480 ; | 683 my $width = 480 ; |
| 576 my $height = 240 ; | 684 my $height = 240 ; |
| 577 my $exonstr = $exon; | 685 my $exonstr = $exon; |
| 578 $exonstr =~ s/\s/_/g; | 686 $exonstr =~ s/\s/_/g; |
| 687 $exonstr =~ s/:/_/g; | |
| 579 $exon =~ s/_/ /g; | 688 $exon =~ s/_/ /g; |
| 580 $exon =~ s/\|/ /g; | 689 $exon =~ s/\|/ /g; |
| 690 $exon =~ s/chr.*: (.*)$/$1/; | |
| 581 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; | 691 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; |
| 582 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | 692 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; |
| 583 if ($orient eq '-') { | 693 if ($orient eq '-') { |
| 584 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | 694 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; |
| 585 print OUT 'mtext("'.$subtitle.'")'."\n"; | 695 print OUT 'mtext("'.$subtitle.'")'."\n"; |
| 622 } | 732 } |
| 623 elsif (exists($opts{'A'})) { | 733 elsif (exists($opts{'A'})) { |
| 624 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; | 734 print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; |
| 625 } | 735 } |
| 626 $col = 1; | 736 $col = 1; |
| 627 foreach(@allregions) { | 737 foreach(sort(keys(%allnames))) { |
| 738 | |
| 739 #foreach(@allregions) { | |
| 628 if ($col > 2) { | 740 if ($col > 2) { |
| 629 $col = 1; | 741 $col = 1; |
| 630 print TEX "\n"; | 742 print TEX "\n"; |
| 631 } | 743 } |
| 744 my $group = $_; | |
| 745 my ($region,$name) = split(/: /,$group); | |
| 632 # which exon | 746 # which exon |
| 633 my $region = $_; | 747 #my $region = $_; |
| 634 my $exon = $filehash{$region}{'exon'}; | 748 #my $region = $allnames{$_}; |
| 749 my $exon = $filehash{$group}{'exon'}; | |
| 635 # grep exon to tmp file | 750 # grep exon to tmp file |
| 636 my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; | 751 my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; |
| 637 ## determine transcript orientation. | 752 ## determine transcript orientation. |
| 638 my $firstline = `head -n 1 $exonfile`; | 753 my $firstline = `head -n 1 $exonfile`; |
| 639 my @firstcols = split(/\t/,$firstline); | 754 my @firstcols = split(/\t/,$firstline); |
| 640 my $orient = $firstcols[5]; | 755 my $orient = $firstcols[5]; |
| 641 my $genomicchr = $firstcols[0]; | 756 my $genomicchr = $firstcols[0]; |
| 671 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; | 786 print OUT 'positions <- coveragetable[,'.$poscol.']'."\n"; |
| 672 my $width = 480 ; | 787 my $width = 480 ; |
| 673 my $height = 240 ; | 788 my $height = 240 ; |
| 674 my $exonstr = $exon; | 789 my $exonstr = $exon; |
| 675 $exonstr =~ s/\s/_/g; | 790 $exonstr =~ s/\s/_/g; |
| 791 $exonstr =~ s/:/_/g; | |
| 676 $exon =~ s/_/ /g; | 792 $exon =~ s/_/ /g; |
| 677 $exon =~ s/\|/ /g; | 793 $exon =~ s/\|/ /g; |
| 794 $exon =~ s/^chr.*: (.*)$/$1/; | |
| 678 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; | 795 print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; |
| 679 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; | 796 print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; |
| 680 if ($orient eq '-') { | 797 if ($orient eq '-') { |
| 681 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; | 798 print OUT 'plot(positions,log10(coverage),type="n",main="Coverage for '.$exon.'",ylab="log10(Coverage)",ylim=ylim,xlab="Position",xlim=rev(range(positions)),sub="(Transcribed from minus strand)")'."\n"; |
| 682 print OUT 'mtext("'.$subtitle.'")'."\n"; | 799 print OUT 'mtext("'.$subtitle.'")'."\n"; |
| 737 my $genomicchr = $firstcols[0]; | 854 my $genomicchr = $firstcols[0]; |
| 738 my $genomicstart = $firstcols[1]; | 855 my $genomicstart = $firstcols[1]; |
| 739 my $genomicstop = $firstcols[2]; | 856 my $genomicstop = $firstcols[2]; |
| 740 | 857 |
| 741 if ($orient eq '+') { | 858 if ($orient eq '+') { |
| 742 $bps = $genomicstop - $genomicstart + 1; | 859 #$bps = $genomicstop - $genomicstart + 1; |
| 743 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); | 860 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); |
| 744 | 861 |
| 745 } | 862 } |
| 746 else { | 863 else { |
| 747 $bps = $genomicstop - $genomicstart + 1; | 864 #$bps = $genomicstop - $genomicstart + 1; |
| 748 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); | 865 $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); |
| 749 } | 866 } |
| 750 | 867 |
| 751 # check if failed | 868 # check if failed |
| 752 my $cs = `cut -f $covcol '$exonfile' `; | 869 my $cs = `cut -f $covcol '$exonfile' `; |
| 794 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); | 911 system("cp -Rf $wd/Targets.Position.Coverage $wd/Results/"); |
| 795 } | 912 } |
| 796 | 913 |
| 797 system("cd $wd && tar czf '$tarfile' Results/"); | 914 system("cd $wd && tar czf '$tarfile' Results/"); |
| 798 ## clean up (galaxy stores outside wd) | 915 ## clean up (galaxy stores outside wd) |
| 799 system("rm -Rf $wd"); | 916 #system("rm -Rf $wd"); |
| 800 ############### | 917 ############### |
| 801 ## FUNCTIONS ## | 918 ## FUNCTIONS ## |
| 802 ############### | 919 ############### |
| 803 sub arraystats{ | 920 sub arraystats{ |
| 804 my @array = @_; | 921 my @array = @_; |
| 805 my $count = scalar(@array); | 922 my $count = scalar(@array); |
| 923 if ($count == 0 ) { | |
| 924 return (0,0,0,0,0,0,0); | |
| 925 } | |
| 806 @array = sort { $a <=> $b } @array; | 926 @array = sort { $a <=> $b } @array; |
| 807 # median | 927 # median |
| 808 my $median = 0; | 928 my $median = 0; |
| 809 if ($count % 2) { | 929 if ($count % 2) { |
| 810 $median = $array[int($count/2)]; | 930 $median = $array[int($count/2)]; |
| 822 my $min = $array[0]; | 942 my $min = $array[0]; |
| 823 my $max = $array[($count-1)]; | 943 my $max = $array[($count-1)]; |
| 824 return ($average,$median,$min,$max,$first,$third,$sum); | 944 return ($average,$median,$min,$max,$first,$third,$sum); |
| 825 } | 945 } |
| 826 | 946 |
| 827 sub GlobalSummary { | 947 sub GetStats { |
| 828 my ($bam,$targets) = @_; | 948 my ($aref) = @_; |
| 829 | 949 if (scalar(@$aref) == 0) { |
| 830 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | 950 return qw/0 0/; |
| 831 if (exists($commandsrun{$command})) { | 951 } |
| 832 return; | 952 # median |
| 833 } | 953 my @s = sort {$a <=> $b } @$aref; |
| 834 system($command); | 954 my $nrzero = 0; |
| 835 $commandsrun{$command} = 1; | 955 my $len = scalar(@s); |
| 836 } | 956 for (my $i = 0; $i< $len;$i++) { |
| 837 | 957 if ($s[$i] == 0) { |
| 838 sub CoveragePerRegion { | 958 $nrzero++; |
| 839 my ($bam,$targets) = @_; | 959 } |
| 840 my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; | 960 else { |
| 841 if (exists($commandsrun{$command})) { | 961 last; |
| 842 return; | 962 } |
| 843 } | 963 } |
| 844 system($command); | 964 my $nrcov = $len - $nrzero; |
| 845 $commandsrun{$command} = 1; | 965 # avg |
| 846 } | 966 my $avg = 0; |
| 967 foreach (@s) { $avg += $_ }; | |
| 968 $avg = sprintf("%.1f",($avg / scalar(@s))); | |
| 969 return($avg,$len,$nrcov); | |
| 970 } | |
| 971 | |
| 847 | 972 |
| 848 sub SubRegionCoverage { | 973 sub SubRegionCoverage { |
| 849 my ($bam,$targets) = @_; | 974 my ($bam,$targets) = @_; |
| 850 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; | 975 my $command = "cd $wd && coverageBed -abam $bam -b $targets -d > $wd/Targets.Position.Coverage"; |
| 851 system($command); | 976 system($command); |
