Mercurial > repos > geert-vandeweyer > coverage_report
changeset 26:859999cb135b draft
revised routines. Better handling of collapsing, isoforms and mutiple mapppings. Barplots revised to visualise average base coverage in exon instead of total number of reads in exon
author | geert-vandeweyer |
---|---|
date | Wed, 29 Nov 2017 08:12:27 -0500 |
parents | 6cb012c8497a |
children | 576bfc1586f9 |
files | CoverageReport.pl |
diffstat | 1 files changed, 195 insertions(+), 70 deletions(-) [+] |
line wrap: on
line diff
--- a/CoverageReport.pl Thu Feb 12 09:54:03 2015 -0500 +++ b/CoverageReport.pl Wed Nov 29 08:12:27 2017 -0500 @@ -26,18 +26,20 @@ # m : (m)inimal Coverage threshold # f : fraction of average as threshold # n : sample (n)ame. - +# T : collapse overlapping Target regions. getopts('b:t:o:z:rsSALm:n:f:T', \%opts) ; # make output directory in (tmp) working dir our $wd = "/tmp/Coverage.".int(rand(1000)); + while (-d $wd) { $wd = "/tmp/Coverage.".int(rand(1000)); } system("mkdir $wd"); - +$wd = "/tmp/Coverage.993"; +print "wd : $wd\n"; ## variables our %commandsrun = (); @@ -89,38 +91,103 @@ my $tmptargets = "$wd/collapsedtargets.bed"; system("sort -k1,1 -k2,2n $targets > $wd/sorted.targets.bed"); system("bedtools merge -s -scores max -nms -i $wd/sorted.targets.bed > $tmptargets"); - $opts{'t'} = $tmptargets; + open IN, $tmptargets; + open OUT, ">$wd/collapsed.targets.renamed.bed"; + # we assume that overlapping fragments come from isoforms, not from different genes. + my %counters = (); + my @genes = (); + while (<IN>) { + chomp; + my @p = split(/\t/,$_); + my @g = split(/,/,$p[3]); + $g[0] =~ m/(\S+)\(.*/; + my $gene = $1; + if (!defined($counters{$gene})) { + push(@genes,$gene); + $counters{$gene}{'lines'} = (); + $counters{$gene}{'orient'} = $p[5]; + } + $p[3] = $gene."(COLLAPSED)"; + push(@{$counters{$gene}{'lines'}},\@p); + } + close IN; + foreach my $gene (@genes) { + if ($counters{$gene}{'orient'} eq '-') { + my $idx = scalar(@{$counters{$gene}{'lines'}}) + 1; + foreach my $line (@{$counters{$gene}{'lines'}}) { + $idx--; + $line->[3] .= "|Region_$idx"; + print OUT join("\t",@$line)."\n"; + } + } + else { + my $idx = 0; + foreach my $line (@{$counters{$gene}{'lines'}}) { + $idx++; + $line->[3] .= "|Region_$idx"; + print OUT join("\t",@$line)."\n"; + } + } + } + close OUT; + $opts{'t'} = "$wd/collapsed.targets.renamed.bed"; } - -# 1. Global Summary => default -&GlobalSummary($opts{'b'}, $opts{'t'}); +# 1. Coverage per exon +# included in 2. # 2. Coverage per position &SubRegionCoverage($opts{'b'}, $opts{'t'}); our %filehash; +our $tcov; if (exists($opts{'s'}) || exists($opts{'S'}) || exists($opts{'A'}) || exists($opts{'L'})) { - system("mkdir $wd/SplitFiles"); + system("mkdir -p $wd/SplitFiles"); + system("rm $wd/SplitFiles/*"); ## get position coverages ## split input files open IN, "$wd/Targets.Position.Coverage"; + open BCOVSUM, ">$wd/Results/".$opts{'n'}.".Average_Region_Coverage.txt"; my $fileidx = 0; my $currreg = ''; + my $elength = 0; + my $esum = 0; + my $eline = ""; + my %out = (); while (<IN>) { my $line = $_; chomp($line); my @p = split(/\t/,$line); - my $reg = $p[0].'-'.$p[1].'-'.$p[2]; #.$p[3]; + my $reg = $p[0].'-'.$p[1].'-'.$p[2]. ": $p[3]"; my $ex = $p[3]; + my $epos = $p[1]; + # average exon coverage calculation. + if (!defined($out{$ex})) { + $out{$ex} = (); + } + if (!defined($out{$ex}{$p[0]})) { + $out{$ex}{$p[0]} = (); + } + # needs to be transcript specific ($ex) and position specific ($epos) to handle both isoforms and PAR/multiple mapping situations. + if (!defined($out{$ex}{$p[0]}{$epos})) { + $out{$ex}{$p[0]}{$epos}{'r'} = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; + $out{$ex}{$p[0]}{$epos}{'c'} = (); + + } + push(@{$out{$ex}{$p[0]}{$epos}{'c'}},$p[-1]); + # splitting files if ($reg ne $currreg) { ## new exon open new outfile if ($currreg ne '') { + print BCOVSUM "$eline\t".($esum/$elength)."\n"; ## filehandle is open. close it close OUT; } + $eline = "$p[0]\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$p[5]"; #\t$p[6]\t$p[7]\t$p[8]"; + $esum = 0; + $elength = 0; if (!exists($filehash{$reg})) { $fileidx++; $filehash{$reg}{'idx'} = $fileidx; - $filehash{$reg}{'exon'} = $ex; + $filehash{$reg}{'exon'} = $reg; open OUT, ">> $wd/SplitFiles/File_$fileidx.txt"; $currreg = $reg; } @@ -131,30 +198,49 @@ } ## print the line to the open filehandle. print OUT "$line\n"; + $esum += $p[-1]; + $elength++; } close OUT; close IN; + if ($esum > 0) { + print BCOVSUM "$eline\t".($esum/$elength)."\n"; + } + close BCOVSUM; + open OUT, ">$wd/avg.tcov.txt"; + + foreach my $tr_ex (sort {$a cmp $b} keys(%out)) { + foreach my $chr (sort {$a cmp $b} keys(%{$out{$tr_ex}})) { + foreach(sort {$a <=> $b} keys(%{$out{$tr_ex}{$chr}})) { + my ($avg,$nr,$nrcov) = GetStats(\@{$out{$tr_ex}{$chr}{$_}{'c'}}); + my $frac = 0; + if ($nr > 0) { + $frac = ($nrcov / $nr); + } + print OUT $out{$tr_ex}{$chr}{$_}{'r'}."\t".$avg."\t".$nrcov."\t".$nr."\t".$frac."\n"; + } + } + } + close OUT; + $tcov = "$wd/avg.tcov.txt"; } - ## sort output files according to targets file -if (exists($opts{'r'}) ) { - my %hash = (); - open IN, "$wd/Targets.Global.Coverage"; - while (<IN>) { - my @p = split(/\t/,$_) ; - $hash{$p[3]} = $_; - } - close IN; - open OUT, ">$wd/Targets.Global.Coverage"; - open IN, $opts{'t'}; - while (<IN>) { - my @p = split(/\t/,$_) ; - print OUT $hash{$p[3]}; - } - close IN; - close OUT; +my %hash = (); +open IN, $tcov; +while (<IN>) { + my @p = split(/\t/,$_) ; + $hash{$p[3].':'.$p[1]} = $_; } +close IN; +open OUT, ">$tcov"; +open IN, $opts{'t'}; +while (<IN>) { + my @p = split(/\t/,$_) ; + print OUT $hash{$p[3].':'.$p[1]}; +} +close IN; +close OUT; #################################### @@ -211,24 +297,28 @@ $totalmapped = $s[2]; $totalmapped =~ s/^(\d+)(\s.+)/$1/; # count columns -my $head = `head -n 1 $wd/Targets.Global.Coverage`; +my $head = `head -n 1 $tcov`; chomp($head); my @cols = split(/\t/,$head); my $nrcols = scalar(@cols); my $covcol = $nrcols - 3; # get min/max/median/average coverage => values -my $covs = `cut -f $covcol $wd/Targets.Global.Coverage`; +my $covs = `cut -f $covcol $tcov`; my @coverages = split(/\n/,$covs); my ($eavg,$med,$min,$max,$first,$third,$ontarget) = arraystats(@coverages); -my $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); +my $spec = ''; +if ($totalmapped != 0 && $totalmapped ne '') { + $spec = sprintf("%.1f",($ontarget / $totalmapped)*100); +} # get min/max/median/average coverage => boxplot in R open OUT, ">$wd/Rout/boxplot.R"; -print OUT 'coverage <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; +print OUT 'coverage <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; print OUT 'coverage <- coverage[,'.$covcol.']'."\n"; print OUT 'png(file="../Plots/CoverageBoxPlot.png", bg="white", width=240, height=480,type=c("cairo"))'."\n"; print OUT 'boxplot(coverage,range=1.5,main="Target Region Coverage")'."\n"; print OUT 'graphics.off()'."\n"; close OUT; +print "Running boxplot.R : \n"; system("cd $wd/Rout && Rscript boxplot.R"); ## global nt coverage plot @@ -323,6 +413,7 @@ print OUT 'graphics.off()'."\n"; close OUT; +print "Running ntplot.r\n"; system("cd $wd/Rout && Rscript ntplot.R"); ## PRINT TO .TEX FILE open OUT, ">>$wd/Report/Report.tex"; @@ -359,6 +450,8 @@ $two =~ s/>=/\$\\ge\$/g; $two = ucfirst($two); print OUT '\textbf{'.$two.'} & '.$one.' \\\\'."\n"; + + } print OUT '\end{tabular}\end{minipage}'."\n"; print OUT '\hspace{1.5cm}'."\n"; @@ -388,15 +481,18 @@ @allexons; @allregions; @failedregions; -if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})) { +%failednames; +%allnames; + +if (exists($opts{'r'}) || exists($opts{'s'}) || exists($opts{'S'})|| exists($opts{'A'})) { # count columns - my $head = `head -n 1 $wd/Targets.Global.Coverage`; + my $head = `head -n 1 '$tcov'`; chomp($head); my @cols = split(/\t/,$head); my $nrcols = scalar(@cols); my $covcol = $nrcols - 3; # Coverage Plots for each gene => barplots in R, table here. - open IN, "$wd/Targets.Global.Coverage"; + open IN, "$tcov"; my $currgroup = ''; my $startline = 0; my $stopline = 0; @@ -405,22 +501,31 @@ $linecounter++; chomp($_); my @c = split(/\t/,$_); - push(@allregions,$c[0].'-'.$c[1].'-'.$c[2]); - my $group = $c[3]; + my $reg = $c[0].'-'.$c[1].'-'.$c[2]; + push(@allregions,$reg); + my $group = $reg .": ".$c[3]; + #my $gene = $c[3]; ## coverage failure? if ($c[$nrcol-1] < 1 || $c[$covcol-1] < $thresh) { push(@failedexons,$group); push(@failedregions,$c[0].'-'.$c[1].'-'.$c[2]); + $failednames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; } ## store exon push(@allexons,$group); + $allnames{$group} = $c[0].'-'.$c[1].'-'.$c[2]; + if (!exists($opts{'r'}) && !exists($opts{'s'}) && !exists($opts{'S'}) && exists($opts{'A'})) { + ## no need for barplots + next; + } ## extract and check gene - $group =~ s/^(\S+)[\|\s](.+)/$1/; - if ($group ne $currgroup ) { + my $gene = $group; + $gene =~ s/^chr\S+: (\S+)[\|\s](.+)/$1/; + if ($gene ne $currgroup ) { if ($currgroup ne '') { # new gene, make plot. open OUT, ">$wd/Rout/barplot.R"; - print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; + print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; @@ -457,7 +562,7 @@ } } - $currgroup = $group; + $currgroup = $gene; $startline = $linecounter; } $stopline = $linecounter; @@ -466,7 +571,7 @@ if ($currgroup ne '') { # last gene, make plot. open OUT, ">$wd/Rout/barplot.R"; - print OUT 'coveragetable <- read.table("../Targets.Global.Coverage",as.is=TRUE,sep="\t",header=FALSE)'."\n"; + print OUT 'coveragetable <- read.table("'.$tcov.'",as.is=TRUE,sep="\t",header=FALSE)'."\n"; print OUT 'coverage <- coveragetable[c('.$startline.':'.$stopline.'),'.$covcol.']'."\n"; print OUT 'entries <- coveragetable[c('.$startline.':'.$stopline.'),4]'."\n"; print OUT 'entries <- sub("\\\\S+\\\\|","",entries,perl=TRUE)'."\n"; @@ -502,8 +607,8 @@ open OUT, ">>$wd/Report/Report.tex"; print OUT '\subsection*{Gene Summaries}'."\n"; print OUT '\underline{Legend:} \\\\'."\n"; - print OUT '{\color{red}\textbf{RED:} Coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; - print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon. Overruled by red.} \\\\' ."\n"; + print OUT '{\color{red}\textbf{RED:} Average coverage did not reach set threshold of '.$thresh.'} \\\\'."\n"; + print OUT '{\color{orange}\textbf{ORANGE:} Coverage was incomplete for the exon (section with zero coverage found). Overruled by red.} \\\\' ."\n"; $col = 1; foreach (@small) { if ($col > 2) { @@ -540,16 +645,19 @@ print TEX '\subsection*{Failed Exon Plots}'."\n"; $col = 1; print TEX '\underline{NOTE:} Only exons with global coverage $<$'.$thresh.' or incomplete coverage were plotted \\\\'."\n"; - foreach(@failedregions) { + foreach(sort(keys(%failednames)) ) { + #foreach(@failedregions) { if ($col > 2) { $col = 1; print TEX "\n"; } # which exon - my $region = $_; - my $exon = $filehash{$region}{'exon'}; + my $group = $_; + my ($region,$name) = split(/: /,$group); + #my $region = $failednames{$_}; + my $exon = $filehash{$group}{'exon'}; # link exon to tmp file - my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; + my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; ## determine transcript orientation and location my $firstline = `head -n 1 $exonfile`; my @firstcols = split(/\t/,$firstline); @@ -576,8 +684,10 @@ my $height = 240 ; my $exonstr = $exon; $exonstr =~ s/\s/_/g; + $exonstr =~ s/:/_/g; $exon =~ s/_/ /g; $exon =~ s/\|/ /g; + $exon =~ s/chr.*: (.*)$/$1/; print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; if ($orient eq '-') { @@ -624,16 +734,21 @@ print TEX '\underline{NOTE:} ALL exons are plotted, regardless of coverage \\\\'."\n"; } $col = 1; - foreach(@allregions) { + foreach(sort(keys(%allnames))) { + + #foreach(@allregions) { if ($col > 2) { $col = 1; print TEX "\n"; } + my $group = $_; + my ($region,$name) = split(/: /,$group); # which exon - my $region = $_; - my $exon = $filehash{$region}{'exon'}; + #my $region = $_; + #my $region = $allnames{$_}; + my $exon = $filehash{$group}{'exon'}; # grep exon to tmp file - my $exonfile = "$wd/SplitFiles/File_".$filehash{$region}{'idx'}.".txt"; + my $exonfile = "$wd/SplitFiles/File_".$filehash{$group}{'idx'}.".txt"; ## determine transcript orientation. my $firstline = `head -n 1 $exonfile`; my @firstcols = split(/\t/,$firstline); @@ -673,8 +788,10 @@ my $height = 240 ; my $exonstr = $exon; $exonstr =~ s/\s/_/g; + $exonstr =~ s/:/_/g; $exon =~ s/_/ /g; $exon =~ s/\|/ /g; + $exon =~ s/^chr.*: (.*)$/$1/; print OUT 'png(file="../Plots/Coverage_'.$exonstr.'.png", bg="white", width='.$width.', height='.$height.',type=c("cairo"))'."\n"; print OUT 'ylim = c(0,log10(max(max(coverage),'.($thresh+10).')))'."\n"; if ($orient eq '-') { @@ -739,12 +856,12 @@ my $genomicstop = $firstcols[2]; if ($orient eq '+') { - $bps = $genomicstop - $genomicstart + 1; + #$bps = $genomicstop - $genomicstart + 1; $subtitle = "$genomicchr:".$de->format_number($genomicstart)."+".$de->format_number($genomicstop); } else { - $bps = $genomicstop - $genomicstart + 1; + #$bps = $genomicstop - $genomicstart + 1; $subtitle = "$genomicchr:".$de->format_number($genomicstart)."-".$de->format_number($genomicstop); } @@ -796,13 +913,16 @@ system("cd $wd && tar czf '$tarfile' Results/"); ## clean up (galaxy stores outside wd) -system("rm -Rf $wd"); +#system("rm -Rf $wd"); ############### ## FUNCTIONS ## ############### sub arraystats{ my @array = @_; my $count = scalar(@array); + if ($count == 0 ) { + return (0,0,0,0,0,0,0); + } @array = sort { $a <=> $b } @array; # median my $median = 0; @@ -824,26 +944,31 @@ return ($average,$median,$min,$max,$first,$third,$sum); } -sub GlobalSummary { - my ($bam,$targets) = @_; - - my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; - if (exists($commandsrun{$command})) { - return; +sub GetStats { + my ($aref) = @_; + if (scalar(@$aref) == 0) { + return qw/0 0/; } - system($command); - $commandsrun{$command} = 1; + # median + my @s = sort {$a <=> $b } @$aref; + my $nrzero = 0; + my $len = scalar(@s); + for (my $i = 0; $i< $len;$i++) { + if ($s[$i] == 0) { + $nrzero++; + } + else { + last; + } + } + my $nrcov = $len - $nrzero; + # avg + my $avg = 0; + foreach (@s) { $avg += $_ }; + $avg = sprintf("%.1f",($avg / scalar(@s))); + return($avg,$len,$nrcov); } -sub CoveragePerRegion { - my ($bam,$targets) = @_; - my $command = "cd $wd && coverageBed -abam $bam -b $targets > $wd/Targets.Global.Coverage"; - if (exists($commandsrun{$command})) { - return; - } - system($command); - $commandsrun{$command} = 1; -} sub SubRegionCoverage { my ($bam,$targets) = @_;