Mercurial > repos > lsong10 > psiclass
diff PsiCLASS-1.0.2/samtools-0.1.19/misc/plot-bamcheck @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/PsiCLASS-1.0.2/samtools-0.1.19/misc/plot-bamcheck Fri Mar 26 16:52:45 2021 +0000 @@ -0,0 +1,882 @@ +#!/usr/bin/env perl +# +# Author: petr.danecek@sanger +# + +use strict; +use warnings; +use Carp; + +my $opts = parse_params(); +parse_bamcheck($opts); +plot_qualities($opts); +plot_acgt_cycles($opts); +plot_gc($opts); +plot_gc_depth($opts); +plot_isize($opts); +plot_coverage($opts); +plot_mismatches_per_cycle($opts); +plot_indel_dist($opts); +plot_indel_cycles($opts); + +exit; + +#-------------------------------- + +sub error +{ + my (@msg) = @_; + if ( scalar @msg ) { confess @msg; } + die + "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", + " plot-bamcheck -p outdir/ file.bam.bc\n", + "Options:\n", + " -k, --keep-files Do not remove temporary files.\n", + " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n", + " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n", + " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n", + " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", + " -h, -?, --help This help message.\n", + "\n"; +} + + +sub parse_params +{ + $0 =~ s{^.+/}{}; + my $opts = { args=>join(' ',$0,@ARGV) }; + while (defined(my $arg=shift(@ARGV))) + { + if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } + if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } + if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } + if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } + if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } + if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } + error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); + } + if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } + if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } + if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } + if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } + elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } + return $opts; +} + + +# Creates GC stats for either the whole reference or only on target regions for exome QC +sub do_ref_stats +{ + my ($opts) = @_; + + + my %targets = (); + if ( exists($$opts{targets}) ) + { + my ($prev_chr,$prev_pos); + open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my ($chr,$from,$to) = split(/\s+/,$line); + chomp($to); + push @{$targets{$chr}}, $from,$to; + if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } + if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } + $prev_pos = $from; + } + close($fh); + } + + my $_len = 60; # for now do only standard fasta's with 60 bases per line + my %gc_counts = (); + my ($skip_chr,$pos,$ireg,$regions); + open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); + while (my $line=<$fh>) + { + if ( $line=~/^>/ ) + { + if ( !scalar %targets ) { next; } + + if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } + if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } + undef $skip_chr; + $pos = 0; + $ireg = 0; + $regions = $targets{$1}; + } + if ( defined $skip_chr ) { next; } + + # Only $_len sized lines are considered and no chopping for target regions. + chomp($line); + my $len = length($line); + if ( $len ne $_len ) { next; } + + if ( scalar %targets ) + { + while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } + $pos += $len; + if ( $ireg==@$regions ) { next; } + if ( $pos < $$regions[$ireg] ) { next; } + } + + my $gc_count = 0; + for (my $i=0; $i<$len; $i++) + { + my $base = substr($line,$i,1); + if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } + } + $gc_counts{$gc_count}++; + } + + print "# Generated by $$opts{args}\n"; + print "# The columns are: GC content bin, normalized frequency\n"; + my $max; + for my $count (values %gc_counts) + { + if ( !defined $max or $count>$max ) { $max=$count; } + } + for my $gc (sort {$a<=>$b} keys %gc_counts) + { + if ( $gc==0 ) { next; } + printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; + } +} + +sub plot +{ + my ($cmdfile) = @_; + my $cmd = "gnuplot $cmdfile"; + system($cmd); + if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } +} + + +sub parse_bamcheck +{ + my ($opts) = @_; + open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); + my $line = <$fh>; + if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } + $$opts{dat}{version} = $1; + while ($line=<$fh>) + { + if ( $line=~/^#/ ) { next; } + my @items = split(/\t/,$line); + chomp($items[-1]); + if ( $items[0] eq 'SN' ) + { + $$opts{dat}{$items[1]} = splice(@items,2); + next; + } + push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; + } + close($fh); + + # Check sanity + if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) + { + error("Sanity check failed: no sequences found by bamcheck??\n"); + } +} + +sub older_than +{ + my ($opts,$version) = @_; + my ($year,$month,$day) = split(/-/,$version); + $version = $$opts{dat}{version}; + if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } + if ( $1<$year ) { return 1; } + elsif ( $1>$year ) { return 0; } + if ( $2<$month ) { return 1; } + elsif ( $2>$month ) { return 0; } + if ( $3<$day ) { return 1; } + return 0; +} + +sub get_defaults +{ + my ($opts,$img_fname,%args) = @_; + + if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } + + # Determine the gnuplot script file name + my $gp_file = $img_fname; + $gp_file =~ s{\.[^.]+$}{.gp}; + if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } + + # Determine the default title: + # 5446_6/5446_6.bam.bc.gp -> 5446_6 + # test.aaa.png -> test.aaa + if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } + my $title = $1; + + my $dir = $gp_file; + $dir =~ s{/[^/]+$}{}; + if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } + + my $wh = exists($args{wh}) ? $args{wh} : '600,400'; + + open(my $fh,'>',$gp_file) or error("$gp_file: $!"); + return { + title => $title, + gp => $gp_file, + img => $img_fname, + fh => $fh, + terminal => qq[set terminal png size $wh truecolor], + grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', + }; +} + +sub percentile +{ + my ($p,@vals) = @_; + my $N = 0; + for my $val (@vals) { $N += $val; } + my $n = $p*($N+1)/100.; + my $k = int($n); + my $d = $n-$k; + if ( $k<=0 ) { return 0; } + if ( $k>=$N ) { return scalar @vals-1; } + my $cnt; + for (my $i=0; $i<@vals; $i++) + { + $cnt += $vals[$i]; + if ( $cnt>=$k ) { return $i; } + } + error("FIXME: this should not happen [percentile]\n"); +} + +sub plot_qualities +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } + + my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50; + my $is_paired = $$opts{dat}{'is paired:'}; + + # Average quality per cycle, forward and reverse reads in one plot + my $args = get_defaults($opts,"$$opts{prefix}quals.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Average Quality" + set xlabel "Cycle" + set yrange [0:$yrange] + set title "$$args{title}" + plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ + ]; + my (@fp75,@fp50,@fmean); + my (@lp75,@lp50,@lmean); + my ($fmax,$fmax_qual,$fmax_cycle); + my ($lmax,$lmax_qual,$lmax_cycle); + for my $cycle (@{$$opts{dat}{FFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @fp75, "$$cycle[0]\t$p25\t$p75\n"; + push @fp50, "$$cycle[0]\t$p50\n"; + push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $fmean[-1]; + } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{LFQ}}) + { + my $sum=0; my $n=0; + for (my $iqual=1; $iqual<@$cycle; $iqual++) + { + $sum += $$cycle[$iqual]*$iqual; + $n += $$cycle[$iqual]; + if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } + } + my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); + my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); + my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); + if ( !$n ) { next; } + push @lp75, "$$cycle[0]\t$p25\t$p75\n"; + push @lp50, "$$cycle[0]\t$p50\n"; + push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; + printf $fh $lmean[-1]; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots + $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); + $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.1,0.1 + set size 0.4,0.8 + set yrange [0:$yrange] + set ylabel "Quality" + set xlabel "Cycle (fwd reads)" + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' + ]; + print $fh join('',@fp75),"end\n"; + print $fh join('',@fp50),"end\n"; + print $fh join('',@fmean),"end\n"; + if ( $is_paired ) + { + print $fh qq[ + set origin 0.55,0.1 + set size 0.4,0.8 + unset ytics + set y2tics mirror + set yrange [0:$yrange] + unset ylabel + set xlabel "Cycle (rev reads)" + set label "$$args{title}" at screen 0.5,0.95 center + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' + ]; + print $fh join('',@lp75),"end\n"; + print $fh join('',@lp50),"end\n"; + print $fh join('',@lmean),"end\n"; + } + close($fh); + plot($$args{gp}); + + + + # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve + $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); + $fh = $$args{fh}; + my $nquals = @{$$opts{dat}{FFQ}[0]}-1; + my $ncycles = @{$$opts{dat}{FFQ}}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0.15,0.52 + set size 0.8,0.4 + set title "$$args{title}" + set ylabel "Frequency (fwd reads)" + set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax + unset xlabel + set xrange [0:$nquals] + set format x "" + ]; + my @plots; + for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } + print $fh "end\n"; + } + if ( $is_paired ) + { + print $fh qq[ + set origin 0.15,0.1 + set size 0.8,0.4 + unset title + unset format + set xtics + set xlabel "Quality" + unset label + set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax + set ylabel "Frequency (rev reads)" + ]; + print $fh "plot ", join(",", @plots), "\n"; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<$nquals; $iqual++) + { + print $fh "$iqual\t$$cycle[$iqual]\n"; + } + print $fh "end\n"; + } + } + close($fh); + plot($$args{gp}); + + + # Heatmap qualitites + $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); + $fh = $$args{fh}; + my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax; + my @ytics; + for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + my $ytics = join(',', @ytics); + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + unset key + unset colorbox + set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) + set cbrange [0:$max] + set yrange [0:$ncycles] + set xrange [0:$nquals] + set view map + set multiplot + set rmargin 0 + set lmargin 0 + set tmargin 0 + set bmargin 0 + set origin 0,0.46 + set size 0.95,0.6 + set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles + set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" + set ylabel "Cycle (fwd reads)" offset character -1,0 + unset ytics + set ytics ($ytics) + unset xtics + set title "$$args{title}" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{FFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + @ytics = (); + for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } + $ytics = join(',', @ytics); + print $fh qq[ + set origin 0,0.03 + set size 0.95,0.6 + set ylabel "Cycle (rev reads)" offset character -1,0 + set xlabel "Base Quality" + unset title + unset ytics + set ytics ($ytics) + set xrange [0:$nquals] + set xtics + set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 + set cblabel "Number of bases" + splot '-' matrix with image + ]; + for my $cycle (@{$$opts{dat}{LFQ}}) + { + for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } + print $fh "\n"; + } + print $fh "end\nend\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_acgt_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "green" + set style line 2 linecolor rgb "red" + set style line 3 linecolor rgb "black" + set style line 4 linecolor rgb "blue" + set style increment user + set ylabel "Base content [%]" + set xlabel "Read Cycle" + set yrange [0:100] + set title "$$args{title}" + plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' + ]; + for my $base (1..4) + { + for my $cycle (@{$$opts{dat}{GCC}}) + { + print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; + } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc +{ + my ($opts) = @_; + + my $is_paired = $$opts{dat}{'is paired:'}; + my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); + my $fh = $$args{fh}; + my ($gcl_max,$gcf_max,$lmax,$fmax); + for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } + for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } + my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set title "$$args{title}" + set ylabel "Normalized Frequency" + set xlabel "GC Content [%]" + set yrange [0:1.1] + set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 + plot ] + . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') + . q['-' smooth csplines with lines lc 1 title 'First fragments' ] + . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') + . q[ + ]; + if ( exists($$opts{ref_stats}) ) + { + open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); + while (my $line=<$ref>) { print $fh $line } + close($ref); + print $fh "end\n"; + } + for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } + print $fh "end\n"; + if ( $is_paired ) + { + for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } + print $fh "end\n"; + } + close($fh); + plot($$args{gp}); +} + + +sub plot_gc_depth +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } + + # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. + my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); + for my $gc (@{$$opts{dat}{GCD}}) + { + for my $tic (@tics) + { + my $diff = abs($$gc[0]-$$tic{gc}); + if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } + } + } + + my @x2tics; + for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } + my $x2tics = join(',',@x2tics); + + my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Mapped depth" + set xlabel "Percentile of mapped sequence ordered by GC content" + set x2label "GC Content [%]" + set title "$$args{title}" + set x2tics ($x2tics) + set xtics nomirror + set xrange [0.1:99.9] + + plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ + '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ + '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' + ]; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; + for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_isize +{ + my ($opts) = @_; + + if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } + + my ($isize_max,$isize_cnt); + for my $isize (@{$$opts{dat}{IS}}) + { + if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } + } + + my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set rmargin 5 + set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt + set ylabel "Number of pairs" + set xlabel "Insert Size" + set title "$$args{title}" + plot \\ + '-' with lines lc rgb 'black' title 'All pairs', \\ + '-' with lines title 'Inward', \\ + '-' with lines title 'Outward', \\ + '-' with lines title 'Other' + ]; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; + for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_coverage +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } + + my @vals; + for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } + my $i = percentile(99.8,@vals); + my $p99 = $$opts{dat}{COV}[$i][1]; + + my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set ylabel "Number of mapped bases" + set xlabel "Coverage" + set style fill solid border -1 + set title "$$args{title}" + set xrange [:$p99] + plot '-' with lines notitle + ]; + for my $cov (@{$$opts{dat}{COV}}) + { + if ( $$cov[2]==0 ) { next; } + print $fh "$$cov[1]\t$$cov[2]\n"; + } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + +sub plot_mismatches_per_cycle +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } + if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } + + my $nquals = @{$$opts{dat}{MPC}[0]} - 2; + my $ncycles = @{$$opts{dat}{MPC}}; + my ($style,$with); + if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } + else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linecolor rgb "#e40000" + set style line 2 linecolor rgb "#ff9f00" + set style line 3 linecolor rgb "#eeee00" + set style line 4 linecolor rgb "#4ebd68" + set style line 5 linecolor rgb "#0061ff" + set style increment user + set key left top + $style + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "$$args{title}" + set xrange [-1:$ncycles] + plot '-' $with ti 'Base Quality>30', \\ + '-' $with ti '30>=Q>20', \\ + '-' $with ti '20>=Q>10', \\ + '-' $with ti '10>=Q', \\ + '-' $with ti "N's" + ]; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) + { + my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } + print $fh "$sum\n"; + } + print $fh "end\n"; + for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } + print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_dist +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style increment user + set ylabel "Indel count [log]" + set xlabel "Indel length" + set y2label "Insertions/Deletions ratio" + set log y + set y2tics nomirror + set ytics nomirror + set title "$$args{title}" + plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" + ]; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + +sub plot_indel_cycles +{ + my ($opts) = @_; + + if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } + + my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); + my $fh = $$args{fh}; + print $fh qq[ + $$args{terminal} + set output "$$args{img}" + $$args{grid} + set style line 1 linetype 1 linecolor rgb "red" + set style line 2 linetype 2 linecolor rgb "black" + set style line 3 linetype 3 linecolor rgb "green" + set style line 4 linetype 4 linecolor rgb "blue" + set style increment user + set ylabel "Indel count" + set xlabel "Read Cycle" + set title "$$args{title}" + plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)' + ]; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; + for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; + close($fh); + plot($$args{gp}); +} + + + + + + + +sub has_values +{ + my ($opts,@tags) = @_; + for my $tag (@tags) + { + my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; + chomp($lines[0]); + if ( $lines[0]<2 ) { return 0; } + } + return 1; +} + +sub plot_mismatches_per_cycle_old +{ + my ($opts) = @_; + + my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); + my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; + my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; + chomp($nquals); + chomp($ncycles); + $nquals--; + $ncycles--; + my @gr0_15 = (2..17); + my @gr16_30 = (18..32); + my @gr31_n = (33..$nquals); + my $gr0_15 = '$'. join('+$',@gr0_15); + my $gr16_30 = '$'. join('+$',@gr16_30); + my $gr31_n = '$'. join('+$',@gr31_n); + + open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); + print $fh q[ + set terminal png size 600,400 truecolor font "DejaVuSansMono,9" + set output "] . $$args{img} . q[" + + set key left top + set style data histogram + set style histogram rowstacked + + set grid back lc rgb "#aaaaaa" + set ylabel "Number of mismatches" + set xlabel "Read Cycle" + set style fill solid border -1 + set title "] . $$args{title} . qq[" + set xrange [-1:$ncycles] + + plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' + ]; + close($fh); + + plot($$args{gp}); +} + +