Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/plot-bamcheck @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 # | |
| 3 # Author: petr.danecek@sanger | |
| 4 # | |
| 5 | |
| 6 use strict; | |
| 7 use warnings; | |
| 8 use Carp; | |
| 9 | |
| 10 my $opts = parse_params(); | |
| 11 parse_bamcheck($opts); | |
| 12 plot_qualities($opts); | |
| 13 plot_acgt_cycles($opts); | |
| 14 plot_gc($opts); | |
| 15 plot_gc_depth($opts); | |
| 16 plot_isize($opts); | |
| 17 plot_coverage($opts); | |
| 18 plot_mismatches_per_cycle($opts); | |
| 19 plot_indel_dist($opts); | |
| 20 plot_indel_cycles($opts); | |
| 21 | |
| 22 exit; | |
| 23 | |
| 24 #-------------------------------- | |
| 25 | |
| 26 sub error | |
| 27 { | |
| 28 my (@msg) = @_; | |
| 29 if ( scalar @msg ) { confess @msg; } | |
| 30 die | |
| 31 "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", | |
| 32 " plot-bamcheck -p outdir/ file.bam.bc\n", | |
| 33 "Options:\n", | |
| 34 " -k, --keep-files Do not remove temporary files.\n", | |
| 35 " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n", | |
| 36 " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n", | |
| 37 " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n", | |
| 38 " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", | |
| 39 " -h, -?, --help This help message.\n", | |
| 40 "\n"; | |
| 41 } | |
| 42 | |
| 43 | |
| 44 sub parse_params | |
| 45 { | |
| 46 $0 =~ s{^.+/}{}; | |
| 47 my $opts = { args=>join(' ',$0,@ARGV) }; | |
| 48 while (defined(my $arg=shift(@ARGV))) | |
| 49 { | |
| 50 if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } | |
| 51 if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } | |
| 52 if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } | |
| 53 if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } | |
| 54 if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } | |
| 55 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } | |
| 56 if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } | |
| 57 error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); | |
| 58 } | |
| 59 if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } | |
| 60 if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } | |
| 61 if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } | |
| 62 if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } | |
| 63 elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } | |
| 64 return $opts; | |
| 65 } | |
| 66 | |
| 67 | |
| 68 # Creates GC stats for either the whole reference or only on target regions for exome QC | |
| 69 sub do_ref_stats | |
| 70 { | |
| 71 my ($opts) = @_; | |
| 72 | |
| 73 | |
| 74 my %targets = (); | |
| 75 if ( exists($$opts{targets}) ) | |
| 76 { | |
| 77 my ($prev_chr,$prev_pos); | |
| 78 open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); | |
| 79 while (my $line=<$fh>) | |
| 80 { | |
| 81 if ( $line=~/^#/ ) { next; } | |
| 82 my ($chr,$from,$to) = split(/\s+/,$line); | |
| 83 chomp($to); | |
| 84 push @{$targets{$chr}}, $from,$to; | |
| 85 if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } | |
| 86 if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } | |
| 87 $prev_pos = $from; | |
| 88 } | |
| 89 close($fh); | |
| 90 } | |
| 91 | |
| 92 my $_len = 60; # for now do only standard fasta's with 60 bases per line | |
| 93 my %gc_counts = (); | |
| 94 my ($skip_chr,$pos,$ireg,$regions); | |
| 95 open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); | |
| 96 while (my $line=<$fh>) | |
| 97 { | |
| 98 if ( $line=~/^>/ ) | |
| 99 { | |
| 100 if ( !scalar %targets ) { next; } | |
| 101 | |
| 102 if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } | |
| 103 if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } | |
| 104 undef $skip_chr; | |
| 105 $pos = 0; | |
| 106 $ireg = 0; | |
| 107 $regions = $targets{$1}; | |
| 108 } | |
| 109 if ( defined $skip_chr ) { next; } | |
| 110 | |
| 111 # Only $_len sized lines are considered and no chopping for target regions. | |
| 112 chomp($line); | |
| 113 my $len = length($line); | |
| 114 if ( $len ne $_len ) { next; } | |
| 115 | |
| 116 if ( scalar %targets ) | |
| 117 { | |
| 118 while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } | |
| 119 $pos += $len; | |
| 120 if ( $ireg==@$regions ) { next; } | |
| 121 if ( $pos < $$regions[$ireg] ) { next; } | |
| 122 } | |
| 123 | |
| 124 my $gc_count = 0; | |
| 125 for (my $i=0; $i<$len; $i++) | |
| 126 { | |
| 127 my $base = substr($line,$i,1); | |
| 128 if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } | |
| 129 } | |
| 130 $gc_counts{$gc_count}++; | |
| 131 } | |
| 132 | |
| 133 print "# Generated by $$opts{args}\n"; | |
| 134 print "# The columns are: GC content bin, normalized frequency\n"; | |
| 135 my $max; | |
| 136 for my $count (values %gc_counts) | |
| 137 { | |
| 138 if ( !defined $max or $count>$max ) { $max=$count; } | |
| 139 } | |
| 140 for my $gc (sort {$a<=>$b} keys %gc_counts) | |
| 141 { | |
| 142 if ( $gc==0 ) { next; } | |
| 143 printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; | |
| 144 } | |
| 145 } | |
| 146 | |
| 147 sub plot | |
| 148 { | |
| 149 my ($cmdfile) = @_; | |
| 150 my $cmd = "gnuplot $cmdfile"; | |
| 151 system($cmd); | |
| 152 if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } | |
| 153 } | |
| 154 | |
| 155 | |
| 156 sub parse_bamcheck | |
| 157 { | |
| 158 my ($opts) = @_; | |
| 159 open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); | |
| 160 my $line = <$fh>; | |
| 161 if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } | |
| 162 $$opts{dat}{version} = $1; | |
| 163 while ($line=<$fh>) | |
| 164 { | |
| 165 if ( $line=~/^#/ ) { next; } | |
| 166 my @items = split(/\t/,$line); | |
| 167 chomp($items[-1]); | |
| 168 if ( $items[0] eq 'SN' ) | |
| 169 { | |
| 170 $$opts{dat}{$items[1]} = splice(@items,2); | |
| 171 next; | |
| 172 } | |
| 173 push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; | |
| 174 } | |
| 175 close($fh); | |
| 176 | |
| 177 # Check sanity | |
| 178 if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) | |
| 179 { | |
| 180 error("Sanity check failed: no sequences found by bamcheck??\n"); | |
| 181 } | |
| 182 } | |
| 183 | |
| 184 sub older_than | |
| 185 { | |
| 186 my ($opts,$version) = @_; | |
| 187 my ($year,$month,$day) = split(/-/,$version); | |
| 188 $version = $$opts{dat}{version}; | |
| 189 if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } | |
| 190 if ( $1<$year ) { return 1; } | |
| 191 elsif ( $1>$year ) { return 0; } | |
| 192 if ( $2<$month ) { return 1; } | |
| 193 elsif ( $2>$month ) { return 0; } | |
| 194 if ( $3<$day ) { return 1; } | |
| 195 return 0; | |
| 196 } | |
| 197 | |
| 198 sub get_defaults | |
| 199 { | |
| 200 my ($opts,$img_fname,%args) = @_; | |
| 201 | |
| 202 if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } | |
| 203 | |
| 204 # Determine the gnuplot script file name | |
| 205 my $gp_file = $img_fname; | |
| 206 $gp_file =~ s{\.[^.]+$}{.gp}; | |
| 207 if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } | |
| 208 | |
| 209 # Determine the default title: | |
| 210 # 5446_6/5446_6.bam.bc.gp -> 5446_6 | |
| 211 # test.aaa.png -> test.aaa | |
| 212 if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } | |
| 213 my $title = $1; | |
| 214 | |
| 215 my $dir = $gp_file; | |
| 216 $dir =~ s{/[^/]+$}{}; | |
| 217 if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } | |
| 218 | |
| 219 my $wh = exists($args{wh}) ? $args{wh} : '600,400'; | |
| 220 | |
| 221 open(my $fh,'>',$gp_file) or error("$gp_file: $!"); | |
| 222 return { | |
| 223 title => $title, | |
| 224 gp => $gp_file, | |
| 225 img => $img_fname, | |
| 226 fh => $fh, | |
| 227 terminal => qq[set terminal png size $wh truecolor], | |
| 228 grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', | |
| 229 }; | |
| 230 } | |
| 231 | |
| 232 sub percentile | |
| 233 { | |
| 234 my ($p,@vals) = @_; | |
| 235 my $N = 0; | |
| 236 for my $val (@vals) { $N += $val; } | |
| 237 my $n = $p*($N+1)/100.; | |
| 238 my $k = int($n); | |
| 239 my $d = $n-$k; | |
| 240 if ( $k<=0 ) { return 0; } | |
| 241 if ( $k>=$N ) { return scalar @vals-1; } | |
| 242 my $cnt; | |
| 243 for (my $i=0; $i<@vals; $i++) | |
| 244 { | |
| 245 $cnt += $vals[$i]; | |
| 246 if ( $cnt>=$k ) { return $i; } | |
| 247 } | |
| 248 error("FIXME: this should not happen [percentile]\n"); | |
| 249 } | |
| 250 | |
| 251 sub plot_qualities | |
| 252 { | |
| 253 my ($opts) = @_; | |
| 254 | |
| 255 if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } | |
| 256 | |
| 257 my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50; | |
| 258 my $is_paired = $$opts{dat}{'is paired:'}; | |
| 259 | |
| 260 # Average quality per cycle, forward and reverse reads in one plot | |
| 261 my $args = get_defaults($opts,"$$opts{prefix}quals.png"); | |
| 262 my $fh = $$args{fh}; | |
| 263 print $fh qq[ | |
| 264 $$args{terminal} | |
| 265 set output "$$args{img}" | |
| 266 $$args{grid} | |
| 267 set ylabel "Average Quality" | |
| 268 set xlabel "Cycle" | |
| 269 set yrange [0:$yrange] | |
| 270 set title "$$args{title}" | |
| 271 plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ | |
| 272 ]; | |
| 273 my (@fp75,@fp50,@fmean); | |
| 274 my (@lp75,@lp50,@lmean); | |
| 275 my ($fmax,$fmax_qual,$fmax_cycle); | |
| 276 my ($lmax,$lmax_qual,$lmax_cycle); | |
| 277 for my $cycle (@{$$opts{dat}{FFQ}}) | |
| 278 { | |
| 279 my $sum=0; my $n=0; | |
| 280 for (my $iqual=1; $iqual<@$cycle; $iqual++) | |
| 281 { | |
| 282 $sum += $$cycle[$iqual]*$iqual; | |
| 283 $n += $$cycle[$iqual]; | |
| 284 if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } | |
| 285 } | |
| 286 my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); | |
| 287 my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); | |
| 288 my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); | |
| 289 if ( !$n ) { next; } | |
| 290 push @fp75, "$$cycle[0]\t$p25\t$p75\n"; | |
| 291 push @fp50, "$$cycle[0]\t$p50\n"; | |
| 292 push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; | |
| 293 printf $fh $fmean[-1]; | |
| 294 } | |
| 295 print $fh "end\n"; | |
| 296 if ( $is_paired ) | |
| 297 { | |
| 298 for my $cycle (@{$$opts{dat}{LFQ}}) | |
| 299 { | |
| 300 my $sum=0; my $n=0; | |
| 301 for (my $iqual=1; $iqual<@$cycle; $iqual++) | |
| 302 { | |
| 303 $sum += $$cycle[$iqual]*$iqual; | |
| 304 $n += $$cycle[$iqual]; | |
| 305 if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } | |
| 306 } | |
| 307 my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); | |
| 308 my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); | |
| 309 my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); | |
| 310 if ( !$n ) { next; } | |
| 311 push @lp75, "$$cycle[0]\t$p25\t$p75\n"; | |
| 312 push @lp50, "$$cycle[0]\t$p50\n"; | |
| 313 push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; | |
| 314 printf $fh $lmean[-1]; | |
| 315 } | |
| 316 print $fh "end\n"; | |
| 317 } | |
| 318 close($fh); | |
| 319 plot($$args{gp}); | |
| 320 | |
| 321 | |
| 322 | |
| 323 # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots | |
| 324 $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); | |
| 325 $fh = $$args{fh}; | |
| 326 print $fh qq[ | |
| 327 $$args{terminal} | |
| 328 set output "$$args{img}" | |
| 329 $$args{grid} | |
| 330 set multiplot | |
| 331 set rmargin 0 | |
| 332 set lmargin 0 | |
| 333 set tmargin 0 | |
| 334 set bmargin 0 | |
| 335 set origin 0.1,0.1 | |
| 336 set size 0.4,0.8 | |
| 337 set yrange [0:$yrange] | |
| 338 set ylabel "Quality" | |
| 339 set xlabel "Cycle (fwd reads)" | |
| 340 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' | |
| 341 ]; | |
| 342 print $fh join('',@fp75),"end\n"; | |
| 343 print $fh join('',@fp50),"end\n"; | |
| 344 print $fh join('',@fmean),"end\n"; | |
| 345 if ( $is_paired ) | |
| 346 { | |
| 347 print $fh qq[ | |
| 348 set origin 0.55,0.1 | |
| 349 set size 0.4,0.8 | |
| 350 unset ytics | |
| 351 set y2tics mirror | |
| 352 set yrange [0:$yrange] | |
| 353 unset ylabel | |
| 354 set xlabel "Cycle (rev reads)" | |
| 355 set label "$$args{title}" at screen 0.5,0.95 center | |
| 356 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' | |
| 357 ]; | |
| 358 print $fh join('',@lp75),"end\n"; | |
| 359 print $fh join('',@lp50),"end\n"; | |
| 360 print $fh join('',@lmean),"end\n"; | |
| 361 } | |
| 362 close($fh); | |
| 363 plot($$args{gp}); | |
| 364 | |
| 365 | |
| 366 | |
| 367 # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve | |
| 368 $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); | |
| 369 $fh = $$args{fh}; | |
| 370 my $nquals = @{$$opts{dat}{FFQ}[0]}-1; | |
| 371 my $ncycles = @{$$opts{dat}{FFQ}}; | |
| 372 print $fh qq[ | |
| 373 $$args{terminal} | |
| 374 set output "$$args{img}" | |
| 375 $$args{grid} | |
| 376 set multiplot | |
| 377 set rmargin 0 | |
| 378 set lmargin 0 | |
| 379 set tmargin 0 | |
| 380 set bmargin 0 | |
| 381 set origin 0.15,0.52 | |
| 382 set size 0.8,0.4 | |
| 383 set title "$$args{title}" | |
| 384 set ylabel "Frequency (fwd reads)" | |
| 385 set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax | |
| 386 unset xlabel | |
| 387 set xrange [0:$nquals] | |
| 388 set format x "" | |
| 389 ]; | |
| 390 my @plots; | |
| 391 for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } | |
| 392 print $fh "plot ", join(",", @plots), "\n"; | |
| 393 for my $cycle (@{$$opts{dat}{FFQ}}) | |
| 394 { | |
| 395 for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } | |
| 396 print $fh "end\n"; | |
| 397 } | |
| 398 if ( $is_paired ) | |
| 399 { | |
| 400 print $fh qq[ | |
| 401 set origin 0.15,0.1 | |
| 402 set size 0.8,0.4 | |
| 403 unset title | |
| 404 unset format | |
| 405 set xtics | |
| 406 set xlabel "Quality" | |
| 407 unset label | |
| 408 set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax | |
| 409 set ylabel "Frequency (rev reads)" | |
| 410 ]; | |
| 411 print $fh "plot ", join(",", @plots), "\n"; | |
| 412 for my $cycle (@{$$opts{dat}{LFQ}}) | |
| 413 { | |
| 414 for (my $iqual=1; $iqual<$nquals; $iqual++) | |
| 415 { | |
| 416 print $fh "$iqual\t$$cycle[$iqual]\n"; | |
| 417 } | |
| 418 print $fh "end\n"; | |
| 419 } | |
| 420 } | |
| 421 close($fh); | |
| 422 plot($$args{gp}); | |
| 423 | |
| 424 | |
| 425 # Heatmap qualitites | |
| 426 $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); | |
| 427 $fh = $$args{fh}; | |
| 428 my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax; | |
| 429 my @ytics; | |
| 430 for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } | |
| 431 my $ytics = join(',', @ytics); | |
| 432 print $fh qq[ | |
| 433 $$args{terminal} | |
| 434 set output "$$args{img}" | |
| 435 unset key | |
| 436 unset colorbox | |
| 437 set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) | |
| 438 set cbrange [0:$max] | |
| 439 set yrange [0:$ncycles] | |
| 440 set xrange [0:$nquals] | |
| 441 set view map | |
| 442 set multiplot | |
| 443 set rmargin 0 | |
| 444 set lmargin 0 | |
| 445 set tmargin 0 | |
| 446 set bmargin 0 | |
| 447 set origin 0,0.46 | |
| 448 set size 0.95,0.6 | |
| 449 set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles | |
| 450 set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" | |
| 451 set ylabel "Cycle (fwd reads)" offset character -1,0 | |
| 452 unset ytics | |
| 453 set ytics ($ytics) | |
| 454 unset xtics | |
| 455 set title "$$args{title}" | |
| 456 splot '-' matrix with image | |
| 457 ]; | |
| 458 for my $cycle (@{$$opts{dat}{FFQ}}) | |
| 459 { | |
| 460 for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } | |
| 461 print $fh "\n"; | |
| 462 } | |
| 463 print $fh "end\nend\n"; | |
| 464 @ytics = (); | |
| 465 for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } | |
| 466 $ytics = join(',', @ytics); | |
| 467 print $fh qq[ | |
| 468 set origin 0,0.03 | |
| 469 set size 0.95,0.6 | |
| 470 set ylabel "Cycle (rev reads)" offset character -1,0 | |
| 471 set xlabel "Base Quality" | |
| 472 unset title | |
| 473 unset ytics | |
| 474 set ytics ($ytics) | |
| 475 set xrange [0:$nquals] | |
| 476 set xtics | |
| 477 set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 | |
| 478 set cblabel "Number of bases" | |
| 479 splot '-' matrix with image | |
| 480 ]; | |
| 481 for my $cycle (@{$$opts{dat}{LFQ}}) | |
| 482 { | |
| 483 for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } | |
| 484 print $fh "\n"; | |
| 485 } | |
| 486 print $fh "end\nend\n"; | |
| 487 close($fh); | |
| 488 plot($$args{gp}); | |
| 489 } | |
| 490 | |
| 491 | |
| 492 sub plot_acgt_cycles | |
| 493 { | |
| 494 my ($opts) = @_; | |
| 495 | |
| 496 if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } | |
| 497 | |
| 498 my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); | |
| 499 my $fh = $$args{fh}; | |
| 500 print $fh qq[ | |
| 501 $$args{terminal} | |
| 502 set output "$$args{img}" | |
| 503 $$args{grid} | |
| 504 set style line 1 linecolor rgb "green" | |
| 505 set style line 2 linecolor rgb "red" | |
| 506 set style line 3 linecolor rgb "black" | |
| 507 set style line 4 linecolor rgb "blue" | |
| 508 set style increment user | |
| 509 set ylabel "Base content [%]" | |
| 510 set xlabel "Read Cycle" | |
| 511 set yrange [0:100] | |
| 512 set title "$$args{title}" | |
| 513 plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' | |
| 514 ]; | |
| 515 for my $base (1..4) | |
| 516 { | |
| 517 for my $cycle (@{$$opts{dat}{GCC}}) | |
| 518 { | |
| 519 print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; | |
| 520 } | |
| 521 print $fh "end\n"; | |
| 522 } | |
| 523 close($fh); | |
| 524 plot($$args{gp}); | |
| 525 } | |
| 526 | |
| 527 | |
| 528 sub plot_gc | |
| 529 { | |
| 530 my ($opts) = @_; | |
| 531 | |
| 532 my $is_paired = $$opts{dat}{'is paired:'}; | |
| 533 my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); | |
| 534 my $fh = $$args{fh}; | |
| 535 my ($gcl_max,$gcf_max,$lmax,$fmax); | |
| 536 for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } | |
| 537 for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } | |
| 538 my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; | |
| 539 print $fh qq[ | |
| 540 $$args{terminal} | |
| 541 set output "$$args{img}" | |
| 542 $$args{grid} | |
| 543 set title "$$args{title}" | |
| 544 set ylabel "Normalized Frequency" | |
| 545 set xlabel "GC Content [%]" | |
| 546 set yrange [0:1.1] | |
| 547 set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 | |
| 548 plot ] | |
| 549 . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') | |
| 550 . q['-' smooth csplines with lines lc 1 title 'First fragments' ] | |
| 551 . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') | |
| 552 . q[ | |
| 553 ]; | |
| 554 if ( exists($$opts{ref_stats}) ) | |
| 555 { | |
| 556 open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); | |
| 557 while (my $line=<$ref>) { print $fh $line } | |
| 558 close($ref); | |
| 559 print $fh "end\n"; | |
| 560 } | |
| 561 for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } | |
| 562 print $fh "end\n"; | |
| 563 if ( $is_paired ) | |
| 564 { | |
| 565 for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } | |
| 566 print $fh "end\n"; | |
| 567 } | |
| 568 close($fh); | |
| 569 plot($$args{gp}); | |
| 570 } | |
| 571 | |
| 572 | |
| 573 sub plot_gc_depth | |
| 574 { | |
| 575 my ($opts) = @_; | |
| 576 | |
| 577 if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } | |
| 578 | |
| 579 # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. | |
| 580 my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); | |
| 581 for my $gc (@{$$opts{dat}{GCD}}) | |
| 582 { | |
| 583 for my $tic (@tics) | |
| 584 { | |
| 585 my $diff = abs($$gc[0]-$$tic{gc}); | |
| 586 if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } | |
| 587 } | |
| 588 } | |
| 589 | |
| 590 my @x2tics; | |
| 591 for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } | |
| 592 my $x2tics = join(',',@x2tics); | |
| 593 | |
| 594 my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); | |
| 595 my $fh = $$args{fh}; | |
| 596 print $fh qq[ | |
| 597 $$args{terminal} | |
| 598 set output "$$args{img}" | |
| 599 $$args{grid} | |
| 600 set ylabel "Mapped depth" | |
| 601 set xlabel "Percentile of mapped sequence ordered by GC content" | |
| 602 set x2label "GC Content [%]" | |
| 603 set title "$$args{title}" | |
| 604 set x2tics ($x2tics) | |
| 605 set xtics nomirror | |
| 606 set xrange [0.1:99.9] | |
| 607 | |
| 608 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ | |
| 609 '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ | |
| 610 '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' | |
| 611 ]; | |
| 612 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; | |
| 613 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; | |
| 614 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n"; | |
| 615 close($fh); | |
| 616 plot($$args{gp}); | |
| 617 } | |
| 618 | |
| 619 | |
| 620 sub plot_isize | |
| 621 { | |
| 622 my ($opts) = @_; | |
| 623 | |
| 624 if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } | |
| 625 | |
| 626 my ($isize_max,$isize_cnt); | |
| 627 for my $isize (@{$$opts{dat}{IS}}) | |
| 628 { | |
| 629 if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } | |
| 630 } | |
| 631 | |
| 632 my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); | |
| 633 my $fh = $$args{fh}; | |
| 634 print $fh qq[ | |
| 635 $$args{terminal} | |
| 636 set output "$$args{img}" | |
| 637 $$args{grid} | |
| 638 set rmargin 5 | |
| 639 set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt | |
| 640 set ylabel "Number of pairs" | |
| 641 set xlabel "Insert Size" | |
| 642 set title "$$args{title}" | |
| 643 plot \\ | |
| 644 '-' with lines lc rgb 'black' title 'All pairs', \\ | |
| 645 '-' with lines title 'Inward', \\ | |
| 646 '-' with lines title 'Outward', \\ | |
| 647 '-' with lines title 'Other' | |
| 648 ]; | |
| 649 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; | |
| 650 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; | |
| 651 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; | |
| 652 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; | |
| 653 close($fh); | |
| 654 plot($$args{gp}); | |
| 655 } | |
| 656 | |
| 657 | |
| 658 sub plot_coverage | |
| 659 { | |
| 660 my ($opts) = @_; | |
| 661 | |
| 662 if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } | |
| 663 | |
| 664 my @vals; | |
| 665 for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } | |
| 666 my $i = percentile(99.8,@vals); | |
| 667 my $p99 = $$opts{dat}{COV}[$i][1]; | |
| 668 | |
| 669 my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); | |
| 670 my $fh = $$args{fh}; | |
| 671 print $fh qq[ | |
| 672 $$args{terminal} | |
| 673 set output "$$args{img}" | |
| 674 $$args{grid} | |
| 675 set ylabel "Number of mapped bases" | |
| 676 set xlabel "Coverage" | |
| 677 set style fill solid border -1 | |
| 678 set title "$$args{title}" | |
| 679 set xrange [:$p99] | |
| 680 plot '-' with lines notitle | |
| 681 ]; | |
| 682 for my $cov (@{$$opts{dat}{COV}}) | |
| 683 { | |
| 684 if ( $$cov[2]==0 ) { next; } | |
| 685 print $fh "$$cov[1]\t$$cov[2]\n"; | |
| 686 } | |
| 687 print $fh "end\n"; | |
| 688 close($fh); | |
| 689 plot($$args{gp}); | |
| 690 } | |
| 691 | |
| 692 | |
| 693 sub plot_mismatches_per_cycle | |
| 694 { | |
| 695 my ($opts) = @_; | |
| 696 | |
| 697 if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } | |
| 698 if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } | |
| 699 | |
| 700 my $nquals = @{$$opts{dat}{MPC}[0]} - 2; | |
| 701 my $ncycles = @{$$opts{dat}{MPC}}; | |
| 702 my ($style,$with); | |
| 703 if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } | |
| 704 else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } | |
| 705 | |
| 706 my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); | |
| 707 my $fh = $$args{fh}; | |
| 708 print $fh qq[ | |
| 709 $$args{terminal} | |
| 710 set output "$$args{img}" | |
| 711 $$args{grid} | |
| 712 set style line 1 linecolor rgb "#e40000" | |
| 713 set style line 2 linecolor rgb "#ff9f00" | |
| 714 set style line 3 linecolor rgb "#eeee00" | |
| 715 set style line 4 linecolor rgb "#4ebd68" | |
| 716 set style line 5 linecolor rgb "#0061ff" | |
| 717 set style increment user | |
| 718 set key left top | |
| 719 $style | |
| 720 set ylabel "Number of mismatches" | |
| 721 set xlabel "Read Cycle" | |
| 722 set style fill solid border -1 | |
| 723 set title "$$args{title}" | |
| 724 set xrange [-1:$ncycles] | |
| 725 plot '-' $with ti 'Base Quality>30', \\ | |
| 726 '-' $with ti '30>=Q>20', \\ | |
| 727 '-' $with ti '20>=Q>10', \\ | |
| 728 '-' $with ti '10>=Q', \\ | |
| 729 '-' $with ti "N's" | |
| 730 ]; | |
| 731 for my $cycle (@{$$opts{dat}{MPC}}) | |
| 732 { | |
| 733 my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } | |
| 734 print $fh "$sum\n"; | |
| 735 } | |
| 736 print $fh "end\n"; | |
| 737 for my $cycle (@{$$opts{dat}{MPC}}) | |
| 738 { | |
| 739 my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } | |
| 740 print $fh "$sum\n"; | |
| 741 } | |
| 742 print $fh "end\n"; | |
| 743 for my $cycle (@{$$opts{dat}{MPC}}) | |
| 744 { | |
| 745 my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } | |
| 746 print $fh "$sum\n"; | |
| 747 } | |
| 748 print $fh "end\n"; | |
| 749 for my $cycle (@{$$opts{dat}{MPC}}) | |
| 750 { | |
| 751 my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } | |
| 752 print $fh "$sum\n"; | |
| 753 } | |
| 754 print $fh "end\n"; | |
| 755 for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } | |
| 756 print $fh "end\n"; | |
| 757 close($fh); | |
| 758 plot($$args{gp}); | |
| 759 } | |
| 760 | |
| 761 sub plot_indel_dist | |
| 762 { | |
| 763 my ($opts) = @_; | |
| 764 | |
| 765 if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } | |
| 766 | |
| 767 my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); | |
| 768 my $fh = $$args{fh}; | |
| 769 print $fh qq[ | |
| 770 $$args{terminal} | |
| 771 set output "$$args{img}" | |
| 772 $$args{grid} | |
| 773 set style line 1 linetype 1 linecolor rgb "red" | |
| 774 set style line 2 linetype 2 linecolor rgb "black" | |
| 775 set style line 3 linetype 3 linecolor rgb "green" | |
| 776 set style increment user | |
| 777 set ylabel "Indel count [log]" | |
| 778 set xlabel "Indel length" | |
| 779 set y2label "Insertions/Deletions ratio" | |
| 780 set log y | |
| 781 set y2tics nomirror | |
| 782 set ytics nomirror | |
| 783 set title "$$args{title}" | |
| 784 plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" | |
| 785 ]; | |
| 786 for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; | |
| 787 for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; | |
| 788 for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; | |
| 789 close($fh); | |
| 790 plot($$args{gp}); | |
| 791 } | |
| 792 | |
| 793 sub plot_indel_cycles | |
| 794 { | |
| 795 my ($opts) = @_; | |
| 796 | |
| 797 if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } | |
| 798 | |
| 799 my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); | |
| 800 my $fh = $$args{fh}; | |
| 801 print $fh qq[ | |
| 802 $$args{terminal} | |
| 803 set output "$$args{img}" | |
| 804 $$args{grid} | |
| 805 set style line 1 linetype 1 linecolor rgb "red" | |
| 806 set style line 2 linetype 2 linecolor rgb "black" | |
| 807 set style line 3 linetype 3 linecolor rgb "green" | |
| 808 set style line 4 linetype 4 linecolor rgb "blue" | |
| 809 set style increment user | |
| 810 set ylabel "Indel count" | |
| 811 set xlabel "Read Cycle" | |
| 812 set title "$$args{title}" | |
| 813 plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)' | |
| 814 ]; | |
| 815 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; | |
| 816 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; | |
| 817 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; | |
| 818 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; | |
| 819 close($fh); | |
| 820 plot($$args{gp}); | |
| 821 } | |
| 822 | |
| 823 | |
| 824 | |
| 825 | |
| 826 | |
| 827 | |
| 828 | |
| 829 sub has_values | |
| 830 { | |
| 831 my ($opts,@tags) = @_; | |
| 832 for my $tag (@tags) | |
| 833 { | |
| 834 my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; | |
| 835 chomp($lines[0]); | |
| 836 if ( $lines[0]<2 ) { return 0; } | |
| 837 } | |
| 838 return 1; | |
| 839 } | |
| 840 | |
| 841 sub plot_mismatches_per_cycle_old | |
| 842 { | |
| 843 my ($opts) = @_; | |
| 844 | |
| 845 my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); | |
| 846 my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; | |
| 847 my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; | |
| 848 chomp($nquals); | |
| 849 chomp($ncycles); | |
| 850 $nquals--; | |
| 851 $ncycles--; | |
| 852 my @gr0_15 = (2..17); | |
| 853 my @gr16_30 = (18..32); | |
| 854 my @gr31_n = (33..$nquals); | |
| 855 my $gr0_15 = '$'. join('+$',@gr0_15); | |
| 856 my $gr16_30 = '$'. join('+$',@gr16_30); | |
| 857 my $gr31_n = '$'. join('+$',@gr31_n); | |
| 858 | |
| 859 open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); | |
| 860 print $fh q[ | |
| 861 set terminal png size 600,400 truecolor font "DejaVuSansMono,9" | |
| 862 set output "] . $$args{img} . q[" | |
| 863 | |
| 864 set key left top | |
| 865 set style data histogram | |
| 866 set style histogram rowstacked | |
| 867 | |
| 868 set grid back lc rgb "#aaaaaa" | |
| 869 set ylabel "Number of mismatches" | |
| 870 set xlabel "Read Cycle" | |
| 871 set style fill solid border -1 | |
| 872 set title "] . $$args{title} . qq[" | |
| 873 set xrange [-1:$ncycles] | |
| 874 | |
| 875 plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' | |
| 876 ]; | |
| 877 close($fh); | |
| 878 | |
| 879 plot($$args{gp}); | |
| 880 } | |
| 881 | |
| 882 |
