Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/misc/plot-bamcheck @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:903fc43d6227 |
---|---|
1 #!/usr/bin/env perl | |
2 # | |
3 # Author: petr.danecek@sanger | |
4 # | |
5 | |
6 use strict; | |
7 use warnings; | |
8 use Carp; | |
9 | |
10 my $opts = parse_params(); | |
11 parse_bamcheck($opts); | |
12 plot_qualities($opts); | |
13 plot_acgt_cycles($opts); | |
14 plot_gc($opts); | |
15 plot_gc_depth($opts); | |
16 plot_isize($opts); | |
17 plot_coverage($opts); | |
18 plot_mismatches_per_cycle($opts); | |
19 plot_indel_dist($opts); | |
20 plot_indel_cycles($opts); | |
21 | |
22 exit; | |
23 | |
24 #-------------------------------- | |
25 | |
26 sub error | |
27 { | |
28 my (@msg) = @_; | |
29 if ( scalar @msg ) { confess @msg; } | |
30 die | |
31 "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n", | |
32 " plot-bamcheck -p outdir/ file.bam.bc\n", | |
33 "Options:\n", | |
34 " -k, --keep-files Do not remove temporary files.\n", | |
35 " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n", | |
36 " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n", | |
37 " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n", | |
38 " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n", | |
39 " -h, -?, --help This help message.\n", | |
40 "\n"; | |
41 } | |
42 | |
43 | |
44 sub parse_params | |
45 { | |
46 $0 =~ s{^.+/}{}; | |
47 my $opts = { args=>join(' ',$0,@ARGV) }; | |
48 while (defined(my $arg=shift(@ARGV))) | |
49 { | |
50 if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; } | |
51 if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; } | |
52 if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; } | |
53 if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; } | |
54 if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; } | |
55 if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); } | |
56 if ( -e $arg ) { $$opts{bamcheck}=$arg; next; } | |
57 error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n"); | |
58 } | |
59 if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; } | |
60 if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") } | |
61 if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") } | |
62 if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; } | |
63 elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; } | |
64 return $opts; | |
65 } | |
66 | |
67 | |
68 # Creates GC stats for either the whole reference or only on target regions for exome QC | |
69 sub do_ref_stats | |
70 { | |
71 my ($opts) = @_; | |
72 | |
73 | |
74 my %targets = (); | |
75 if ( exists($$opts{targets}) ) | |
76 { | |
77 my ($prev_chr,$prev_pos); | |
78 open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!"); | |
79 while (my $line=<$fh>) | |
80 { | |
81 if ( $line=~/^#/ ) { next; } | |
82 my ($chr,$from,$to) = split(/\s+/,$line); | |
83 chomp($to); | |
84 push @{$targets{$chr}}, $from,$to; | |
85 if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from } | |
86 if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); } | |
87 $prev_pos = $from; | |
88 } | |
89 close($fh); | |
90 } | |
91 | |
92 my $_len = 60; # for now do only standard fasta's with 60 bases per line | |
93 my %gc_counts = (); | |
94 my ($skip_chr,$pos,$ireg,$regions); | |
95 open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!"); | |
96 while (my $line=<$fh>) | |
97 { | |
98 if ( $line=~/^>/ ) | |
99 { | |
100 if ( !scalar %targets ) { next; } | |
101 | |
102 if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); } | |
103 if ( !exists($targets{$1}) ) { $skip_chr=$1; next; } | |
104 undef $skip_chr; | |
105 $pos = 0; | |
106 $ireg = 0; | |
107 $regions = $targets{$1}; | |
108 } | |
109 if ( defined $skip_chr ) { next; } | |
110 | |
111 # Only $_len sized lines are considered and no chopping for target regions. | |
112 chomp($line); | |
113 my $len = length($line); | |
114 if ( $len ne $_len ) { next; } | |
115 | |
116 if ( scalar %targets ) | |
117 { | |
118 while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; } | |
119 $pos += $len; | |
120 if ( $ireg==@$regions ) { next; } | |
121 if ( $pos < $$regions[$ireg] ) { next; } | |
122 } | |
123 | |
124 my $gc_count = 0; | |
125 for (my $i=0; $i<$len; $i++) | |
126 { | |
127 my $base = substr($line,$i,1); | |
128 if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; } | |
129 } | |
130 $gc_counts{$gc_count}++; | |
131 } | |
132 | |
133 print "# Generated by $$opts{args}\n"; | |
134 print "# The columns are: GC content bin, normalized frequency\n"; | |
135 my $max; | |
136 for my $count (values %gc_counts) | |
137 { | |
138 if ( !defined $max or $count>$max ) { $max=$count; } | |
139 } | |
140 for my $gc (sort {$a<=>$b} keys %gc_counts) | |
141 { | |
142 if ( $gc==0 ) { next; } | |
143 printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max; | |
144 } | |
145 } | |
146 | |
147 sub plot | |
148 { | |
149 my ($cmdfile) = @_; | |
150 my $cmd = "gnuplot $cmdfile"; | |
151 system($cmd); | |
152 if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); } | |
153 } | |
154 | |
155 | |
156 sub parse_bamcheck | |
157 { | |
158 my ($opts) = @_; | |
159 open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!"); | |
160 my $line = <$fh>; | |
161 if ( !($line=~/^# This file was produced by bamcheck (\S+)/) ) { error("Sanity check failed: was this file generated by bamcheck?"); } | |
162 $$opts{dat}{version} = $1; | |
163 while ($line=<$fh>) | |
164 { | |
165 if ( $line=~/^#/ ) { next; } | |
166 my @items = split(/\t/,$line); | |
167 chomp($items[-1]); | |
168 if ( $items[0] eq 'SN' ) | |
169 { | |
170 $$opts{dat}{$items[1]} = splice(@items,2); | |
171 next; | |
172 } | |
173 push @{$$opts{dat}{$items[0]}}, [splice(@items,1)]; | |
174 } | |
175 close($fh); | |
176 | |
177 # Check sanity | |
178 if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} ) | |
179 { | |
180 error("Sanity check failed: no sequences found by bamcheck??\n"); | |
181 } | |
182 } | |
183 | |
184 sub older_than | |
185 { | |
186 my ($opts,$version) = @_; | |
187 my ($year,$month,$day) = split(/-/,$version); | |
188 $version = $$opts{dat}{version}; | |
189 if ( !($version=~/\((\d+)-(\d+)-(\d+)\)$/) ) { return 1; } | |
190 if ( $1<$year ) { return 1; } | |
191 elsif ( $1>$year ) { return 0; } | |
192 if ( $2<$month ) { return 1; } | |
193 elsif ( $2>$month ) { return 0; } | |
194 if ( $3<$day ) { return 1; } | |
195 return 0; | |
196 } | |
197 | |
198 sub get_defaults | |
199 { | |
200 my ($opts,$img_fname,%args) = @_; | |
201 | |
202 if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); } | |
203 | |
204 # Determine the gnuplot script file name | |
205 my $gp_file = $img_fname; | |
206 $gp_file =~ s{\.[^.]+$}{.gp}; | |
207 if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; } | |
208 | |
209 # Determine the default title: | |
210 # 5446_6/5446_6.bam.bc.gp -> 5446_6 | |
211 # test.aaa.png -> test.aaa | |
212 if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$img_fname]\n"); } | |
213 my $title = $1; | |
214 | |
215 my $dir = $gp_file; | |
216 $dir =~ s{/[^/]+$}{}; | |
217 if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; } | |
218 | |
219 my $wh = exists($args{wh}) ? $args{wh} : '600,400'; | |
220 | |
221 open(my $fh,'>',$gp_file) or error("$gp_file: $!"); | |
222 return { | |
223 title => $title, | |
224 gp => $gp_file, | |
225 img => $img_fname, | |
226 fh => $fh, | |
227 terminal => qq[set terminal png size $wh truecolor], | |
228 grid => 'set grid xtics ytics y2tics back lc rgb "#cccccc"', | |
229 }; | |
230 } | |
231 | |
232 sub percentile | |
233 { | |
234 my ($p,@vals) = @_; | |
235 my $N = 0; | |
236 for my $val (@vals) { $N += $val; } | |
237 my $n = $p*($N+1)/100.; | |
238 my $k = int($n); | |
239 my $d = $n-$k; | |
240 if ( $k<=0 ) { return 0; } | |
241 if ( $k>=$N ) { return scalar @vals-1; } | |
242 my $cnt; | |
243 for (my $i=0; $i<@vals; $i++) | |
244 { | |
245 $cnt += $vals[$i]; | |
246 if ( $cnt>=$k ) { return $i; } | |
247 } | |
248 error("FIXME: this should not happen [percentile]\n"); | |
249 } | |
250 | |
251 sub plot_qualities | |
252 { | |
253 my ($opts) = @_; | |
254 | |
255 if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; } | |
256 | |
257 my $yrange = @{$$opts{dat}{FFQ}[0]} > 50 ? @{$$opts{dat}{FFQ}[0]} : 50; | |
258 my $is_paired = $$opts{dat}{'is paired:'}; | |
259 | |
260 # Average quality per cycle, forward and reverse reads in one plot | |
261 my $args = get_defaults($opts,"$$opts{prefix}quals.png"); | |
262 my $fh = $$args{fh}; | |
263 print $fh qq[ | |
264 $$args{terminal} | |
265 set output "$$args{img}" | |
266 $$args{grid} | |
267 set ylabel "Average Quality" | |
268 set xlabel "Cycle" | |
269 set yrange [0:$yrange] | |
270 set title "$$args{title}" | |
271 plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[ | |
272 ]; | |
273 my (@fp75,@fp50,@fmean); | |
274 my (@lp75,@lp50,@lmean); | |
275 my ($fmax,$fmax_qual,$fmax_cycle); | |
276 my ($lmax,$lmax_qual,$lmax_cycle); | |
277 for my $cycle (@{$$opts{dat}{FFQ}}) | |
278 { | |
279 my $sum=0; my $n=0; | |
280 for (my $iqual=1; $iqual<@$cycle; $iqual++) | |
281 { | |
282 $sum += $$cycle[$iqual]*$iqual; | |
283 $n += $$cycle[$iqual]; | |
284 if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; } | |
285 } | |
286 my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); | |
287 my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); | |
288 my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); | |
289 if ( !$n ) { next; } | |
290 push @fp75, "$$cycle[0]\t$p25\t$p75\n"; | |
291 push @fp50, "$$cycle[0]\t$p50\n"; | |
292 push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; | |
293 printf $fh $fmean[-1]; | |
294 } | |
295 print $fh "end\n"; | |
296 if ( $is_paired ) | |
297 { | |
298 for my $cycle (@{$$opts{dat}{LFQ}}) | |
299 { | |
300 my $sum=0; my $n=0; | |
301 for (my $iqual=1; $iqual<@$cycle; $iqual++) | |
302 { | |
303 $sum += $$cycle[$iqual]*$iqual; | |
304 $n += $$cycle[$iqual]; | |
305 if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; } | |
306 } | |
307 my $p25 = percentile(25,(@$cycle)[1..$#$cycle]); | |
308 my $p50 = percentile(50,(@$cycle)[1..$#$cycle]); | |
309 my $p75 = percentile(75,(@$cycle)[1..$#$cycle]); | |
310 if ( !$n ) { next; } | |
311 push @lp75, "$$cycle[0]\t$p25\t$p75\n"; | |
312 push @lp50, "$$cycle[0]\t$p50\n"; | |
313 push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n; | |
314 printf $fh $lmean[-1]; | |
315 } | |
316 print $fh "end\n"; | |
317 } | |
318 close($fh); | |
319 plot($$args{gp}); | |
320 | |
321 | |
322 | |
323 # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots | |
324 $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500'); | |
325 $fh = $$args{fh}; | |
326 print $fh qq[ | |
327 $$args{terminal} | |
328 set output "$$args{img}" | |
329 $$args{grid} | |
330 set multiplot | |
331 set rmargin 0 | |
332 set lmargin 0 | |
333 set tmargin 0 | |
334 set bmargin 0 | |
335 set origin 0.1,0.1 | |
336 set size 0.4,0.8 | |
337 set yrange [0:$yrange] | |
338 set ylabel "Quality" | |
339 set xlabel "Cycle (fwd reads)" | |
340 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean' | |
341 ]; | |
342 print $fh join('',@fp75),"end\n"; | |
343 print $fh join('',@fp50),"end\n"; | |
344 print $fh join('',@fmean),"end\n"; | |
345 if ( $is_paired ) | |
346 { | |
347 print $fh qq[ | |
348 set origin 0.55,0.1 | |
349 set size 0.4,0.8 | |
350 unset ytics | |
351 set y2tics mirror | |
352 set yrange [0:$yrange] | |
353 unset ylabel | |
354 set xlabel "Cycle (rev reads)" | |
355 set label "$$args{title}" at screen 0.5,0.95 center | |
356 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean' | |
357 ]; | |
358 print $fh join('',@lp75),"end\n"; | |
359 print $fh join('',@lp50),"end\n"; | |
360 print $fh join('',@lmean),"end\n"; | |
361 } | |
362 close($fh); | |
363 plot($$args{gp}); | |
364 | |
365 | |
366 | |
367 # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve | |
368 $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600'); | |
369 $fh = $$args{fh}; | |
370 my $nquals = @{$$opts{dat}{FFQ}[0]}-1; | |
371 my $ncycles = @{$$opts{dat}{FFQ}}; | |
372 print $fh qq[ | |
373 $$args{terminal} | |
374 set output "$$args{img}" | |
375 $$args{grid} | |
376 set multiplot | |
377 set rmargin 0 | |
378 set lmargin 0 | |
379 set tmargin 0 | |
380 set bmargin 0 | |
381 set origin 0.15,0.52 | |
382 set size 0.8,0.4 | |
383 set title "$$args{title}" | |
384 set ylabel "Frequency (fwd reads)" | |
385 set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax | |
386 unset xlabel | |
387 set xrange [0:$nquals] | |
388 set format x "" | |
389 ]; | |
390 my @plots; | |
391 for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] } | |
392 print $fh "plot ", join(",", @plots), "\n"; | |
393 for my $cycle (@{$$opts{dat}{FFQ}}) | |
394 { | |
395 for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; } | |
396 print $fh "end\n"; | |
397 } | |
398 if ( $is_paired ) | |
399 { | |
400 print $fh qq[ | |
401 set origin 0.15,0.1 | |
402 set size 0.8,0.4 | |
403 unset title | |
404 unset format | |
405 set xtics | |
406 set xlabel "Quality" | |
407 unset label | |
408 set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax | |
409 set ylabel "Frequency (rev reads)" | |
410 ]; | |
411 print $fh "plot ", join(",", @plots), "\n"; | |
412 for my $cycle (@{$$opts{dat}{LFQ}}) | |
413 { | |
414 for (my $iqual=1; $iqual<$nquals; $iqual++) | |
415 { | |
416 print $fh "$iqual\t$$cycle[$iqual]\n"; | |
417 } | |
418 print $fh "end\n"; | |
419 } | |
420 } | |
421 close($fh); | |
422 plot($$args{gp}); | |
423 | |
424 | |
425 # Heatmap qualitites | |
426 $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500'); | |
427 $fh = $$args{fh}; | |
428 my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax; | |
429 my @ytics; | |
430 for my $cycle (@{$$opts{dat}{FFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } | |
431 my $ytics = join(',', @ytics); | |
432 print $fh qq[ | |
433 $$args{terminal} | |
434 set output "$$args{img}" | |
435 unset key | |
436 unset colorbox | |
437 set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1) | |
438 set cbrange [0:$max] | |
439 set yrange [0:$ncycles] | |
440 set xrange [0:$nquals] | |
441 set view map | |
442 set multiplot | |
443 set rmargin 0 | |
444 set lmargin 0 | |
445 set tmargin 0 | |
446 set bmargin 0 | |
447 set origin 0,0.46 | |
448 set size 0.95,0.6 | |
449 set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles | |
450 set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black" | |
451 set ylabel "Cycle (fwd reads)" offset character -1,0 | |
452 unset ytics | |
453 set ytics ($ytics) | |
454 unset xtics | |
455 set title "$$args{title}" | |
456 splot '-' matrix with image | |
457 ]; | |
458 for my $cycle (@{$$opts{dat}{FFQ}}) | |
459 { | |
460 for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } | |
461 print $fh "\n"; | |
462 } | |
463 print $fh "end\nend\n"; | |
464 @ytics = (); | |
465 for my $cycle (@{$$opts{dat}{LFQ}}) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } } | |
466 $ytics = join(',', @ytics); | |
467 print $fh qq[ | |
468 set origin 0,0.03 | |
469 set size 0.95,0.6 | |
470 set ylabel "Cycle (rev reads)" offset character -1,0 | |
471 set xlabel "Base Quality" | |
472 unset title | |
473 unset ytics | |
474 set ytics ($ytics) | |
475 set xrange [0:$nquals] | |
476 set xtics | |
477 set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812 | |
478 set cblabel "Number of bases" | |
479 splot '-' matrix with image | |
480 ]; | |
481 for my $cycle (@{$$opts{dat}{LFQ}}) | |
482 { | |
483 for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; } | |
484 print $fh "\n"; | |
485 } | |
486 print $fh "end\nend\n"; | |
487 close($fh); | |
488 plot($$args{gp}); | |
489 } | |
490 | |
491 | |
492 sub plot_acgt_cycles | |
493 { | |
494 my ($opts) = @_; | |
495 | |
496 if ( !exists($$opts{dat}{GCC}) or !@{$$opts{dat}{GCC}} ) { return; } | |
497 | |
498 my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png"); | |
499 my $fh = $$args{fh}; | |
500 print $fh qq[ | |
501 $$args{terminal} | |
502 set output "$$args{img}" | |
503 $$args{grid} | |
504 set style line 1 linecolor rgb "green" | |
505 set style line 2 linecolor rgb "red" | |
506 set style line 3 linecolor rgb "black" | |
507 set style line 4 linecolor rgb "blue" | |
508 set style increment user | |
509 set ylabel "Base content [%]" | |
510 set xlabel "Read Cycle" | |
511 set yrange [0:100] | |
512 set title "$$args{title}" | |
513 plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T' | |
514 ]; | |
515 for my $base (1..4) | |
516 { | |
517 for my $cycle (@{$$opts{dat}{GCC}}) | |
518 { | |
519 print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n"; | |
520 } | |
521 print $fh "end\n"; | |
522 } | |
523 close($fh); | |
524 plot($$args{gp}); | |
525 } | |
526 | |
527 | |
528 sub plot_gc | |
529 { | |
530 my ($opts) = @_; | |
531 | |
532 my $is_paired = $$opts{dat}{'is paired:'}; | |
533 my $args = get_defaults($opts,"$$opts{prefix}gc-content.png"); | |
534 my $fh = $$args{fh}; | |
535 my ($gcl_max,$gcf_max,$lmax,$fmax); | |
536 for my $gc (@{$$opts{dat}{GCF}}) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } } | |
537 for my $gc (@{$$opts{dat}{GCL}}) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } } | |
538 my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax; | |
539 print $fh qq[ | |
540 $$args{terminal} | |
541 set output "$$args{img}" | |
542 $$args{grid} | |
543 set title "$$args{title}" | |
544 set ylabel "Normalized Frequency" | |
545 set xlabel "GC Content [%]" | |
546 set yrange [0:1.1] | |
547 set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0 | |
548 plot ] | |
549 . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '') | |
550 . q['-' smooth csplines with lines lc 1 title 'First fragments' ] | |
551 . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '') | |
552 . q[ | |
553 ]; | |
554 if ( exists($$opts{ref_stats}) ) | |
555 { | |
556 open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!"); | |
557 while (my $line=<$ref>) { print $fh $line } | |
558 close($ref); | |
559 print $fh "end\n"; | |
560 } | |
561 for my $cycle (@{$$opts{dat}{GCF}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; } | |
562 print $fh "end\n"; | |
563 if ( $is_paired ) | |
564 { | |
565 for my $cycle (@{$$opts{dat}{GCL}}) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; } | |
566 print $fh "end\n"; | |
567 } | |
568 close($fh); | |
569 plot($$args{gp}); | |
570 } | |
571 | |
572 | |
573 sub plot_gc_depth | |
574 { | |
575 my ($opts) = @_; | |
576 | |
577 if ( !exists($$opts{dat}{GCD}) or !@{$$opts{dat}{GCD}} ) { return; } | |
578 | |
579 # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics. | |
580 my @tics = ( {gc=>30},{gc=>40},{gc=>50} ); | |
581 for my $gc (@{$$opts{dat}{GCD}}) | |
582 { | |
583 for my $tic (@tics) | |
584 { | |
585 my $diff = abs($$gc[0]-$$tic{gc}); | |
586 if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; } | |
587 } | |
588 } | |
589 | |
590 my @x2tics; | |
591 for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; } | |
592 my $x2tics = join(',',@x2tics); | |
593 | |
594 my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500'); | |
595 my $fh = $$args{fh}; | |
596 print $fh qq[ | |
597 $$args{terminal} | |
598 set output "$$args{img}" | |
599 $$args{grid} | |
600 set ylabel "Mapped depth" | |
601 set xlabel "Percentile of mapped sequence ordered by GC content" | |
602 set x2label "GC Content [%]" | |
603 set title "$$args{title}" | |
604 set x2tics ($x2tics) | |
605 set xtics nomirror | |
606 set xrange [0.1:99.9] | |
607 | |
608 plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\ | |
609 '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\ | |
610 '-' using 1:2 with lines lc rgb "#0084ff" t 'Median' | |
611 ]; | |
612 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n"; | |
613 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n"; | |
614 for my $gc (@{$$opts{dat}{GCD}}) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n"; | |
615 close($fh); | |
616 plot($$args{gp}); | |
617 } | |
618 | |
619 | |
620 sub plot_isize | |
621 { | |
622 my ($opts) = @_; | |
623 | |
624 if ( !$$opts{dat}{'is paired:'} or !exists($$opts{dat}{IS}) or !@{$$opts{dat}{IS}} ) { return; } | |
625 | |
626 my ($isize_max,$isize_cnt); | |
627 for my $isize (@{$$opts{dat}{IS}}) | |
628 { | |
629 if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; } | |
630 } | |
631 | |
632 my $args = get_defaults($opts,"$$opts{prefix}insert-size.png"); | |
633 my $fh = $$args{fh}; | |
634 print $fh qq[ | |
635 $$args{terminal} | |
636 set output "$$args{img}" | |
637 $$args{grid} | |
638 set rmargin 5 | |
639 set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt | |
640 set ylabel "Number of pairs" | |
641 set xlabel "Insert Size" | |
642 set title "$$args{title}" | |
643 plot \\ | |
644 '-' with lines lc rgb 'black' title 'All pairs', \\ | |
645 '-' with lines title 'Inward', \\ | |
646 '-' with lines title 'Outward', \\ | |
647 '-' with lines title 'Other' | |
648 ]; | |
649 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n"; | |
650 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n"; | |
651 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n"; | |
652 for my $isize (@{$$opts{dat}{IS}}) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n"; | |
653 close($fh); | |
654 plot($$args{gp}); | |
655 } | |
656 | |
657 | |
658 sub plot_coverage | |
659 { | |
660 my ($opts) = @_; | |
661 | |
662 if ( !exists($$opts{dat}{COV}) or !@{$$opts{dat}{COV}} ) { return; } | |
663 | |
664 my @vals; | |
665 for my $cov (@{$$opts{dat}{COV}}) { push @vals,$$cov[2]; } | |
666 my $i = percentile(99.8,@vals); | |
667 my $p99 = $$opts{dat}{COV}[$i][1]; | |
668 | |
669 my $args = get_defaults($opts,"$$opts{prefix}coverage.png"); | |
670 my $fh = $$args{fh}; | |
671 print $fh qq[ | |
672 $$args{terminal} | |
673 set output "$$args{img}" | |
674 $$args{grid} | |
675 set ylabel "Number of mapped bases" | |
676 set xlabel "Coverage" | |
677 set style fill solid border -1 | |
678 set title "$$args{title}" | |
679 set xrange [:$p99] | |
680 plot '-' with lines notitle | |
681 ]; | |
682 for my $cov (@{$$opts{dat}{COV}}) | |
683 { | |
684 if ( $$cov[2]==0 ) { next; } | |
685 print $fh "$$cov[1]\t$$cov[2]\n"; | |
686 } | |
687 print $fh "end\n"; | |
688 close($fh); | |
689 plot($$args{gp}); | |
690 } | |
691 | |
692 | |
693 sub plot_mismatches_per_cycle | |
694 { | |
695 my ($opts) = @_; | |
696 | |
697 if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; } | |
698 if ( older_than($opts,'2012-02-06') ) { plot_mismatches_per_cycle_old($opts); } | |
699 | |
700 my $nquals = @{$$opts{dat}{MPC}[0]} - 2; | |
701 my $ncycles = @{$$opts{dat}{MPC}}; | |
702 my ($style,$with); | |
703 if ( $ncycles>100 ) { $style = ''; $with = 'w l'; } | |
704 else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; } | |
705 | |
706 my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); | |
707 my $fh = $$args{fh}; | |
708 print $fh qq[ | |
709 $$args{terminal} | |
710 set output "$$args{img}" | |
711 $$args{grid} | |
712 set style line 1 linecolor rgb "#e40000" | |
713 set style line 2 linecolor rgb "#ff9f00" | |
714 set style line 3 linecolor rgb "#eeee00" | |
715 set style line 4 linecolor rgb "#4ebd68" | |
716 set style line 5 linecolor rgb "#0061ff" | |
717 set style increment user | |
718 set key left top | |
719 $style | |
720 set ylabel "Number of mismatches" | |
721 set xlabel "Read Cycle" | |
722 set style fill solid border -1 | |
723 set title "$$args{title}" | |
724 set xrange [-1:$ncycles] | |
725 plot '-' $with ti 'Base Quality>30', \\ | |
726 '-' $with ti '30>=Q>20', \\ | |
727 '-' $with ti '20>=Q>10', \\ | |
728 '-' $with ti '10>=Q', \\ | |
729 '-' $with ti "N's" | |
730 ]; | |
731 for my $cycle (@{$$opts{dat}{MPC}}) | |
732 { | |
733 my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; } | |
734 print $fh "$sum\n"; | |
735 } | |
736 print $fh "end\n"; | |
737 for my $cycle (@{$$opts{dat}{MPC}}) | |
738 { | |
739 my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; } | |
740 print $fh "$sum\n"; | |
741 } | |
742 print $fh "end\n"; | |
743 for my $cycle (@{$$opts{dat}{MPC}}) | |
744 { | |
745 my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; } | |
746 print $fh "$sum\n"; | |
747 } | |
748 print $fh "end\n"; | |
749 for my $cycle (@{$$opts{dat}{MPC}}) | |
750 { | |
751 my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; } | |
752 print $fh "$sum\n"; | |
753 } | |
754 print $fh "end\n"; | |
755 for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; } | |
756 print $fh "end\n"; | |
757 close($fh); | |
758 plot($$args{gp}); | |
759 } | |
760 | |
761 sub plot_indel_dist | |
762 { | |
763 my ($opts) = @_; | |
764 | |
765 if ( !exists($$opts{dat}{ID}) or !@{$$opts{dat}{ID}} ) { return; } | |
766 | |
767 my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png"); | |
768 my $fh = $$args{fh}; | |
769 print $fh qq[ | |
770 $$args{terminal} | |
771 set output "$$args{img}" | |
772 $$args{grid} | |
773 set style line 1 linetype 1 linecolor rgb "red" | |
774 set style line 2 linetype 2 linecolor rgb "black" | |
775 set style line 3 linetype 3 linecolor rgb "green" | |
776 set style increment user | |
777 set ylabel "Indel count [log]" | |
778 set xlabel "Indel length" | |
779 set y2label "Insertions/Deletions ratio" | |
780 set log y | |
781 set y2tics nomirror | |
782 set ytics nomirror | |
783 set title "$$args{title}" | |
784 plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio" | |
785 ]; | |
786 for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; | |
787 for my $len (@{$$opts{dat}{ID}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; | |
788 for my $len (@{$$opts{dat}{ID}}) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n"; | |
789 close($fh); | |
790 plot($$args{gp}); | |
791 } | |
792 | |
793 sub plot_indel_cycles | |
794 { | |
795 my ($opts) = @_; | |
796 | |
797 if ( !exists($$opts{dat}{IC}) or !@{$$opts{dat}{IC}} ) { return; } | |
798 | |
799 my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png"); | |
800 my $fh = $$args{fh}; | |
801 print $fh qq[ | |
802 $$args{terminal} | |
803 set output "$$args{img}" | |
804 $$args{grid} | |
805 set style line 1 linetype 1 linecolor rgb "red" | |
806 set style line 2 linetype 2 linecolor rgb "black" | |
807 set style line 3 linetype 3 linecolor rgb "green" | |
808 set style line 4 linetype 4 linecolor rgb "blue" | |
809 set style increment user | |
810 set ylabel "Indel count" | |
811 set xlabel "Read Cycle" | |
812 set title "$$args{title}" | |
813 plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)' | |
814 ]; | |
815 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n"; | |
816 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n"; | |
817 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n"; | |
818 for my $len (@{$$opts{dat}{IC}}) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n"; | |
819 close($fh); | |
820 plot($$args{gp}); | |
821 } | |
822 | |
823 | |
824 | |
825 | |
826 | |
827 | |
828 | |
829 sub has_values | |
830 { | |
831 my ($opts,@tags) = @_; | |
832 for my $tag (@tags) | |
833 { | |
834 my (@lines) = `cat $$opts{bamcheck} | grep ^$tag | wc -l`; | |
835 chomp($lines[0]); | |
836 if ( $lines[0]<2 ) { return 0; } | |
837 } | |
838 return 1; | |
839 } | |
840 | |
841 sub plot_mismatches_per_cycle_old | |
842 { | |
843 my ($opts) = @_; | |
844 | |
845 my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png"); | |
846 my ($nquals) = `grep ^MPC $$opts{bamcheck} | awk '\$2==1' | sed 's,\\t,\\n,g' | wc -l`; | |
847 my ($ncycles) = `grep ^MPC $$opts{bamcheck} | wc -l`; | |
848 chomp($nquals); | |
849 chomp($ncycles); | |
850 $nquals--; | |
851 $ncycles--; | |
852 my @gr0_15 = (2..17); | |
853 my @gr16_30 = (18..32); | |
854 my @gr31_n = (33..$nquals); | |
855 my $gr0_15 = '$'. join('+$',@gr0_15); | |
856 my $gr16_30 = '$'. join('+$',@gr16_30); | |
857 my $gr31_n = '$'. join('+$',@gr31_n); | |
858 | |
859 open(my $fh,'>',$$args{gp}) or error("$$args{gp}: $!"); | |
860 print $fh q[ | |
861 set terminal png size 600,400 truecolor font "DejaVuSansMono,9" | |
862 set output "] . $$args{img} . q[" | |
863 | |
864 set key left top | |
865 set style data histogram | |
866 set style histogram rowstacked | |
867 | |
868 set grid back lc rgb "#aaaaaa" | |
869 set ylabel "Number of mismatches" | |
870 set xlabel "Read Cycle" | |
871 set style fill solid border -1 | |
872 set title "] . $$args{title} . qq[" | |
873 set xrange [-1:$ncycles] | |
874 | |
875 plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q' | |
876 ]; | |
877 close($fh); | |
878 | |
879 plot($$args{gp}); | |
880 } | |
881 | |
882 |