0
|
1 #!/usr/bin/env perl
|
|
2 #
|
|
3 # Author: petr.danecek@sanger
|
|
4 #
|
|
5
|
|
6 use strict;
|
|
7 use warnings;
|
|
8 use Carp;
|
|
9
|
|
# Script body: read the command line, load the bamcheck statistics, then run
# every plotting routine in a fixed order with the same options hash.
my $opts = parse_params();
parse_bamcheck($opts);

for my $make_plot (
    \&plot_qualities,
    \&plot_acgt_cycles,
    \&plot_gc,
    \&plot_gc_depth,
    \&plot_isize,
    \&plot_coverage,
    \&plot_mismatches_per_cycle,
    \&plot_indel_dist,
    \&plot_indel_cycles,
)
{
    $make_plot->($opts);
}

exit;
|
|
23
|
|
24 #--------------------------------
|
|
25
|
|
# Abort the script.
#   error(@msg) - die with the message and a stack trace (Carp::confess).
#   error()     - die with the usage text.
sub error
{
    my (@msg) = @_;

    confess @msg if scalar @msg;

    my @usage = (
        "Usage: plot-bamcheck [OPTIONS] file.bam.bc\n",
        " plot-bamcheck -p outdir/ file.bam.bc\n",
        "Options:\n",
        " -k, --keep-files Do not remove temporary files.\n",
        " -p, --prefix <path> The output files prefix, add a slash to create new directory.\n",
        " -r, --ref-stats <file.fa.gc> Optional reference stats file with expected GC content (created with -s).\n",
        " -s, --do-ref-stats <file.fa> Calculate reference sequence GC for later use with -r\n",
        " -t, --targets <file.tab> Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
        " -h, -?, --help This help message.\n",
        "\n",
    );
    die @usage;
}
|
|
42
|
|
43
|
|
# Parse @ARGV into an options hash and return a reference to it.
#
# Keys set: args (the command line as typed), keep_files, ref_stats,
# do_ref_stats, targets, prefix, bamcheck.  Any argument that is not an
# option must be an existing file and is taken as the bamcheck file.
#
# With -s the reference statistics are printed and the script exits here.
# Otherwise both a bamcheck file and -p are required.  A prefix ending in
# "/" is created as a directory; any other prefix gets a trailing "-".
sub parse_params
{
    $0 =~ s{^.+/}{};
    my $opts = { args=>join(' ',$0,@ARGV) };

    # Options that consume the next command-line word, mapped to their key.
    my %takes_value = (
        '-r' => 'ref_stats',     '--ref-stats'    => 'ref_stats',
        '-s' => 'do_ref_stats',  '--do-ref-stats' => 'do_ref_stats',
        '-t' => 'targets',       '--targets'      => 'targets',
        '-p' => 'prefix',        '--prefix'       => 'prefix',
    );

    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
        if ( exists($takes_value{$arg}) )
        {
            # Without this check a trailing "-p" would store undef and fail
            # later with an unrelated message.
            if ( !@ARGV ) { error("Missing value for the option \"$arg\". Run -h for help.\n"); }
            $$opts{$takes_value{$arg}} = shift(@ARGV);
            next;
        }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( -e $arg ) { $$opts{bamcheck}=$arg; next; }
        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
    }
    if ( exists($$opts{do_ref_stats}) ) { do_ref_stats($opts); exit; }
    if ( !exists($$opts{bamcheck}) ) { error("No bamcheck file?\n") }
    if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
    if ( $$opts{prefix}=~m{/$} )
    {
        # List-form system() bypasses the shell, so spaces or shell
        # metacharacters in the prefix cannot be interpreted as commands.
        system('mkdir','-p','--',$$opts{prefix});
        if ( $? ) { error("Could not create the directory \"$$opts{prefix}\" (mkdir exit status $?)\n"); }
    }
    elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; }
    return $opts;
}
|
|
66
|
|
67
|
|
# Creates GC stats for either the whole reference or only on target regions
# for exome QC.
#
# Reads the FASTA file $$opts{do_ref_stats} and, optionally, the sorted
# target list $$opts{targets} (chr, from, to; 1-based, inclusive).  Only
# sequence lines of exactly 60 characters are counted.  Prints to STDOUT a
# two-column table: GC content in percent and its frequency normalized to
# the most frequent bin.  Lines with no G/C at all are left out of the table.
sub do_ref_stats
{
    my ($opts) = @_;

    # chr => flat list (from1,to1,from2,to2,...)
    my %targets = ();
    if ( exists($$opts{targets}) )
    {
        my ($prev_chr,$prev_pos);
        open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
        while (my $line=<$fh>)
        {
            if ( $line=~/^#/ ) { next; }
            if ( !($line=~/\S/) ) { next; }     # tolerate blank lines
            chomp($line);
            my ($chr,$from,$to) = split(/\s+/,$line);
            if ( !defined $to ) { error("Could not parse the line in $$opts{targets}: $line\n"); }
            push @{$targets{$chr}}, $from,$to;
            if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
            if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
            $prev_pos = $from;
        }
        close($fh);
    }
    my $have_targets = scalar(keys %targets) ? 1 : 0;

    my $_len = 60;  # for now do only standard fasta's with 60 bases per line
    my %gc_counts = ();
    my ($skip_chr,$pos,$ireg,$regions);
    open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
    while (my $line=<$fh>)
    {
        if ( $line=~/^>/ )
        {
            if ( !$have_targets ) { next; }

            if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
            if ( !exists($targets{$1}) ) { $skip_chr=$1; next; }
            undef $skip_chr;
            $pos = 0;
            $ireg = 0;
            $regions = $targets{$1};
            # A header is never sequence: without this a header that happens
            # to be $_len characters long was counted and shifted $pos.
            next;
        }
        if ( defined $skip_chr ) { next; }

        # Only $_len sized lines are considered and no chopping for target regions.
        chomp($line);
        my $len = length($line);
        if ( $len != $_len ) { next; }

        if ( $have_targets )
        {
            # $pos is the number of bases consumed before this line; skip
            # regions that ended before it.
            while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; }
            $pos += $len;
            if ( $ireg==@$regions ) { next; }
            if ( $pos < $$regions[$ireg] ) { next; }
        }

        # tr/// with an empty replacement only counts the listed characters.
        my $gc_count = ($line =~ tr/gGcC//);
        $gc_counts{$gc_count}++;
    }
    close($fh);

    print "# Generated by $$opts{args}\n";
    print "# The columns are: GC content bin, normalized frequency\n";
    my $max;
    for my $count (values %gc_counts)
    {
        if ( !defined $max or $count>$max ) { $max=$count; }
    }
    for my $gc (sort {$a<=>$b} keys %gc_counts)
    {
        if ( $gc==0 ) { next; }
        printf "%f\t%f\n", $gc*100./$_len, $gc_counts{$gc}/$max;
    }
}
|
|
146
|
|
# Run gnuplot on the given script file and abort via error() when it exits
# with a non-zero status.
sub plot
{
    my ($cmdfile) = @_;
    my $cmd = "gnuplot $cmdfile";   # used for the error message only

    # List form: the file name is passed to gnuplot as a single argument
    # without shell interpretation.
    system('gnuplot',$cmdfile);
    if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
}
|
|
154
|
|
155
|
|
# Load the bamcheck file $$opts{bamcheck} into $$opts{dat}.
#
#   $$opts{dat}{version}  - the word following "bamcheck" on the first line
#   $$opts{dat}{<name>}   - for "SN" lines: the value in the third column
#   $$opts{dat}{<TAG>}    - for all other lines: list of rows, each row an
#                           array ref of the columns after the tag
#
# Aborts via error() when the first line is not a bamcheck header or when
# the "sequences:" summary number is missing or zero.
sub parse_bamcheck
{
    my ($opts) = @_;
    open(my $fh,'<',$$opts{bamcheck}) or error("$$opts{bamcheck}: $!");
    my $line = <$fh>;
    if ( !defined $line or !($line=~/^# This file was produced by bamcheck (\S+)/) )
    {
        error("Sanity check failed: was this file generated by bamcheck?");
    }
    $$opts{dat}{version} = $1;
    while ($line=<$fh>)
    {
        if ( $line=~/^#/ ) { next; }
        my @items = split(/\t/,$line);
        chomp($items[-1]);
        if ( $items[0] eq 'SN' )
        {
            # Take the value column explicitly.  splice() in scalar context
            # returns the LAST removed element, which is a trailing
            # "# comment" column when the line carries one.
            $$opts{dat}{$items[1]} = $items[2];
            next;
        }
        push @{$$opts{dat}{$items[0]}}, [splice(@items,1)];
    }
    close($fh);

    # Check sanity
    if ( !exists($$opts{dat}{'sequences:'}) or !$$opts{dat}{'sequences:'} )
    {
        error("Sanity check failed: no sequences found by bamcheck??\n");
    }
}
|
|
183
|
|
# Return 1 when the bamcheck file was produced by a version dated before
# $version (given as YYYY-MM-DD), 0 otherwise.  The file's date is taken from
# a trailing "(YYYY-MM-DD)" in $$opts{dat}{version}; when no such date is
# found the file is treated as older.
sub older_than
{
    my ($opts,$version) = @_;

    my @wanted = split(/-/,$version);
    my @found  = $$opts{dat}{version} =~ /\((\d+)-(\d+)-(\d+)\)$/;
    return 1 if !@found;

    # Compare year, then month, then day; the first difference decides.
    for my $part (0..2)
    {
        return 1 if $found[$part] < $wanted[$part];
        return 0 if $found[$part] > $wanted[$part];
    }
    return 0;
}
|
|
197
|
|
# Prepare one plot: derive the gnuplot script name from the image name,
# create the output directory if needed and open the script for writing.
#
# Arguments: $opts, the image file name (must end in .png) and optional
# named arguments; wh => 'W,H' sets the image size (default '600,400').
#
# Returns a hash ref with: title (bamcheck file name without directory and
# without the .bam/.bc suffixes), gp (script path), img (image path),
# fh (open write handle on the script), terminal and grid (gnuplot commands).
sub get_defaults
{
    my ($opts,$img_fname,%args) = @_;

    if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }

    # Determine the gnuplot script file name
    my $gp_file = $img_fname;
    $gp_file =~ s{\.[^.]+$}{.gp};
    if ( !($gp_file=~/\.gp$/) ) { $gp_file .= '.gp'; }

    # Determine the default title:
    #       5446_6/5446_6.bam.bc.gp -> 5446_6
    #       test.aaa.png -> test.aaa
    if ( !($$opts{bamcheck}=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) )
    {
        # The title comes from the bamcheck file name, so report that one.
        error("FIXME: Could not determine the title from [$$opts{bamcheck}]\n");
    }
    my $title = $1;

    my $dir = $gp_file;
    $dir =~ s{/[^/]+$}{};
    if ( $dir && $dir ne $gp_file )
    {
        # List-form system() keeps the path away from the shell.
        system('mkdir','-p','--',$dir);
        if ( $? ) { error("Could not create the directory \"$dir\" (mkdir exit status $?)\n"); }
    }

    my $wh = exists($args{wh}) ? $args{wh} : '600,400';

    open(my $fh,'>',$gp_file) or error("$gp_file: $!");
    return {
        title    => $title,
        gp       => $gp_file,
        img      => $img_fname,
        fh       => $fh,
        terminal => qq[set terminal png size $wh truecolor],
        grid     => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
    };
}
|
|
231
|
|
# Given a percentile $p (0-100) and a list of counts (a histogram), return
# the index of the bin in which the running total first reaches the rank
# int($p*(N+1)/100), N being the sum of all counts.  A rank of zero or less
# gives index 0; a rank of N or more gives the last index.
sub percentile
{
    my ($p,@vals) = @_;

    my $total = 0;
    $total += $_ for @vals;

    my $rank = int($p*($total+1)/100.);
    return 0 if $rank<=0;
    return $#vals if $rank>=$total;

    my $seen = 0;
    for my $idx (0..$#vals)
    {
        $seen += $vals[$idx];
        return $idx if $seen>=$rank;
    }
    error("FIXME: this should not happen [percentile]\n");
}
|
|
250
|
|
# Helper of plot_qualities: summarise one quality table (rows of
# [cycle, count, count, ...]).  Returns a hash ref with
#   p25_75, median, mean - lists of ready-to-print gnuplot data lines
#   max, max_qual, max_cycle - the largest single count and where it occurs
# Rows whose counts sum to zero contribute no data lines.
sub _quality_summary
{
    my ($rows) = @_;

    my (@p25_75,@median,@mean);
    my ($max,$max_qual,$max_cycle);
    for my $cycle (@$rows)
    {
        my $sum=0; my $n=0;
        for (my $iqual=1; $iqual<@$cycle; $iqual++)
        {
            $sum += $$cycle[$iqual]*$iqual;
            $n   += $$cycle[$iqual];
            if ( !defined $max or $max<$$cycle[$iqual] ) { $max=$$cycle[$iqual]; $max_qual=$iqual; $max_cycle=$$cycle[0]; }
        }
        if ( !$n ) { next; }
        my @counts = (@$cycle)[1..$#$cycle];
        my $p25 = percentile(25,@counts);
        my $p50 = percentile(50,@counts);
        my $p75 = percentile(75,@counts);
        push @p25_75, "$$cycle[0]\t$p25\t$p75\n";
        push @median, "$$cycle[0]\t$p50\n";
        push @mean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
    }
    return {
        p25_75 => \@p25_75, median => \@median, mean => \@mean,
        max => $max, max_qual => $max_qual, max_cycle => $max_cycle,
    };
}

# Draw the four base-quality plots from the FFQ (forward) and LFQ (reverse)
# tables: quals.png (mean per cycle), quals2.png (mean, median and the
# 25-75th percentile band), quals3.png (one distribution curve per cycle)
# and quals-hm.png (heat map).  Reverse-read panels of the first three are
# drawn only when 'is paired:' is set.  Does nothing without FFQ data.
sub plot_qualities
{
    my ($opts) = @_;

    if ( !exists($$opts{dat}{FFQ}) or !@{$$opts{dat}{FFQ}} ) { return; }

    my $ffq = $$opts{dat}{FFQ};
    my $lfq = $$opts{dat}{LFQ} || [];
    my $yrange = @{$$ffq[0]} > 50 ? scalar(@{$$ffq[0]}) : 50;
    my $is_paired = $$opts{dat}{'is paired:'};

    my $fwd = _quality_summary($ffq);
    my $rev = $is_paired ? _quality_summary($lfq) : { p25_75=>[], median=>[], mean=>[] };
    my ($fmax,$fmax_qual,$fmax_cycle) = @{$fwd}{qw(max max_qual max_cycle)};
    my ($lmax,$lmax_qual,$lmax_cycle) = @{$rev}{qw(max max_qual max_cycle)};

    # Average quality per cycle, forward and reverse reads in one plot
    my $args = get_defaults($opts,"$$opts{prefix}quals.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set ylabel "Average Quality"
        set xlabel "Cycle"
        set yrange [0:$yrange]
        set title "$$args{title}"
        plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
    ];
    # print, not printf: the data lines must never be used as a format.
    print $fh @{$$fwd{mean}}, "end\n";
    if ( $is_paired ) { print $fh @{$$rev{mean}}, "end\n"; }
    close($fh);
    plot($$args{gp});

    # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots
    $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>'700,500');
    $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set multiplot
        set rmargin 0
        set lmargin 0
        set tmargin 0
        set bmargin 0
        set origin 0.1,0.1
        set size 0.4,0.8
        set yrange [0:$yrange]
        set ylabel "Quality"
        set xlabel "Cycle (fwd reads)"
        plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
    ];
    print $fh join('',@{$$fwd{p25_75}}),"end\n";
    print $fh join('',@{$$fwd{median}}),"end\n";
    print $fh join('',@{$$fwd{mean}}),"end\n";
    if ( $is_paired )
    {
        print $fh qq[
            set origin 0.55,0.1
            set size 0.4,0.8
            unset ytics
            set y2tics mirror
            set yrange [0:$yrange]
            unset ylabel
            set xlabel "Cycle (rev reads)"
            set label "$$args{title}" at screen 0.5,0.95 center
            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
        ];
        print $fh join('',@{$$rev{p25_75}}),"end\n";
        print $fh join('',@{$$rev{median}}),"end\n";
        print $fh join('',@{$$rev{mean}}),"end\n";
    }
    close($fh);
    plot($$args{gp});

    # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
    $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>'600,600');
    $fh = $$args{fh};
    my $nquals = @{$$ffq[0]}-1;
    my $ncycles = @$ffq;
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set multiplot
        set rmargin 0
        set lmargin 0
        set tmargin 0
        set bmargin 0
        set origin 0.15,0.52
        set size 0.8,0.4
        set title "$$args{title}"
        set ylabel "Frequency (fwd reads)"
        set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
        unset xlabel
        set xrange [0:$nquals]
        set format x ""
    ];
    my @plots;
    for (my $i=0; $i<$ncycles; $i++) { push @plots, q['-' using 1:2 with lines t ''] }
    print $fh "plot ", join(",", @plots), "\n";
    # NOTE(review): the loops below stop at $nquals-1, so the last quality
    # column (index $nquals) is not plotted - confirm this is intended.
    for my $cycle (@$ffq)
    {
        for (my $iqual=1; $iqual<$nquals; $iqual++) { print $fh "$iqual\t$$cycle[$iqual]\n"; }
        print $fh "end\n";
    }
    if ( $is_paired )
    {
        print $fh qq[
            set origin 0.15,0.1
            set size 0.8,0.4
            unset title
            unset format
            set xtics
            set xlabel "Quality"
            unset label
            set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
            set ylabel "Frequency (rev reads)"
        ];
        print $fh "plot ", join(",", @plots), "\n";
        for my $cycle (@$lfq)
        {
            for (my $iqual=1; $iqual<$nquals; $iqual++)
            {
                print $fh "$iqual\t$$cycle[$iqual]\n";
            }
            print $fh "end\n";
        }
    }
    close($fh);
    plot($$args{gp});

    # Heatmap qualities
    $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
    $fh = $$args{fh};
    my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
    my @ytics;
    for my $cycle (@$ffq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
    my $ytics = join(',', @ytics);
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        unset key
        unset colorbox
        set palette defined (0 0 0 0, 1 0 0 1, 3 0 1 0, 4 1 0 0, 6 1 1 1)
        set cbrange [0:$max]
        set yrange [0:$ncycles]
        set xrange [0:$nquals]
        set view map
        set multiplot
        set rmargin 0
        set lmargin 0
        set tmargin 0
        set bmargin 0
        set origin 0,0.46
        set size 0.95,0.6
        set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
        set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
        set ylabel "Cycle (fwd reads)" offset character -1,0
        unset ytics
        set ytics ($ytics)
        unset xtics
        set title "$$args{title}"
        splot '-' matrix with image
    ];
    for my $cycle (@$ffq)
    {
        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
        print $fh "\n";
    }
    print $fh "end\nend\n";

    # The reverse-read panel needs at least one LFQ row: an splot with an
    # empty inline matrix has nothing to draw.
    if ( @$lfq )
    {
        @ytics = ();
        for my $cycle (@$lfq) { if ( $$cycle[0]%10==0 ) { push @ytics,qq["$$cycle[0]" $$cycle[0]]; } }
        $ytics = join(',', @ytics);
        print $fh qq[
            set origin 0,0.03
            set size 0.95,0.6
            set ylabel "Cycle (rev reads)" offset character -1,0
            set xlabel "Base Quality"
            unset title
            unset ytics
            set ytics ($ytics)
            set xrange [0:$nquals]
            set xtics
            set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812
            set cblabel "Number of bases"
            splot '-' matrix with image
        ];
        for my $cycle (@$lfq)
        {
            for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
            print $fh "\n";
        }
        print $fh "end\nend\n";
    }
    close($fh);
    plot($$args{gp});
}
|
|
490
|
|
491
|
|
# Plot the per-cycle base content from the GCC table into acgt-cycles.png:
# one line each for columns 1-4 (labelled A, C, G, T) against the cycle
# number plus one.  Does nothing when there is no GCC data.
sub plot_acgt_cycles
{
    my ($opts) = @_;

    return unless exists($$opts{dat}{GCC}) and @{$$opts{dat}{GCC}};

    my $rows = $$opts{dat}{GCC};
    my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linecolor rgb "green"
        set style line 2 linecolor rgb "red"
        set style line 3 linecolor rgb "black"
        set style line 4 linecolor rgb "blue"
        set style increment user
        set ylabel "Base content [%]"
        set xlabel "Read Cycle"
        set yrange [0:100]
        set title "$$args{title}"
        plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
    ];
    # One inline data set per base column.
    for my $column (1..4)
    {
        my $series = '';
        for my $row (@$rows)
        {
            $series .= ($$row[0]+1) . "\t" . $$row[$column] . "\n";
        }
        print $fh $series, "end\n";
    }
    close($fh);
    plot($$args{gp});
}
|
|
526
|
|
527
|
|
# Plot the GC content distribution of first fragments (GCF), of last
# fragments (GCL, when 'is paired:' is set) and, with -r, of the reference
# into gc-content.png.  Each read series is normalized to its own maximum
# and the GC value of the highest peak is printed as a label.
# Does nothing when there is no GCF data.
sub plot_gc
{
    my ($opts) = @_;

    # Same guard as the other plot routines: without GCF rows there is no
    # peak to label and nothing to normalize.
    if ( !exists($$opts{dat}{GCF}) or !@{$$opts{dat}{GCF}} ) { return; }

    my $is_paired = $$opts{dat}{'is paired:'};
    my $gcf = $$opts{dat}{GCF};
    my $gcl = $$opts{dat}{GCL} || [];

    my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
    my $fh = $$args{fh};
    my ($gcl_max,$gcf_max,$lmax,$fmax);
    for my $gc (@$gcf) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
    for my $gc (@$gcl) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
    my $gcmax = $is_paired && defined $gcl_max && $gcl_max > $gcf_max ? $lmax : $fmax;

    # A maximum of zero means every count is zero; divide by 1 instead so
    # the series stays at zero rather than dying on a division by zero.
    my $gcf_norm = $gcf_max ? $gcf_max : 1;
    my $gcl_norm = $gcl_max ? $gcl_max : 1;

    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set title "$$args{title}"
        set ylabel "Normalized Frequency"
        set xlabel "GC Content [%]"
        set yrange [0:1.1]
        set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
        plot ]
        . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
        . q['-' smooth csplines with lines lc 1 title 'First fragments' ]
        . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '')
        . q[
    ];
    if ( exists($$opts{ref_stats}) )
    {
        # The reference stats file is copied verbatim as inline data.
        open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
        while (my $line=<$ref>) { print $fh $line }
        close($ref);
        print $fh "end\n";
    }
    for my $cycle (@$gcf) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_norm; }
    print $fh "end\n";
    if ( $is_paired )
    {
        for my $cycle (@$gcl) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_norm; }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}
|
|
571
|
|
572
|
|
# Plot the GCD table into gc-depth.png: columns 2/6 and 3/5 as filled bands
# (labelled 10-90th and 25-75th percentile) and column 4 as the median line,
# all against column 1.  The upper x axis carries tics for 30, 40 and 50% GC
# placed at the column-1 value of the row whose column 0 is nearest to each.
# Does nothing when there is no GCD data.
sub plot_gc_depth
{
    my ($opts) = @_;

    return unless exists($$opts{dat}{GCD}) and @{$$opts{dat}{GCD}};

    my $rows = $$opts{dat}{GCD};

    # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
    my @marks = map { +{ gc=>$_ } } (30,40,50);
    for my $mark (@marks)
    {
        for my $row (@$rows)
        {
            my $dist = abs($$row[0]-$$mark{gc});
            next if exists($$mark{pr}) and !($dist<$$mark{diff});
            $$mark{pr}   = $$row[1];
            $$mark{diff} = $dist;
        }
    }
    my $x2tics = join(',', map { qq["$$_{gc}" $$_{pr}] } @marks);

    my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set ylabel "Mapped depth"
        set xlabel "Percentile of mapped sequence ordered by GC content"
        set x2label "GC Content [%]"
        set title "$$args{title}"
        set x2tics ($x2tics)
        set xtics nomirror
        set xrange [0.1:99.9]

        plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
        '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
        '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
    ];
    # One inline data set per plot clause, in the same order.
    for my $row (@$rows) { print $fh "$$row[1]\t$$row[2]\t$$row[6]\n"; }
    print $fh "end\n";
    for my $row (@$rows) { print $fh "$$row[1]\t$$row[3]\t$$row[5]\n"; }
    print $fh "end\n";
    for my $row (@$rows) { print $fh "$$row[1]\t$$row[4]\n"; }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}
|
|
618
|
|
619
|
|
# Plot the insert size table (IS) into insert-size.png: columns 1-4 against
# column 0, labelled all pairs, inward, outward and other.  The insert size
# with the highest column-1 count is printed as a label next to its peak.
# Does nothing for unpaired data or when there is no IS data.
sub plot_isize
{
    my ($opts) = @_;

    return if !$$opts{dat}{'is paired:'};
    return unless exists($$opts{dat}{IS}) and @{$$opts{dat}{IS}};

    my $rows = $$opts{dat}{IS};

    # Locate the peak; the first of several equal maxima wins.
    my ($isize_max,$isize_cnt);
    for my $row (@$rows)
    {
        next if defined $isize_max and !($isize_cnt<$$row[1]);
        ($isize_cnt,$isize_max) = ($$row[1],$$row[0]);
    }

    my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set rmargin 5
        set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
        set ylabel "Number of pairs"
        set xlabel "Insert Size"
        set title "$$args{title}"
        plot \\
        '-' with lines lc rgb 'black' title 'All pairs', \\
        '-' with lines title 'Inward', \\
        '-' with lines title 'Outward', \\
        '-' with lines title 'Other'
    ];
    # One inline data set per plot clause: columns 1..4 in turn.
    for my $column (1..4)
    {
        for my $row (@$rows) { print $fh "$$row[0]\t$$row[$column]\n"; }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}
|
|
656
|
|
657
|
|
# Plot the coverage table (COV) into coverage.png: column 2 against column 1,
# leaving out rows whose column 2 is zero.  The x axis is cut at the
# column-1 value of the row picked by percentile(99.8) over column 2.
# Does nothing when there is no COV data.
sub plot_coverage
{
    my ($opts) = @_;

    return unless exists($$opts{dat}{COV}) and @{$$opts{dat}{COV}};

    my $rows = $$opts{dat}{COV};
    my $cut  = percentile(99.8, map { $$_[2] } @$rows);
    my $p99  = $$rows[$cut][1];

    my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set ylabel "Number of mapped bases"
        set xlabel "Coverage"
        set style fill solid border -1
        set title "$$args{title}"
        set xrange [:$p99]
        plot '-' with lines notitle
    ];
    my $series = '';
    for my $row (@$rows)
    {
        next if $$row[2]==0;
        $series .= "$$row[1]\t$$row[2]\n";
    }
    print $fh $series, "end\n";
    close($fh);
    plot($$args{gp});
}
|
|
691
|
|
692
|
|
# Plot the mismatches-per-cycle table (MPC) into mism-per-cycle.png.
#
# Each row is [cycle, N count, per-quality counts...].  The per-quality
# columns are summed into four buckets (columns 32 and up, 22-31, 12-21 and
# 2-11) and drawn together with the N column, as a stacked histogram for up
# to 100 cycles and as lines above that.  Files written by bamcheck before
# 2012-02-06 are handed to plot_mismatches_per_cycle_old() instead.
# Does nothing when there is no MPC data.
sub plot_mismatches_per_cycle
{
    my ($opts) = @_;

    if ( !exists($$opts{dat}{MPC}) or !@{$$opts{dat}{MPC}} ) { return; }
    if ( older_than($opts,'2012-02-06') )
    {
        plot_mismatches_per_cycle_old($opts);
        # Both routines write the same .gp/.png files: without returning
        # here the old-format plot was immediately overwritten.
        return;
    }

    my $ncycles = @{$$opts{dat}{MPC}};
    my ($style,$with);
    if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
    else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }

    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linecolor rgb "#e40000"
        set style line 2 linecolor rgb "#ff9f00"
        set style line 3 linecolor rgb "#eeee00"
        set style line 4 linecolor rgb "#4ebd68"
        set style line 5 linecolor rgb "#0061ff"
        set style increment user
        set key left top
        $style
        set ylabel "Number of mismatches"
        set xlabel "Read Cycle"
        set style fill solid border -1
        set title "$$args{title}"
        set xrange [-1:$ncycles]
        plot '-' $with ti 'Base Quality>30', \\
        '-' $with ti '30>=Q>20', \\
        '-' $with ti '20>=Q>10', \\
        '-' $with ti '10>=Q', \\
        '-' $with ti "N's"
    ];

    # Column ranges of the four quality buckets, in plot order.  The top
    # bucket starts at 32: starting at 31 counted that column twice, once
    # here and once in the 22-31 bucket.  undef means "to the last column".
    my @buckets = ( [32,undef], [22,31], [12,21], [2,11] );
    for my $bucket (@buckets)
    {
        for my $cycle (@{$$opts{dat}{MPC}})
        {
            my ($from,$to) = @$bucket;
            # Clamp to the row length so short rows give 0, not undef.
            if ( !defined $to or $to>$#$cycle ) { $to = $#$cycle; }
            my $sum = 0;
            for my $idx ($from..$to) { $sum += $$cycle[$idx]; }
            print $fh "$sum\n";
        }
        print $fh "end\n";
    }
    for my $cycle (@{$$opts{dat}{MPC}}) { print $fh "$$cycle[1]\n"; }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}
|
|
760
|
|
# Plot the indel length table (ID) into indel-dist.png: columns 1 and 2
# (labelled insertions and deletions) on a logarithmic y axis and their
# ratio on the second y axis.  The ratio is 0 where column 2 is zero.
# Does nothing when there is no ID data.
sub plot_indel_dist
{
    my ($opts) = @_;

    return unless exists($$opts{dat}{ID}) and @{$$opts{dat}{ID}};

    my $rows = $$opts{dat}{ID};
    my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1 linecolor rgb "red"
        set style line 2 linetype 2 linecolor rgb "black"
        set style line 3 linetype 3 linecolor rgb "green"
        set style increment user
        set ylabel "Indel count [log]"
        set xlabel "Indel length"
        set y2label "Insertions/Deletions ratio"
        set log y
        set y2tics nomirror
        set ytics nomirror
        set title "$$args{title}"
        plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
    ];
    for my $column (1,2)
    {
        for my $row (@$rows) { print $fh "$$row[0]\t$$row[$column]\n"; }
        print $fh "end\n";
    }
    for my $row (@$rows)
    {
        my $ratio = $$row[2] ? $$row[1]/$$row[2] : 0;
        printf $fh "%d\t%f\n", $$row[0],$ratio;
    }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}
|
|
792
|
|
# Plot the indels-per-cycle table (IC) into indel-cycles.png: columns 1-4
# against column 0, labelled insertions (fwd), insertions (rev),
# deletions (fwd) and deletions (rev).  Does nothing when there is no IC data.
sub plot_indel_cycles
{
    my ($opts) = @_;

    return unless exists($$opts{dat}{IC}) and @{$$opts{dat}{IC}};

    my $rows = $$opts{dat}{IC};
    my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
    my $fh   = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1 linecolor rgb "red"
        set style line 2 linetype 2 linecolor rgb "black"
        set style line 3 linetype 3 linecolor rgb "green"
        set style line 4 linetype 4 linecolor rgb "blue"
        set style increment user
        set ylabel "Indel count"
        set xlabel "Read Cycle"
        set title "$$args{title}"
        plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'
    ];
    # One inline data set per plot clause: columns 1..4 in turn.
    for my $column (1..4)
    {
        for my $row (@$rows) { print $fh "$$row[0]\t$$row[$column]\n"; }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}
|
|
822
|
|
823
|
|
824
|
|
825
|
|
826
|
|
827
|
|
828
|
|
# Return 1 when the bamcheck file $$opts{bamcheck} has at least two lines
# starting with each of the given tags, 0 otherwise (also when the file
# cannot be read).  With no tags the answer is 1.
#
# The file is read once in Perl; the tags are compared literally, so no
# shell or regular-expression metacharacter in a tag or in the file name is
# interpreted.
sub has_values
{
    my ($opts,@tags) = @_;
    if ( !@tags ) { return 1; }

    my %nlines = map { $_ => 0 } @tags;
    if ( open(my $fh,'<',$$opts{bamcheck}) )
    {
        while (my $line=<$fh>)
        {
            for my $tag (@tags)
            {
                if ( index($line,$tag)==0 ) { $nlines{$tag}++; }
            }
        }
        close($fh);
    }
    for my $tag (@tags)
    {
        if ( $nlines{$tag}<2 ) { return 0; }
    }
    return 1;
}
|
|
840
|
|
# Plot mismatches per cycle for the older MPC layout into
# mism-per-cycle.png.  gnuplot itself reads the MPC lines from the bamcheck
# file (via grep/cut) and sums its columns 2-17, 18-32 and 33 up to the last
# one into three stacked histogram series.
#
# The column and cycle counts are taken from the rows already loaded into
# $$opts{dat}{MPC}, so parse_bamcheck() must have been run first.
sub plot_mismatches_per_cycle_old
{
    my ($opts) = @_;

    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
    # get_defaults() already opened the script for writing; use that handle
    # instead of opening the same file a second time.
    my $fh = $$args{fh};

    my $rows = $$opts{dat}{MPC} || [];
    # Number of columns (without the MPC tag) on the row of cycle 1, and the
    # number of rows minus one: the same values the former
    # grep/awk/sed/wc pipelines produced after their decrement.
    my ($first) = grep { $$_[0]==1 } @$rows;
    my $nquals  = defined $first ? scalar(@$first) : -1;
    my $ncycles = @$rows - 1;

    my @gr0_15 = (2..17);
    my @gr16_30 = (18..32);
    my @gr31_n = (33..$nquals);
    my $gr0_15 = '$'. join('+$',@gr0_15);
    my $gr16_30 = '$'. join('+$',@gr16_30);
    my $gr31_n = '$'. join('+$',@gr31_n);

    print $fh q[
        set terminal png size 600,400 truecolor font "DejaVuSansMono,9"
        set output "] . $$args{img} . q["

        set key left top
        set style data histogram
        set style histogram rowstacked

        set grid back lc rgb "#aaaaaa"
        set ylabel "Number of mismatches"
        set xlabel "Read Cycle"
        set style fill solid border -1
        set title "] . $$args{title} . qq["
        set xrange [-1:$ncycles]

        plot '< grep ^MPC $$opts{bamcheck} | cut -f 2-' using ($gr31_n) ti 'Base Quality>30', '' using ($gr16_30) ti '30>=Q>15', '' using ($gr0_15) ti '15>=Q'
    ];
    close($fh);

    plot($$args{gp});
}
|
|
881
|
|
882
|