comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcfutils.pl @ 0:903fc43d6227 draft default tip

Uploaded
author lsong10
date Fri, 26 Mar 2021 16:52:45 +0000
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:903fc43d6227
1 #!/usr/bin/perl -w
2
3 # Author: lh3
4
5 use strict;
6 use warnings;
7 use Getopt::Std;
8
9 &main;
10 exit;
11
12 sub main {
13 &usage if (@ARGV < 1);
14 my $command = shift(@ARGV);
15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter,
16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats,
17 gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq);
18 die("Unknown command \"$command\".\n") if (!defined($func{$command}));
19 &{$func{$command}};
20 }
21
22 sub splitchr {
23 my %opts = (l=>5000000);
24 getopts('l:', \%opts);
25 my $l = $opts{l};
26 die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);
27 while (<>) {
28 my @t = split;
29 my $last = 0;
30 for (my $i = 0; $i < $t[1];) {
31 my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l;
32 print "$t[0]:".($i+1)."-$e\n";
33 $i = $e;
34 }
35 }
36 }
37
38 sub subsam {
39 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
40 my ($fh, %h);
41 my $fn = shift(@ARGV);
42 my @col;
43 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
44 $h{$_} = 1 for (@ARGV);
45 while (<$fh>) {
46 if (/^##/) {
47 print;
48 } elsif (/^#/) {
49 my @t = split;
50 my @s = @t[0..8]; # all fixed fields + FORMAT
51 for (9 .. $#t) {
52 if ($h{$t[$_]}) {
53 push(@s, $t[$_]);
54 push(@col, $_);
55 }
56 }
57 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
58 print join("\t", @s), "\n";
59 } else {
60 my @t = split;
61 if (@col == 0) {
62 print join("\t", @t[0..7]), "\n";
63 } else {
64 print join("\t", @t[0..8], map {$t[$_]} @col), "\n";
65 }
66 }
67 }
68 close($fh);
69 }
70
71 sub listsam {
72 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN);
73 while (<>) {
74 if (/^#/ && !/^##/) {
75 my @t = split;
76 print join("\n", @t[9..$#t]), "\n";
77 exit;
78 }
79 }
80 }
81
82 sub fillac {
83 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
84 while (<>) {
85 if (/^#/) {
86 print;
87 } else {
88 my @t = split;
89 my @c = (0, 0);
90 my $n = 0;
91 my $s = -1;
92 @_ = split(":", $t[8]);
93 for (0 .. $#_) {
94 if ($_[$_] eq 'GT') { $s = $_; last; }
95 }
96 if ($s < 0) {
97 print join("\t", @t), "\n";
98 next;
99 }
100 for (9 .. $#t) {
101 if ($t[$_] =~ /^0,0,0/) {
102 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) {
103 ++$c[$2]; ++$c[$3];
104 $n += 2;
105 }
106 }
107 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n";
108 my $info = $t[7];
109 $info =~ s/(;?)AC=(\d+)//;
110 $info =~ s/(;?)AN=(\d+)//;
111 if ($info eq '.') {
112 $info = $AC;
113 } else {
114 $info .= ";$AC";
115 }
116 $t[7] = $info;
117 print join("\t", @t), "\n";
118 }
119 }
120 }
121
122 sub ldstats {
123 my %opts = (t=>0.9);
124 getopts('t:', \%opts);
125 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
126 my $cutoff = $opts{t};
127 my ($last, $lastchr) = (0x7fffffff, '');
128 my ($x, $y, $n) = (0, 0, 0);
129 while (<>) {
130 if (/^([^#\s]+)\s(\d+)/) {
131 my ($chr, $pos) = ($1, $2);
132 if (/NEIR=([\d\.]+)/) {
133 ++$n;
134 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff);
135 }
136 $last = $pos; $lastchr = $chr;
137 }
138 }
139 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
140 print "Fraction: ", $y/$n, "\n";
141 print "Length: $x\n";
142 }
143
144 sub qstats {
145 my %opts = (r=>'', s=>0.02, v=>undef);
146 getopts('r:s:v', \%opts);
147 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
148 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
149 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
150 my %h = ();
151 my $is_vcf = defined($opts{v})? 1 : 0;
152 if ($opts{r}) { # read the reference positions
153 my $fh;
154 open($fh, $opts{r}) || die;
155 while (<$fh>) {
156 next if (/^#/);
157 if ($is_vcf) {
158 my @t = split;
159 $h{$t[0],$t[1]} = $t[4];
160 } else {
161 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/);
162 }
163 }
164 close($fh);
165 }
166 my $hsize = scalar(keys %h);
167 my @a;
168 while (<>) {
169 next if (/^#/);
170 my @t = split;
171 next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
172 $t[3] = uc($t[3]); $t[4] = uc($t[4]);
173 my @s = split(',', $t[4]);
174 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
175 next if (length($s[0]) != 1);
176 my $hit;
177 if ($is_vcf) {
178 $hit = 0;
179 my $aa = $h{$t[0],$t[1]};
180 if (defined($aa)) {
181 my @aaa = split(",", $aa);
182 for (@aaa) {
183 $hit = 1 if ($_ eq $s[0]);
184 }
185 }
186 } else {
187 $hit = defined($h{$t[0],$t[1]})? 1 : 0;
188 }
189 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
190 }
191 push(@a, [-1, 0, 0, 0]); # end marker
192 die("[qstats] No SNP data!\n") if (@a == 0);
193 @a = sort {$b->[0]<=>$a->[0]} @a;
194 my $next = $opts{s};
195 my $last = $a[0];
196 my @c = (0, 0, 0, 0);
197 my @lc;
198 $lc[1] = $lc[2] = 0;
199 for my $p (@a) {
200 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
201 my @x;
202 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
203 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
204 $x[2] = sprintf("%.4f", $c[3] / $c[1]);
205 my $a = $c[1] - $lc[1];
206 my $b = $c[2] - $lc[2];
207 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100);
208 print join("\t", $last, @c, @x), "\n";
209 $next = $c[0]/@a + $opts{s};
210 $lc[1] = $c[1]; $lc[2] = $c[2];
211 }
212 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
213 $last = $p->[0];
214 }
215 }
216
217 sub varFilter {
218 my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4);
219 getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts);
220 die(qq/
221 Usage: vcfutils.pl varFilter [options] <in.vcf>
222
223 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}]
224 -d INT minimum read depth [$opts{d}]
225 -D INT maximum read depth [$opts{D}]
226 -a INT minimum number of alternate bases [$opts{a}]
227 -w INT SNP within INT bp around a gap to be filtered [$opts{w}]
228 -W INT window size for filtering adjacent gaps [$opts{W}]
229 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}]
230 -2 FLOAT min P-value for baseQ bias [$opts{2}]
231 -3 FLOAT min P-value for mapQ bias [$opts{3}]
232 -4 FLOAT min P-value for end distance bias [$opts{4}]
233 -e FLOAT min P-value for HWE (plus F<0) [$opts{e}]
234 -p print filtered variants
235
236 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
237 \n/) if (@ARGV == 0 && -t STDIN);
238
239 # calculate the window size
240 my ($ol, $ow) = ($opts{W}, $opts{w});
241 my $max_dist = $ol > $ow? $ol : $ow;
242 # the core loop
243 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
244 while (<>) {
245 my @t = split;
246 if (/^#/) {
247 print; next;
248 }
249 next if ($t[4] eq '.'); # skip non-var sites
250 next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
251 # check if the site is a SNP
252 my $type = 1; # SNP
253 if (length($t[3]) > 1) {
254 $type = 2; # MNP
255 my @s = split(',', $t[4]);
256 for (@s) {
257 $type = 3 if (length != length($t[3]));
258 }
259 } else {
260 my @s = split(',', $t[4]);
261 for (@s) {
262 $type = 3 if (length > 1);
263 }
264 }
265 # clear the out-of-range elements
266 while (@staging) {
267 # Still on the same chromosome and the first element's window still affects this position?
268 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
269 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
270 }
271 my $flt = 0;
272 # parse annotations
273 my ($dp, $mq, $dp_alt) = (-1, -1, -1);
274 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
275 $dp = $1 + $2 + $3 + $4;
276 $dp_alt = $3 + $4;
277 }
278 if ($t[7] =~ /DP=(\d+)/i) {
279 $dp = $1;
280 }
281 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
282 # the depth and mapQ filter
283 if ($dp >= 0) {
284 if ($dp < $opts{d}) {
285 $flt = 2;
286 } elsif ($dp > $opts{D}) {
287 $flt = 3;
288 }
289 }
290 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
291 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
292 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
293 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
294 $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));
295 # HWE filter
296 if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) {
297 my $p = 2*$1 + $2;
298 my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0;
299 $flt = 9 if ($f < 0);
300 }
301
302 my $score = $t[5] * 100 + $dp_alt;
303 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
304 if ($flt == 0) {
305 if ($type == 3) { # an indel
306 # filtering SNPs and MNPs
307 for my $x (@staging) {
308 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
309 $x->[1] = 5;
310 }
311 # check the staging list for indel filtering
312 for my $x (@staging) {
313 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
314 if ($x->[0]>>2 < $score) {
315 $x->[1] = 6;
316 } else {
317 $flt = 6; last;
318 }
319 }
320 } else { # SNP or MNP
321 for my $x (@staging) {
322 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
323 if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
324 && length($x->[7]) - length($x->[6]) == 1) {
325 $x->[1] = 5;
326 } else { $flt = 5; }
327 last;
328 }
329 # check MNP
330 for my $x (@staging) {
331 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
332 if ($x->[0]>>2 < $score) {
333 $x->[1] = 8;
334 } else {
335 $flt = 8; last;
336 }
337 }
338 }
339 }
340 push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
341 }
342 # output the last few elements in the staging list
343 while (@staging) {
344 varFilter_aux(shift @staging, $opts{p});
345 }
346 }
347
348 sub varFilter_aux {
349 my ($first, $is_print) = @_;
350 if ($first->[1] == 0) {
351 print join("\t", @$first[3 .. @$first-1]), "\n";
352 } elsif ($is_print) {
353 print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n";
354 }
355 }
356
357 sub gapstats {
358 my (@c0, @c1);
359 $c0[$_] = $c1[$_] = 0 for (0 .. 10000);
360 while (<>) {
361 next if (/^#/);
362 my @t = split;
363 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
364 my @s = split(',', $t[4]);
365 for my $x (@s) {
366 my $l = length($x) - length($t[3]) + 5000;
367 if ($x =~ /^-/) {
368 $l = -(length($x) - 1) + 5000;
369 } elsif ($x =~ /^\+/) {
370 $l = length($x) - 1 + 5000;
371 }
372 $c0[$l] += 1 / @s;
373 }
374 }
375 for (my $i = 0; $i < 10000; ++$i) {
376 next if ($c0[$i] == 0);
377 $c1[0] += $c0[$i];
378 $c1[1] += $c0[$i] if (($i-5000)%3 == 0);
379 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]);
380 }
381 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]);
382 }
383
384 sub ucscsnp2vcf {
385 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
386 print "##fileformat=VCFv4.0\n";
387 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n";
388 while (<>) {
389 my @t = split("\t");
390 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
391 my $pos = $t[2] + 1;
392 my @alt;
393 push(@alt, $t[7]);
394 if ($t[6] eq '-') {
395 $t[9] = reverse($t[9]);
396 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
397 }
398 my @a = split("/", $t[9]);
399 for (@a) {
400 push(@alt, $_) if ($_ ne $alt[0]);
401 }
402 if ($indel) {
403 --$pos;
404 for (0 .. $#alt) {
405 $alt[$_] =~ tr/-//d;
406 $alt[$_] = "N$alt[$_]";
407 }
408 }
409 my $ref = shift(@alt);
410 my $af = $t[13] > 0? ";AF=$t[13]" : '';
411 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
412 my $info = "molType=$t[10];class=$t[11]$valid$af";
413 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
414 }
415 }
416
417 sub hapmap2vcf {
418 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
419 my $fn = shift(@ARGV);
420 # parse UCSC SNP
421 warn("Parsing UCSC SNPs...\n");
422 my ($fh, %map);
423 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die;
424 while (<$fh>) {
425 my @t = split;
426 next if ($t[3] - $t[2] != 1); # not SNP
427 @{$map{$t[4]}} = @t[1,3,7];
428 }
429 close($fh);
430 # write VCF
431 warn("Writing VCF...\n");
432 print "##fileformat=VCFv4.0\n";
433 while (<>) {
434 my @t = split;
435 if ($t[0] eq 'rs#') { # the first line
436 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
437 } else {
438 next unless ($map{$t[0]});
439 next if (length($t[1]) != 3); # skip non-SNPs
440 my $a = \@{$map{$t[0]}};
441 my $ref = $a->[2];
442 my @u = split('/', $t[1]);
443 if ($u[1] eq $ref) {
444 $u[1] = $u[0]; $u[0] = $ref;
445 } elsif ($u[0] ne $ref) { next; }
446 my $alt = $u[1];
447 my %w;
448 $w{$u[0]} = 0; $w{$u[1]} = 1;
449 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
450 my $is_tri = 0;
451 for (@t[11..$#t]) {
452 if ($_ eq 'NN') {
453 push(@s, './.');
454 } else {
455 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)});
456 if (!defined($a[0]) || !defined($a[1])) {
457 $is_tri = 1;
458 last;
459 }
460 push(@s, "$a[0]/$a[1]");
461 }
462 }
463 next if ($is_tri);
464 print join("\t", @s), "\n";
465 }
466 }
467 }
468
469 sub vcf2fq {
470 my %opts = (d=>3, D=>100000, Q=>10, l=>5);
471 getopts('d:D:Q:l:', \%opts);
472 die(qq/
473 Usage: vcfutils.pl vcf2fq [options] <all-site.vcf>
474
475 Options: -d INT minimum depth [$opts{d}]
476 -D INT maximum depth [$opts{D}]
477 -Q INT min RMS mapQ [$opts{Q}]
478 -l INT INDEL filtering window [$opts{l}]
479 \n/) if (@ARGV == 0 && -t STDIN);
480
481 my ($last_chr, $seq, $qual, $last_pos, @gaps);
482 my $_Q = $opts{Q};
483 my $_d = $opts{d};
484 my $_D = $opts{D};
485
486 my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
487 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');
488
489 $last_chr = '';
490 while (<>) {
491 next if (/^#/);
492 my @t = split;
493 if ($last_chr ne $t[0]) {
494 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr);
495 ($last_chr, $last_pos) = ($t[0], 0);
496 $seq = $qual = '';
497 @gaps = ();
498 }
499 die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);
500 if ($t[1] - $last_pos > 1) {
501 $seq .= 'n' x ($t[1] - $last_pos - 1);
502 $qual .= '!' x ($t[1] - $last_pos - 1);
503 }
504 if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
505 my ($ref, $alt) = ($t[3], $1);
506 my ($b, $q);
507 $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/);
508 if ($q < 0) {
509 $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0;
510 $b = ($_ < .5 || $alt eq '.')? $ref : $alt;
511 $q = -$q;
512 } else {
513 $b = $het{"$ref$alt"};
514 $b ||= 'N';
515 }
516 $b = lc($b);
517 $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D));
518 $q = int($q + 33 + .499);
519 $q = chr($q <= 126? $q : 126);
520 $seq .= $b;
521 $qual .= $q;
522 } elsif ($t[4] ne '.') { # an INDEL
523 push(@gaps, [$t[1], length($t[3])]);
524 }
525 $last_pos = $t[1];
526 }
527 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l});
528 }
529
530 sub v2q_post_process {
531 my ($chr, $seq, $qual, $gaps, $l) = @_;
532 for my $g (@$gaps) {
533 my $beg = $g->[0] > $l? $g->[0] - $l : 0;
534 my $end = $g->[0] + $g->[1] + $l;
535 $end = length($$seq) if ($end > length($$seq));
536 substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
537 }
538 print "\@$chr\n"; &v2q_print_str($seq);
539 print "+\n"; &v2q_print_str($qual);
540 }
541
542 sub v2q_print_str {
543 my ($s) = @_;
544 my $l = length($$s);
545 for (my $i = 0; $i < $l; $i += 60) {
546 print substr($$s, $i, 60), "\n";
547 }
548 }
549
550 sub usage {
551 die(qq/
552 Usage: vcfutils.pl <command> [<arguments>]\n
553 Command: subsam get a subset of samples
554 listsam list the samples
555 fillac fill the allele count field
556 qstats SNP stats stratified by QUAL
557
558 hapmap2vcf convert the hapmap format to VCF
559 ucscsnp2vcf convert UCSC SNP SQL dump to VCF
560
561 varFilter filtering short variants (*)
562 vcf2fq VCF->fastq (**)
563
564 Notes: Commands with description endting with (*) may need bcftools
565 specific annotations.
566 \n/);
567 }