Mercurial > repos > lsong10 > psiclass
view PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcfutils.pl @ 0:903fc43d6227 draft default tip
Uploaded
author | lsong10 |
---|---|
date | Fri, 26 Mar 2021 16:52:45 +0000 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w # Author: lh3 use strict; use warnings; use Getopt::Std; &main; exit; sub main { &usage if (@ARGV < 1); my $command = shift(@ARGV); my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); die("Unknown command \"$command\".\n") if (!defined($func{$command})); &{$func{$command}}; } sub splitchr { my %opts = (l=>5000000); getopts('l:', \%opts); my $l = $opts{l}; die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN); while (<>) { my @t = split; my $last = 0; for (my $i = 0; $i < $t[1];) { my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l; print "$t[0]:".($i+1)."-$e\n"; $i = $e; } } } sub subsam { die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0); my ($fh, %h); my $fn = shift(@ARGV); my @col; open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; $h{$_} = 1 for (@ARGV); while (<$fh>) { if (/^##/) { print; } elsif (/^#/) { my @t = split; my @s = @t[0..8]; # all fixed fields + FORMAT for (9 .. $#t) { if ($h{$t[$_]}) { push(@s, $t[$_]); push(@col, $_); } } pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field print join("\t", @s), "\n"; } else { my @t = split; if (@col == 0) { print join("\t", @t[0..7]), "\n"; } else { print join("\t", @t[0..8], map {$t[$_]} @col), "\n"; } } } close($fh); } sub listsam { die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN); while (<>) { if (/^#/ && !/^##/) { my @t = split; print join("\n", @t[9..$#t]), "\n"; exit; } } } sub fillac { die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN); while (<>) { if (/^#/) { print; } else { my @t = split; my @c = (0, 0); my $n = 0; my $s = -1; @_ = split(":", $t[8]); for (0 .. $#_) { if ($_[$_] eq 'GT') { $s = $_; last; } } if ($s < 0) { print join("\t", @t), "\n"; next; } for (9 .. $#t) { if ($t[$_] =~ /^0,0,0/) { } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) { ++$c[$2]; ++$c[$3]; $n += 2; } } my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n"; my $info = $t[7]; $info =~ s/(;?)AC=(\d+)//; $info =~ s/(;?)AN=(\d+)//; if ($info eq '.') { $info = $AC; } else { $info .= ";$AC"; } $t[7] = $info; print join("\t", @t), "\n"; } } } sub ldstats { my %opts = (t=>0.9); getopts('t:', \%opts); die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN); my $cutoff = $opts{t}; my ($last, $lastchr) = (0x7fffffff, ''); my ($x, $y, $n) = (0, 0, 0); while (<>) { if (/^([^#\s]+)\s(\d+)/) { my ($chr, $pos) = ($1, $2); if (/NEIR=([\d\.]+)/) { ++$n; ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff); } $last = $pos; $lastchr = $chr; } } print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n"; print "Fraction: ", $y/$n, "\n"; print "Length: $x\n"; } sub qstats { my %opts = (r=>'', s=>0.02, v=>undef); getopts('r:s:v', \%opts); die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN); my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); my %h = (); my $is_vcf = defined($opts{v})? 1 : 0; if ($opts{r}) { # read the reference positions my $fh; open($fh, $opts{r}) || die; while (<$fh>) { next if (/^#/); if ($is_vcf) { my @t = split; $h{$t[0],$t[1]} = $t[4]; } else { $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); } } close($fh); } my $hsize = scalar(keys %h); my @a; while (<>) { next if (/^#/); my @t = split; next if (length($t[3]) != 1 || uc($t[3]) eq 'N'); $t[3] = uc($t[3]); $t[4] = uc($t[4]); my @s = split(',', $t[4]); $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0); next if (length($s[0]) != 1); my $hit; if ($is_vcf) { $hit = 0; my $aa = $h{$t[0],$t[1]}; if (defined($aa)) { my @aaa = split(",", $aa); for (@aaa) { $hit = 1 if ($_ eq $s[0]); } } } else { $hit = defined($h{$t[0],$t[1]})? 1 : 0; } push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]); } push(@a, [-1, 0, 0, 0]); # end marker die("[qstats] No SNP data!\n") if (@a == 0); @a = sort {$b->[0]<=>$a->[0]} @a; my $next = $opts{s}; my $last = $a[0]; my @c = (0, 0, 0, 0); my @lc; $lc[1] = $lc[2] = 0; for my $p (@a) { if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) { my @x; $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100); $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0); $x[2] = sprintf("%.4f", $c[3] / $c[1]); my $a = $c[1] - $lc[1]; my $b = $c[2] - $lc[2]; $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100); print join("\t", $last, @c, @x), "\n"; $next = $c[0]/@a + $opts{s}; $lc[1] = $c[1]; $lc[2] = $c[2]; } ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3]; $last = $p->[0]; } } sub varFilter { my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); die(qq/ Usage: vcfutils.pl varFilter [options] <in.vcf> Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] -d INT minimum read depth [$opts{d}] -D INT maximum read depth [$opts{D}] -a INT minimum number of alternate bases [$opts{a}] -w INT SNP within INT bp around a gap to be filtered [$opts{w}] -W INT window size for filtering adjacent gaps [$opts{W}] -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] -2 FLOAT min P-value for baseQ bias [$opts{2}] -3 FLOAT min P-value for mapQ bias [$opts{3}] -4 FLOAT min P-value for end distance bias [$opts{4}] -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] -p print filtered variants Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. \n/) if (@ARGV == 0 && -t STDIN); # calculate the window size my ($ol, $ow) = ($opts{W}, $opts{w}); my $max_dist = $ol > $ow? $ol : $ow; # the core loop my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...) while (<>) { my @t = split; if (/^#/) { print; next; } next if ($t[4] eq '.'); # skip non-var sites next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') # check if the site is a SNP my $type = 1; # SNP if (length($t[3]) > 1) { $type = 2; # MNP my @s = split(',', $t[4]); for (@s) { $type = 3 if (length != length($t[3])); } } else { my @s = split(',', $t[4]); for (@s) { $type = 3 if (length > 1); } } # clear the out-of-range elements while (@staging) { # Still on the same chromosome and the first element's window still affects this position? last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much } my $flt = 0; # parse annotations my ($dp, $mq, $dp_alt) = (-1, -1, -1); if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) { $dp = $1 + $2 + $3 + $4; $dp_alt = $3 + $4; } if ($t[7] =~ /DP=(\d+)/i) { $dp = $1; } $mq = $1 if ($t[7] =~ /MQ=(\d+)/i); # the depth and mapQ filter if ($dp >= 0) { if ($dp < $opts{d}) { $flt = 2; } elsif ($dp > $opts{D}) { $flt = 3; } } $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); # HWE filter if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { my $p = 2*$1 + $2; my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; $flt = 9 if ($f < 0); } my $score = $t[5] * 100 + $dp_alt; my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs if ($flt == 0) { if ($type == 3) { # an indel # filtering SNPs and MNPs for my $x (@staging) { next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); $x->[1] = 5; } # check the staging list for indel filtering for my $x (@staging) { next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); if ($x->[0]>>2 < $score) { $x->[1] = 6; } else { $flt = 6; last; } } } else { # SNP or MNP for my $x (@staging) { next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) && length($x->[7]) - length($x->[6]) == 1) { $x->[1] = 5; } else { $flt = 5; } last; } # check MNP for my $x (@staging) { next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); if ($x->[0]>>2 < $score) { $x->[1] = 8; } else { $flt = 8; last; } } } } push(@staging, [$score<<2|$type, $flt, $rlen, @t]); } # output the last few elements in the staging list while (@staging) { varFilter_aux(shift @staging, $opts{p}); } } sub varFilter_aux { my ($first, $is_print) = @_; if ($first->[1] == 0) { print join("\t", @$first[3 .. @$first-1]), "\n"; } elsif ($is_print) { print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; } } sub gapstats { my (@c0, @c1); $c0[$_] = $c1[$_] = 0 for (0 .. 10000); while (<>) { next if (/^#/); my @t = split; next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel my @s = split(',', $t[4]); for my $x (@s) { my $l = length($x) - length($t[3]) + 5000; if ($x =~ /^-/) { $l = -(length($x) - 1) + 5000; } elsif ($x =~ /^\+/) { $l = length($x) - 1 + 5000; } $c0[$l] += 1 / @s; } } for (my $i = 0; $i < 10000; ++$i) { next if ($c0[$i] == 0); $c1[0] += $c0[$i]; $c1[1] += $c0[$i] if (($i-5000)%3 == 0); printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]); } printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]); } sub ucscsnp2vcf { die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN); print "##fileformat=VCFv4.0\n"; print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n"; while (<>) { my @t = split("\t"); my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1; my $pos = $t[2] + 1; my @alt; push(@alt, $t[7]); if ($t[6] eq '-') { $t[9] = reverse($t[9]); $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; } my @a = split("/", $t[9]); for (@a) { push(@alt, $_) if ($_ ne $alt[0]); } if ($indel) { --$pos; for (0 .. $#alt) { $alt[$_] =~ tr/-//d; $alt[$_] = "N$alt[$_]"; } } my $ref = shift(@alt); my $af = $t[13] > 0? ";AF=$t[13]" : ''; my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]"; my $info = "molType=$t[10];class=$t[11]$valid$af"; print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n"; } } sub hapmap2vcf { die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0); my $fn = shift(@ARGV); # parse UCSC SNP warn("Parsing UCSC SNPs...\n"); my ($fh, %map); open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; while (<$fh>) { my @t = split; next if ($t[3] - $t[2] != 1); # not SNP @{$map{$t[4]}} = @t[1,3,7]; } close($fh); # write VCF warn("Writing VCF...\n"); print "##fileformat=VCFv4.0\n"; while (<>) { my @t = split; if ($t[0] eq 'rs#') { # the first line print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n"; } else { next unless ($map{$t[0]}); next if (length($t[1]) != 3); # skip non-SNPs my $a = \@{$map{$t[0]}}; my $ref = $a->[2]; my @u = split('/', $t[1]); if ($u[1] eq $ref) { $u[1] = $u[0]; $u[0] = $ref; } elsif ($u[0] ne $ref) { next; } my $alt = $u[1]; my %w; $w{$u[0]} = 0; $w{$u[1]} = 1; my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT'); my $is_tri = 0; for (@t[11..$#t]) { if ($_ eq 'NN') { push(@s, './.'); } else { my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)}); if (!defined($a[0]) || !defined($a[1])) { $is_tri = 1; last; } push(@s, "$a[0]/$a[1]"); } } next if ($is_tri); print join("\t", @s), "\n"; } } } sub vcf2fq { my %opts = (d=>3, D=>100000, Q=>10, l=>5); getopts('d:D:Q:l:', \%opts); die(qq/ Usage: vcfutils.pl vcf2fq [options] <all-site.vcf> Options: -d INT minimum depth [$opts{d}] -D INT maximum depth [$opts{D}] -Q INT min RMS mapQ [$opts{Q}] -l INT INDEL filtering window [$opts{l}] \n/) if (@ARGV == 0 && -t STDIN); my ($last_chr, $seq, $qual, $last_pos, @gaps); my $_Q = $opts{Q}; my $_d = $opts{d}; my $_D = $opts{D}; my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); $last_chr = ''; while (<>) { next if (/^#/); my @t = split; if ($last_chr ne $t[0]) { &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); ($last_chr, $last_pos) = ($t[0], 0); $seq = $qual = ''; @gaps = (); } die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); if ($t[1] - $last_pos > 1) { $seq .= 'n' x ($t[1] - $last_pos - 1); $qual .= '!' x ($t[1] - $last_pos - 1); } if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference my ($ref, $alt) = ($t[3], $1); my ($b, $q); $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); if ($q < 0) { $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; $b = ($_ < .5 || $alt eq '.')? $ref : $alt; $q = -$q; } else { $b = $het{"$ref$alt"}; $b ||= 'N'; } $b = lc($b); $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); $q = int($q + 33 + .499); $q = chr($q <= 126? $q : 126); $seq .= $b; $qual .= $q; } elsif ($t[4] ne '.') { # an INDEL push(@gaps, [$t[1], length($t[3])]); } $last_pos = $t[1]; } &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); } sub v2q_post_process { my ($chr, $seq, $qual, $gaps, $l) = @_; for my $g (@$gaps) { my $beg = $g->[0] > $l? $g->[0] - $l : 0; my $end = $g->[0] + $g->[1] + $l; $end = length($$seq) if ($end > length($$seq)); substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); } print "\@$chr\n"; &v2q_print_str($seq); print "+\n"; &v2q_print_str($qual); } sub v2q_print_str { my ($s) = @_; my $l = length($$s); for (my $i = 0; $i < $l; $i += 60) { print substr($$s, $i, 60), "\n"; } } sub usage { die(qq/ Usage: vcfutils.pl <command> [<arguments>]\n Command: subsam get a subset of samples listsam list the samples fillac fill the allele count field qstats SNP stats stratified by QUAL hapmap2vcf convert the hapmap format to VCF ucscsnp2vcf convert UCSC SNP SQL dump to VCF varFilter filtering short variants (*) vcf2fq VCF->fastq (**) Notes: Commands with description endting with (*) may need bcftools specific annotations. \n/); }