annotate pyPRADA_1.2/tools/samtools-0.1.16/bcftools/vcfutils.pl @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/usr/bin/perl -w
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 # Author: lh3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 use strict;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 use warnings;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 use Getopt::Std;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Script entry point: run the command dispatcher, then terminate.
main();
exit;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Command dispatcher.
#
# Takes the first word of @ARGV as the sub-command name and invokes the
# handler registered for it. Whatever is left in @ARGV afterwards is consumed
# by the handler itself. Calls usage() when no argument was given and dies
# with a diagnostic when the command name is not registered.
sub main {
    usage() unless @ARGV;

    my $command = shift @ARGV;

    my %handler_for = (
        subsam      => \&subsam,
        listsam     => \&listsam,
        fillac      => \&fillac,
        qstats      => \&qstats,
        varFilter   => \&varFilter,
        hapmap2vcf  => \&hapmap2vcf,
        ucscsnp2vcf => \&ucscsnp2vcf,
        filter4vcf  => \&varFilter,     # alias of varFilter
        ldstats     => \&ldstats,
        gapstats    => \&gapstats,
        splitchr    => \&splitchr,
        vcf2fq      => \&vcf2fq,
    );

    my $handler = $handler_for{$command};
    die("Unknown command \"$command\".\n") unless defined $handler;

    $handler->();
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# splitchr: print "name:start-end" regions (1-based, inclusive) that tile
# every sequence listed in a FASTA index read from the files named in @ARGV,
# or from STDIN when @ARGV is empty.
#
# Options (taken from @ARGV):
#   -l NUM   target region length [5000000]
#
# Only the first two whitespace-separated columns of each input line are
# used: sequence name and sequence length. A sequence is cut into consecutive
# windows of -l bases; once fewer than 1.1 windows remain, the remainder is
# emitted as one final region so that no tiny trailing window is produced.
#
# Dies with the usage text when there is no input, and with a diagnostic when
# -l is not a positive number (the original divided by zero on "-l 0").
# Lines without a numeric length column (e.g. blank lines) are skipped.
sub splitchr {
    my %opts = (l => 5000000);
    getopts('l:', \%opts);
    die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN);

    my $step = $opts{l};
    my $step_ok = do { no warnings 'numeric'; $step > 0 };
    die(qq/[splitchr] -l must be a positive number (got "$step")\n/) unless ($step_ok);

    while (my $line = <>) {
        my ($name, $len) = split(' ', $line);
        next unless (defined($len) && $len =~ /^\d+$/);
        my $start = 0;
        while ($start < $len) {
            # Absorb a short tail into the last window instead of emitting it alone.
            my $end = ($len - $start) / $step < 1.1 ? $len : $start + $step;
            print "$name:" . ($start + 1) . "-$end\n";
            $start = $end;
        }
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# subsam: print a copy of a VCF restricted to the requested sample columns.
#
# Arguments (taken from @ARGV):
#   <in.vcf>    input file; a name ending in ".gz" is decompressed through
#               "gzip -dc", and "-" reads STDIN
#   [samples]   names of the sample columns to keep
#
# "##" meta lines are echoed unchanged. On the "#CHROM" header line the eight
# fixed columns plus FORMAT are kept, followed by the selected samples in file
# order; when no sample is selected the FORMAT column is dropped as well, on
# the header and on every data line.
#
# The input is opened with the three-argument / list form of open() so the
# file name is never interpreted by a shell or as an open mode, and open
# failures are reported with the file name and the system error.
sub subsam {
    die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0);
    my $fn = shift(@ARGV);
    my %wanted = map { $_ => 1 } @ARGV;

    my $fh;
    if ($fn eq '-') {
        # two-argument open() used to map "-" to STDIN; keep that working
        $fh = \*STDIN;
    } elsif ($fn =~ /\.gz$/) {
        open($fh, '-|', 'gzip', '-dc', $fn) or die(qq/[subsam] fail to run gzip on "$fn": $!\n/);
    } else {
        open($fh, '<', $fn) or die(qq/[subsam] fail to open "$fn": $!\n/);
    }

    my @col; # indices of the selected sample columns
    while (my $line = <$fh>) {
        if ($line =~ /^##/) {
            print $line;
        } elsif ($line =~ /^#/) {
            my @t = split(' ', $line);
            my @s = @t[0..8]; # all fixed fields + FORMAT
            for my $i (9 .. $#t) {
                if ($wanted{$t[$i]}) {
                    push(@s, $t[$i]);
                    push(@col, $i);
                }
            }
            pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field
            print join("\t", @s), "\n";
        } else {
            my @t = split(' ', $line);
            if (@col == 0) {
                print join("\t", @t[0..7]), "\n";
            } else {
                print join("\t", @t[0..8], @t[@col]), "\n";
            }
        }
    }
    close($fh);
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# listsam: print the sample names of a VCF, one per line.
#
# Reads the files named in @ARGV (or STDIN) up to the first line that starts
# with a single "#", prints every whitespace-separated column after the ninth
# one, and then terminates the whole program via exit. When no such line is
# found, nothing is printed and the sub returns normally.
sub listsam {
    if (@ARGV == 0 && -t STDIN) {
        die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/);
    }
    while (my $line = <>) {
        next unless ($line =~ /^#/);
        next if ($line =~ /^##/);
        my @fields  = split(' ', $line);
        my @samples = @fields > 9 ? @fields[9 .. $#fields] : ();
        print join("\n", @samples) . "\n";
        exit;
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# fillac: recompute the AC (per-ALT allele count) and AN (number of called
# alleles) INFO tags from the genotypes and print the updated VCF.
#
# Input is read from the files named in @ARGV, or from STDIN. Header lines
# are echoed unchanged. On data lines the position of "GT" inside the FORMAT
# column is looked up; lines without a GT entry are printed as they are
# (re-joined with tabs).
#
# For every sample whose GT holds two numeric alleles, both alleles are
# counted. Samples whose field starts with "0,0,0" and genotypes that are not
# numeric (e.g. "./.") are not counted. Any existing AC/AN tags are removed
# from INFO and the fresh values are appended.
#
# Fixes over the previous implementation:
#   * AC values are joined with "," - they used to be joined with a TAB,
#     which broke the column layout at multi-allelic sites;
#   * existing multi-valued tags ("AC=1,2") are removed completely, only
#     whole tags named AC or AN are touched (e.g. "MAC=5" is left alone) and
#     no stray ";" is left at the start of INFO;
#   * ALT alleles that no sample carries are reported as 0;
#   * @_ is no longer overwritten.
sub fillac {
    die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN);
    while (my $line = <>) {
        if ($line =~ /^#/) {
            print $line;
            next;
        }
        my @t = split(' ', $line);

        # index of GT within the colon-separated FORMAT column
        my $gt_idx = -1;
        if (defined($t[8])) {
            my @fmt = split(':', $t[8]);
            for my $i (0 .. $#fmt) {
                if ($fmt[$i] eq 'GT') { $gt_idx = $i; last; }
            }
        }
        if ($gt_idx < 0) {
            print join("\t", @t), "\n";
            next;
        }

        # one counter per allele: slot 0 is REF, slots 1..n are the ALT alleles
        my @alts = grep { $_ ne '.' } split(',', $t[4]);
        my @c = (0) x (@alts + 1);
        my $n = 0;
        for my $sample (@t[9 .. $#t]) {
            next if ($sample =~ /^0,0,0/);
            # skip $gt_idx leading "xxx:" sub-fields, then read "a/b" or "a|b"
            next unless ($sample =~ /^(?:[^\s:]+:){$gt_idx}(\d+).(\d+)/);
            my ($first, $second) = ($1, $2);
            ++$c[$first];
            ++$c[$second];
            $n += 2;
        }
        my @ac = map { defined($_) ? $_ : 0 } @c[1 .. $#c];

        # drop stale AC/AN tags and the "." placeholder, then append new ones
        my @info = grep { $_ ne '' && $_ ne '.' && $_ !~ /^A[CN]=/ } split(';', $t[7]);
        push(@info, 'AC=' . join(',', @ac)) if (@ac);
        push(@info, "AN=$n");
        $t[7] = join(';', @info);
        print join("\t", @t), "\n";
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# ldstats: summarise the NEIR annotation of a VCF.
#
# Options (taken from @ARGV):
#   -t FLOAT   NEIR threshold [0.9]
#
# Every data line (a line starting with a non-"#" name followed by a numeric
# position) that carries "NEIR=<number>" is counted. Such a record also
# counts as an interval in strong LD when it is on the same chromosome as the
# previous data line, lies at a larger position, and its NEIR value exceeds
# the threshold; the distance to the previous position is then added to the
# total length.
#
# Prints three lines: the number of intervals, their fraction among all
# NEIR-annotated records, and the summed length. The fraction is reported as
# 0 when no record carries NEIR (this used to die with a division by zero).
sub ldstats {
    my %opts = (t=>0.9);
    getopts('t:', \%opts);
    die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
    my $cutoff = $opts{t};
    # start with an impossible previous position so the first record never counts
    my ($last, $lastchr) = (0x7fffffff, '');
    my ($x, $y, $n) = (0, 0, 0);
    while (my $line = <>) {
        next unless ($line =~ /^([^#\s]+)\s(\d+)/);
        my ($chr, $pos) = ($1, $2);
        if ($line =~ /NEIR=([\d\.]+)/) {
            my $neir = $1;
            ++$n;
            if ($lastchr eq $chr && $pos > $last && $neir > $cutoff) {
                ++$y;
                $x += $pos - $last;
            }
        }
        $last = $pos; $lastchr = $chr;
    }
    print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n";
    print "Fraction: ", ($n? $y/$n : 0), "\n";
    print "Length: $x\n";
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# qstats: cumulative statistics of non-indel sites ordered by decreasing QUAL.
#
# Options (taken from @ARGV):
#   -r FILE    reference site list; the first two columns are chromosome and
#              position
#   -v         treat the -r file as VCF and require the first ALT allele of
#              the input to be among the reference ALT alleles for a hit
#   -s FLOAT   minimum fraction of records between two output rows [0.02]
#
# Sites whose REF is not a single base, whose REF is N, or whose first ALT
# allele is not a single character are discarded. A QUAL of "." or below 0 is
# replaced by 3. After sorting by QUAL (highest first) a row is printed
# whenever QUAL changes and more than the -s fraction of records has
# accumulated since the previous row, and once more at the end. Each row is
#   QUAL  #non-indel  #SNPs  #transitions  #joint  ts/tv  #joint/#ref
#   #joint/#SNPs  ts/tv-since-previous-row
# where ts/tv is reported as 100 when there are no transversions.
#
# Fixes over the previous implementation:
#   * the "No SNP data" check now runs before the end marker is appended, so
#     empty input yields that message instead of a division-by-zero error;
#   * #joint/#SNPs is reported as 0 when no SNP has been seen yet;
#   * the QUAL of the previous record starts as a number, not an array ref;
#   * the -r file is opened with three-argument open() and failures name it;
#   * blank or truncated lines are skipped;
#   * loop locals no longer shadow the sort variables $a and $b.
sub qstats {
    my %opts = (r=>'', s=>0.02, v=>undef);
    getopts('r:s:v', \%opts);
    die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n
Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN);
    my %ts = (AG=>1, GA=>1, CT=>1, TC=>1);
    my %h = ();
    my $is_vcf = defined($opts{v})? 1 : 0;
    if ($opts{r}) { # read the reference positions
        open(my $fh, '<', $opts{r}) or die(qq/[qstats] fail to open "$opts{r}": $!\n/);
        while (<$fh>) {
            next if (/^#/);
            if ($is_vcf) {
                my @t = split;
                next if (@t < 5);
                $h{$t[0],$t[1]} = $t[4];
            } elsif (/^(\S+)\s+(\d+)/) {
                $h{$1,$2} = 1;
            }
        }
        close($fh);
    }
    my $hsize = scalar(keys %h);

    # one record per kept site: [QUAL, is_SNP, is_transition, is_in_reference]
    my @a;
    while (<>) {
        next if (/^#/);
        my @t = split;
        next if (@t < 6);
        next if (length($t[3]) != 1 || uc($t[3]) eq 'N');
        $t[3] = uc($t[3]); $t[4] = uc($t[4]);
        my @s = split(',', $t[4]);
        $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0);
        next if (!defined($s[0]) || length($s[0]) != 1);
        my $hit;
        if ($is_vcf) {
            $hit = 0;
            my $aa = $h{$t[0],$t[1]};
            if (defined($aa)) {
                for my $ref_alt (split(",", $aa)) {
                    $hit = 1 if ($ref_alt eq $s[0]);
                }
            }
        } else {
            $hit = defined($h{$t[0],$t[1]})? 1 : 0;
        }
        push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]);
    }
    die("[qstats] No SNP data!\n") if (@a == 0);
    push(@a, [-1, 0, 0, 0]); # end marker; real QUAL values are never negative here
    @a = sort {$b->[0]<=>$a->[0]} @a;

    my $next = $opts{s};
    my $last = $a[0][0];
    my @c = (0, 0, 0, 0); # cumulative: records, SNPs, transitions, reference hits
    my @lc;               # SNP and transition counts at the previous output row
    $lc[1] = $lc[2] = 0;
    for my $p (@a) {
        if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) {
            my @x;
            $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100);
            $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0);
            $x[2] = sprintf("%.4f", $c[1]? $c[3] / $c[1] : 0);
            my $snp_delta = $c[1] - $lc[1];
            my $ts_delta  = $c[2] - $lc[2];
            $x[3] = sprintf("%.4f", $snp_delta-$ts_delta? $ts_delta / ($snp_delta-$ts_delta) : 100);
            print join("\t", $last, @c, @x), "\n";
            $next = $c[0]/@a + $opts{s};
            $lc[1] = $c[1]; $lc[2] = $c[2];
        }
        ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3];
        $last = $p->[0];
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# varFilter: filter the variant records of a VCF read from the files named in
# @ARGV or from STDIN. Header lines are echoed; records with ALT "." or REF
# "N" are dropped silently. Each remaining record is classified as SNP (1),
# MNP (2) or indel (3) from the REF/ALT lengths, receives a filter code, and
# is kept in a staging window until no later record can affect it; it is then
# handed to varFilter_aux() together with the -p flag.
#
# Filter codes assigned here (0 = pass):
#   1 mapping quality below -Q          2 depth below -d
#   3 depth above -D                    4 alternate-base count below -a
#   5 SNP/MNP within -w bp of an indel  6 weaker of two indels within -W bp
#   7 a PV4 P-value below -1/-2/-3/-4   8 MXGQ below -G or MXSP at/above -S,
#                                         or the weaker of overlapping MNPs
#
# Depth comes from DP= (else the DP4 sum), the alternate count from the last
# two DP4 values, mapping quality from MQ=. "Weaker" is decided by the score
# QUAL*100 + alternate count.
#
# Changes over the previous implementation:
#   * -G and -S are now listed in the usage text;
#   * header lines are recognised before the line is split;
#   * blank or truncated lines (fewer than 8 columns) are skipped;
#   * a non-numeric QUAL (".") counts as 0 without a warning;
#   * a negative score is clamped to 0. "<<" works on unsigned integers, so a
#     score of -1 (QUAL 0 without DP4) used to turn into a huge value and the
#     weakest call won every comparison.
sub varFilter {
    my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>10, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000);
    getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:', \%opts);
    die(qq/
Usage:   vcfutils.pl varFilter [options] <in.vcf>

Options: -Q INT    minimum RMS mapping quality for SNPs [$opts{Q}]
         -d INT    minimum read depth [$opts{d}]
         -D INT    maximum read depth [$opts{D}]
         -a INT    minimum number of alternate bases [$opts{a}]
         -w INT    SNP within INT bp around a gap to be filtered [$opts{w}]
         -W INT    window size for filtering adjacent gaps [$opts{W}]
         -1 FLOAT  min P-value for strand bias (given PV4) [$opts{1}]
         -2 FLOAT  min P-value for baseQ bias [$opts{2}]
         -3 FLOAT  min P-value for mapQ bias [$opts{3}]
         -4 FLOAT  min P-value for end distance bias [$opts{4}]
         -G INT    minimum value of the MXGQ annotation [$opts{G}]
         -S INT    filter sites whose MXSP annotation is INT or larger [$opts{S}]
         -p        print filtered variants

Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools.
\n/) if (@ARGV == 0 && -t STDIN);

    # calculate the window size
    my ($ol, $ow) = ($opts{W}, $opts{w});
    my $max_dist = $ol > $ow? $ol : $ow;
    # the core loop
    my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...)
    while (<>) {
        if (/^#/) {
            print; next;
        }
        my @t = split;
        next if (@t < 8);       # blank or truncated line
        next if ($t[4] eq '.'); # skip non-var sites
        next if ($t[3] eq 'N'); # skip sites with unknown ref ('N')
        # check if the site is a SNP
        my @alts = split(',', $t[4]);
        my $type = 1; # SNP
        if (length($t[3]) > 1) {
            $type = 2; # MNP
            for my $alt (@alts) {
                $type = 3 if (length($alt) != length($t[3]));
            }
        } else {
            for my $alt (@alts) {
                $type = 3 if (length($alt) > 1);
            }
        }
        # clear the out-of-range elements
        while (@staging) {
            # Still on the same chromosome and the first element's window still affects this position?
            last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]);
            varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much
        }
        my $flt = 0;
        # parse annotations
        my ($dp, $mq, $dp_alt) = (-1, -1, -1);
        if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) {
            $dp = $1 + $2 + $3 + $4;
            $dp_alt = $3 + $4;
        }
        if ($t[7] =~ /DP=(\d+)/i) {
            $dp = $1;
        }
        $mq = $1 if ($t[7] =~ /MQ=(\d+)/i);
        # the depth and mapQ filter
        if ($dp >= 0) {
            if ($dp < $opts{d}) {
                $flt = 2;
            } elsif ($dp > $opts{D}) {
                $flt = 3;
            }
        }
        $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a});
        $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q});
        # the remaining annotation filters look at the whole line ($_)
        $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/
                     && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4}));
        $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S})));

        my $qual = ($t[5] =~ /^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/)? $t[5] : 0;
        my $score = $qual * 100 + $dp_alt;
        $score = 0 if ($score < 0); # keep the packed value below well-defined
        my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs
        if ($flt == 0) {
            if ($type == 3) { # an indel
                # filtering SNPs and MNPs
                for my $x (@staging) {
                    next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]);
                    $x->[1] = 5;
                }
                # check the staging list for indel filtering
                for my $x (@staging) {
                    next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]);
                    if ($x->[0]>>2 < $score) {
                        $x->[1] = 6;
                    } else {
                        $flt = 6; last;
                    }
                }
            } else { # SNP or MNP
                # only the first staged indel within -w bp is examined
                for my $x (@staging) {
                    next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]);
                    if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1)
                        && length($x->[7]) - length($x->[6]) == 1) {
                        $x->[1] = 5;
                    } else { $flt = 5; }
                    last;
                }
                # check MNP
                for my $x (@staging) {
                    next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]);
                    if ($x->[0]>>2 < $score) {
                        $x->[1] = 8;
                    } else {
                        $flt = 8; last;
                    }
                }
            }
        }
        # slot 0 packs the integer score (upper bits) with the type (low 2 bits)
        push(@staging, [$score<<2|$type, $flt, $rlen, @t]);
    }
    # output the last few elements in the staging list
    while (@staging) {
        varFilter_aux(shift @staging, $opts{p});
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
340
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# varFilter_aux: emit one staged record.
#
#   $first     array ref; element 1 is the filter code and elements 3..end
#              are the columns of the record
#   $is_print  when true, filtered records are reported on STDERR
#
# A record with filter code 0 is written to STDOUT as a tab-joined line. Any
# other record is written to STDERR only when $is_print is set, preceded by
# the character found at position <filter code> of "UQdDaGgPMS".
sub varFilter_aux {
    my ($first, $is_print) = @_;
    my $flt    = $first->[1];
    my @fields = @{$first}[3 .. $#{$first}];
    if ($flt == 0) {
        print join("\t", @fields) . "\n";
    }
    elsif ($is_print) {
        my $tag = substr("UQdDaGgPMS", $flt, 1);
        print STDERR join("\t", $tag, @fields) . "\n";
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
349
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# gapstats: histogram of length differences between ALT and REF alleles.
#
# Reads a VCF from the files named in @ARGV or from STDIN. Records whose REF
# is a single base and whose ALT alleles are all single letters are skipped.
# For every ALT allele of the remaining records the signed length difference
# is length(ALT) - length(REF); an allele written as "-SEQ" or "+SEQ" counts
# as a deletion or insertion of length(SEQ). Each record contributes a total
# weight of 1, shared equally among its ALT alleles.
#
# Output: one "C <length> <count>" line per observed length in increasing
# order, then a "3 <total> <multiple-of-3> <ratio>" line giving the share of
# differences that are a multiple of three bases.
#
# Fixes over the previous implementation:
#   * counts are keyed by length in a hash, so lengths beyond +/-5000 no
#     longer index outside a fixed array and +5000 is no longer left out of
#     the report;
#   * the ratio is reported as 0 when nothing was counted instead of dying
#     with a division by zero;
#   * blank or truncated lines are skipped;
#   * a usage message is printed when there is no input, as the other
#     commands do.
sub gapstats {
    die("Usage: vcfutils.pl gapstats <in.vcf>\n") if (@ARGV == 0 && -t STDIN);
    my %count_of; # signed length difference -> weighted count
    while (<>) {
        next if (/^#/);
        my @t = split;
        next if (@t < 5);
        next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel
        my @s = split(',', $t[4]);
        for my $x (@s) {
            my $diff = length($x) - length($t[3]);
            if ($x =~ /^-/) {
                $diff = -(length($x) - 1);
            } elsif ($x =~ /^\+/) {
                $diff = length($x) - 1;
            }
            $count_of{$diff} += 1 / @s;
        }
    }
    my ($total, $triplet) = (0, 0);
    for my $diff (sort { $a <=> $b } keys %count_of) {
        next if ($count_of{$diff} == 0);
        $total   += $count_of{$diff};
        $triplet += $count_of{$diff} if ($diff % 3 == 0);
        printf("C\t%d\t%.2f\n", $diff, $count_of{$diff});
    }
    printf("3\t%d\t%d\t%.3f\n", $total, $triplet, $total? $triplet/$total : 0);
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
376
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# ucscsnp2vcf: convert a tab-separated SNP table into a minimal VCF
# (eight fixed columns, no samples) on STDOUT.
#
# Input is read from the files named in @ARGV or from STDIN. The columns
# used, by zero-based index, are: 1 chromosome, 2 start (one is added to get
# POS), 4 identifier, 6 strand, 7 reference allele, 9 "/"-separated observed
# alleles, 10 molType, 11 class, 12 validation status, 13 value written as AF
# when it is greater than 0.
# NOTE(review): this looks like the UCSC "snpNNN" table layout - confirm.
#
# When the strand is "-" the observed alleles are reverse-complemented.
# Observed alleles that differ from the reference become ALT. A record whose
# observed column is not a plain list of single A/C/G/T bases is treated as
# an indel: POS is moved one base to the left, "-" characters are removed and
# every allele is prefixed with "N" as a placeholder anchor base.
#
# Changes over the previous implementation:
#   * the usage message names the sub-command;
#   * lines are chomped before splitting, so the last column never carries
#     the line terminator;
#   * lines with fewer than 14 columns are skipped.
sub ucscsnp2vcf {
    die("Usage: vcfutils.pl ucscsnp2vcf <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN);
    print "##fileformat=VCFv4.0\n";
    print "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n";
    while (<>) {
        chomp;
        my @t = split("\t");
        next if (@t < 14);
        # decided on the alleles as written, before any strand correction
        my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1;
        my $pos = $t[2] + 1;
        my @alt;
        push(@alt, $t[7]);
        if ($t[6] eq '-') {
            $t[9] = reverse($t[9]);
            $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/;
        }
        for my $allele (split("/", $t[9])) {
            push(@alt, $allele) if ($allele ne $alt[0]);
        }
        if ($indel) {
            --$pos;
            for my $allele (@alt) {
                $allele =~ tr/-//d;
                $allele = "N$allele";
            }
        }
        my $ref = shift(@alt);
        my $af = $t[13] > 0? ";AF=$t[13]" : '';
        my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]";
        my $info = "molType=$t[10];class=$t[11]$valid$af";
        print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n";
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
409
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Convert HapMap genotype files into a VCF 4.0 stream, using a UCSC SNP table
# to look up each SNP's chromosome, position and reference allele.
#
# Arguments come from @ARGV: the first is the UCSC SNP file (decompressed with
# "gzip -dc" when its name ends in ".gz", read from STDIN when it is "-"); the
# remaining ones are the HapMap files, read through <>.
# Progress messages go to STDERR; the VCF goes to the selected output handle.
#
# HapMap records are dropped when the rs ID is not a known SNP, the alleles
# field is not three characters long, neither allele equals the reference, or
# any sample carries an allele other than the two listed.
#
# Dies on missing arguments or when the UCSC SNP file cannot be opened.
sub hapmap2vcf {
    die("Usage: vcfutils.pl hapmap2vcf <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0);
    my $fn = shift(@ARGV);

    # parse UCSC SNP
    warn("Parsing UCSC SNPs...\n");
    my ($fh, %map);
    if ($fn eq '-') {
        # Keep the "-" == STDIN convention of the former two-argument open.
        $fh = \*STDIN;
    } elsif ($fn =~ /\.gz$/) {
        # List-form pipe open: the file name never passes through a shell.
        open($fh, '-|', 'gzip', '-dc', $fn)
            or die("[hapmap2vcf] failed to run 'gzip -dc' on '$fn': $!\n");
    } else {
        open($fh, '<', $fn) or die("[hapmap2vcf] failed to open '$fn': $!\n");
    }
    while (my $line = <$fh>) {
        my @t = split(' ', $line);
        next if (@t < 8);                 # blank or truncated line
        next if ($t[3] - $t[2] != 1);     # not SNP
        $map{$t[4]} = [@t[1,3,7]];        # rs ID -> [chrom, position, ref allele]
    }
    close($fh) unless ($fn eq '-');

    # write VCF
    warn("Writing VCF...\n");
    print "##fileformat=VCFv4.0\n";
    while (my $line = <>) {
        my @t = split(' ', $line);
        next unless (@t);
        if ($t[0] eq 'rs#') { # the first line
            # Sample names start at column 11 of the HapMap header.
            print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n";
            next;
        }
        my $snp = $map{$t[0]};
        next unless ($snp);
        next if (length($t[1]) != 3); # skip non-SNPs

        my $ref = $snp->[2];
        my @u = split('/', $t[1]);
        if ($u[1] eq $ref) {
            # Put the reference allele first.
            $u[1] = $u[0]; $u[0] = $ref;
        } elsif ($u[0] ne $ref) {
            next; # neither allele matches the reference
        }
        my $alt = $u[1];

        # Allele letter -> VCF allele index.
        my %w = ($u[0] => 0, $u[1] => 1);
        my @s = (@$snp[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT');
        my $is_tri = 0;
        for my $gt (@t[11..$#t]) {
            if ($gt eq 'NN') {
                push(@s, './.');
                next;
            }
            my @idx = ($w{substr($gt, 0, 1)}, $w{substr($gt, 1, 1)});
            if (!defined($idx[0]) || !defined($idx[1])) {
                # A third allele shows up in the genotypes: drop the site.
                $is_tri = 1;
                last;
            }
            push(@s, "$idx[0]/$idx[1]");
        }
        next if ($is_tri);
        print join("\t", @s), "\n";
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
461
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Build a consensus FASTQ (one record per chromosome) from an all-site VCF
# read through <> (files in @ARGV, or STDIN).
#
# Options, parsed from @ARGV with getopts:
#   -d INT  minimum depth          (default 3)
#   -D INT  maximum depth          (default 100000)
#   -Q INT  minimum RMS mapQ       (default 10)
#   -l INT  INDEL filtering window (default 5), passed on to v2q_post_process
#
# For every SNP/reference record one base and one quality character are
# appended:
#   * FQ < 0: the base is REF when AF1 < 0.5 (or ALT is '.'), ALT otherwise,
#     and the quality is -FQ;
#   * otherwise: the base is the ambiguity code for REF+ALT ('N' if none) and
#     the quality is FQ.
# The base is upper-cased only when MQ >= -Q and -d <= DP <= -D.  Positions
# missing from the input are filled with 'n' / '!'.  Records with a non-'.' ALT
# that are not plain SNPs are remembered as gaps for v2q_post_process.
#
# Dies with a usage message when no file is given and STDIN is a terminal, and
# with "[vcf2fq] unsorted input" when positions go backwards within a
# chromosome.
sub vcf2fq {
    my %opts = (d=>3, D=>100000, Q=>10, l=>5);
    getopts('d:D:Q:l:', \%opts);
    die(qq/
Usage:   vcfutils.pl vcf2fq [options] <all-site.vcf>

Options: -d INT    minimum depth          [$opts{d}]
         -D INT    maximum depth          [$opts{D}]
         -Q INT    min RMS mapQ           [$opts{Q}]
         -l INT    INDEL filtering window [$opts{l}]
\n/) if (@ARGV == 0 && -t STDIN);

    my ($min_mapq, $min_depth, $max_depth) = @opts{qw(Q d D)};

    # IUPAC ambiguity codes for heterozygous calls, keyed by REF.ALT.
    my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y',
               GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K');

    my ($last_chr, $last_pos) = ('', 0);
    my ($seq, $qual) = ('', '');
    my @gaps;
    while (my $line = <>) {
        next if ($line =~ /^#/);
        my @t = split(' ', $line);
        next if (@t < 8); # blank or truncated record: INFO (column 7) is required

        if ($last_chr ne $t[0]) {
            # New chromosome: flush the previous one.  Compare against '' so
            # that a chromosome literally named "0" is flushed as well.
            v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
            ($last_chr, $last_pos) = ($t[0], 0);
            $seq = $qual = '';
            @gaps = ();
        }
        die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0);

        # Pad positions that have no record at all.
        my $missing = $t[1] - $last_pos - 1;
        if ($missing > 0) {
            $seq .= 'n' x $missing;
            $qual .= '!' x $missing;
        }

        if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference
            my ($ref, $alt) = ($t[3], $1);
            # Missing tags default to 0 rather than leaving undefined values
            # (or, for AF1, whatever happened to be in $_).
            my $q = ($t[7] =~ /FQ=(-?[\d\.]+)/) ? $1 : 0;
            my $base;
            if ($q < 0) {
                my $af1 = ($t[7] =~ /AF1=([\d\.]+)/) ? $1 : 0;
                $base = ($af1 < .5 || $alt eq '.') ? $ref : $alt;
                $q = -$q;
            } else {
                $base = $het{"$ref$alt"} || 'N';
            }
            my $confident = ($t[7] =~ /MQ=(\d+)/ && $1 >= $min_mapq)
                && ($t[7] =~ /DP=(\d+)/ && $1 >= $min_depth && $1 <= $max_depth);
            $base = $confident ? uc($base) : lc($base);

            # Round to the nearest integer, shift by 33 and cap at '~' (126).
            $q = int($q + 33 + .499);
            $q = chr($q <= 126 ? $q : 126);
            $seq .= $base;
            $qual .= $q;
        } elsif ($t[4] ne '.') { # an INDEL
            push(@gaps, [$t[1], length($t[3])]);
        }
        $last_pos = $t[1];
    }
    # Flush the last chromosome; nothing is printed for input without records.
    v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr ne '');
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
522
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Mask the neighbourhood of every gap and print one FASTQ record.
#
# Arguments:
#   $chr  - record name, printed after '@'
#   $seq  - reference to the sequence string (modified in place)
#   $qual - reference to the quality string
#   $gaps - reference to an array of [position, length] pairs
#   $l    - number of extra bases to mask on each side of a gap
#
# For each gap the window from (position - $l, floored at 0) up to
# (position + length + $l, capped at the sequence length) is lower-cased.
# Windows that are empty or start at/after the end of the sequence are skipped:
# assigning to substr() outside the string is a fatal error in Perl.
sub v2q_post_process {
    my ($chr, $seq, $qual, $gaps, $l) = @_;
    my $seq_len = defined($$seq) ? length($$seq) : 0;
    for my $g (@$gaps) {
        my $beg = $g->[0] > $l ? $g->[0] - $l : 0;
        my $end = $g->[0] + $g->[1] + $l;
        $end = $seq_len if ($end > $seq_len);
        next if ($beg >= $end);
        substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg));
    }
    print "\@$chr\n"; v2q_print_str($seq);
    print "+\n"; v2q_print_str($qual);
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
534
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Print the string referenced by the single argument, wrapped into lines of
# at most 60 characters, each terminated by a newline.  Prints nothing for an
# empty string.
sub v2q_print_str {
    my ($str_ref) = @_;
    my $width  = 60;
    my $total  = length($$str_ref);
    my $offset = 0;
    while ($offset < $total) {
        print substr($$str_ref, $offset, $width), "\n";
        $offset += $width;
    }
}
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
542
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
# Report the top-level command summary and terminate: the text is raised with
# die, so it reaches STDERR and the program exits with a non-zero status.
sub usage {
    die(qq/
Usage:   vcfutils.pl <command> [<arguments>]\n
Command: subsam       get a subset of samples
         listsam      list the samples
         fillac       fill the allele count field
         qstats       SNP stats stratified by QUAL

         hapmap2vcf   convert the hapmap format to VCF
         ucscsnp2vcf  convert UCSC SNP SQL dump to VCF

         varFilter    filtering short variants (*)
         vcf2fq       VCF->fastq (**)

Notes: Commands with description ending with (*) may need bcftools
       specific annotations.
\n/);
}