Mercurial > repos > lsong10 > psiclass
comparison PsiCLASS-1.0.2/samtools-0.1.19/bcftools/vcfutils.pl @ 0:903fc43d6227 draft default tip
Uploaded
| author | lsong10 |
|---|---|
| date | Fri, 26 Mar 2021 16:52:45 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:903fc43d6227 |
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 | |
| 3 # Author: lh3 | |
| 4 | |
| 5 use strict; | |
| 6 use warnings; | |
| 7 use Getopt::Std; | |
| 8 | |
| 9 &main; | |
| 10 exit; | |
| 11 | |
| 12 sub main { | |
| 13 &usage if (@ARGV < 1); | |
| 14 my $command = shift(@ARGV); | |
| 15 my %func = (subsam=>\&subsam, listsam=>\&listsam, fillac=>\&fillac, qstats=>\&qstats, varFilter=>\&varFilter, | |
| 16 hapmap2vcf=>\&hapmap2vcf, ucscsnp2vcf=>\&ucscsnp2vcf, filter4vcf=>\&varFilter, ldstats=>\&ldstats, | |
| 17 gapstats=>\&gapstats, splitchr=>\&splitchr, vcf2fq=>\&vcf2fq); | |
| 18 die("Unknown command \"$command\".\n") if (!defined($func{$command})); | |
| 19 &{$func{$command}}; | |
| 20 } | |
| 21 | |
| 22 sub splitchr { | |
| 23 my %opts = (l=>5000000); | |
| 24 getopts('l:', \%opts); | |
| 25 my $l = $opts{l}; | |
| 26 die(qq/Usage: vcfutils.pl splitchr [-l $opts{l}] <in.fa.fai>\n/) if (@ARGV == 0 && -t STDIN); | |
| 27 while (<>) { | |
| 28 my @t = split; | |
| 29 my $last = 0; | |
| 30 for (my $i = 0; $i < $t[1];) { | |
| 31 my $e = ($t[1] - $i) / $l < 1.1? $t[1] : $i + $l; | |
| 32 print "$t[0]:".($i+1)."-$e\n"; | |
| 33 $i = $e; | |
| 34 } | |
| 35 } | |
| 36 } | |
| 37 | |
| 38 sub subsam { | |
| 39 die(qq/Usage: vcfutils.pl subsam <in.vcf> [samples]\n/) if (@ARGV == 0); | |
| 40 my ($fh, %h); | |
| 41 my $fn = shift(@ARGV); | |
| 42 my @col; | |
| 43 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; | |
| 44 $h{$_} = 1 for (@ARGV); | |
| 45 while (<$fh>) { | |
| 46 if (/^##/) { | |
| 47 print; | |
| 48 } elsif (/^#/) { | |
| 49 my @t = split; | |
| 50 my @s = @t[0..8]; # all fixed fields + FORMAT | |
| 51 for (9 .. $#t) { | |
| 52 if ($h{$t[$_]}) { | |
| 53 push(@s, $t[$_]); | |
| 54 push(@col, $_); | |
| 55 } | |
| 56 } | |
| 57 pop(@s) if (@s == 9); # no sample selected; remove the FORMAT field | |
| 58 print join("\t", @s), "\n"; | |
| 59 } else { | |
| 60 my @t = split; | |
| 61 if (@col == 0) { | |
| 62 print join("\t", @t[0..7]), "\n"; | |
| 63 } else { | |
| 64 print join("\t", @t[0..8], map {$t[$_]} @col), "\n"; | |
| 65 } | |
| 66 } | |
| 67 } | |
| 68 close($fh); | |
| 69 } | |
| 70 | |
| 71 sub listsam { | |
| 72 die(qq/Usage: vcfutils.pl listsam <in.vcf>\n/) if (@ARGV == 0 && -t STDIN); | |
| 73 while (<>) { | |
| 74 if (/^#/ && !/^##/) { | |
| 75 my @t = split; | |
| 76 print join("\n", @t[9..$#t]), "\n"; | |
| 77 exit; | |
| 78 } | |
| 79 } | |
| 80 } | |
| 81 | |
| 82 sub fillac { | |
| 83 die(qq/Usage: vcfutils.pl fillac <in.vcf>\n\nNote: The GT field MUST BE present and always appear as the first field.\n/) if (@ARGV == 0 && -t STDIN); | |
| 84 while (<>) { | |
| 85 if (/^#/) { | |
| 86 print; | |
| 87 } else { | |
| 88 my @t = split; | |
| 89 my @c = (0, 0); | |
| 90 my $n = 0; | |
| 91 my $s = -1; | |
| 92 @_ = split(":", $t[8]); | |
| 93 for (0 .. $#_) { | |
| 94 if ($_[$_] eq 'GT') { $s = $_; last; } | |
| 95 } | |
| 96 if ($s < 0) { | |
| 97 print join("\t", @t), "\n"; | |
| 98 next; | |
| 99 } | |
| 100 for (9 .. $#t) { | |
| 101 if ($t[$_] =~ /^0,0,0/) { | |
| 102 } elsif ($t[$_] =~ /^([^\s:]+:){$s}(\d+).(\d+)/) { | |
| 103 ++$c[$2]; ++$c[$3]; | |
| 104 $n += 2; | |
| 105 } | |
| 106 } | |
| 107 my $AC = "AC=" . join("\t", @c[1..$#c]) . ";AN=$n"; | |
| 108 my $info = $t[7]; | |
| 109 $info =~ s/(;?)AC=(\d+)//; | |
| 110 $info =~ s/(;?)AN=(\d+)//; | |
| 111 if ($info eq '.') { | |
| 112 $info = $AC; | |
| 113 } else { | |
| 114 $info .= ";$AC"; | |
| 115 } | |
| 116 $t[7] = $info; | |
| 117 print join("\t", @t), "\n"; | |
| 118 } | |
| 119 } | |
| 120 } | |
| 121 | |
| 122 sub ldstats { | |
| 123 my %opts = (t=>0.9); | |
| 124 getopts('t:', \%opts); | |
| 125 die("Usage: vcfutils.pl ldstats [-t $opts{t}] <in.vcf>\n") if (@ARGV == 0 && -t STDIN); | |
| 126 my $cutoff = $opts{t}; | |
| 127 my ($last, $lastchr) = (0x7fffffff, ''); | |
| 128 my ($x, $y, $n) = (0, 0, 0); | |
| 129 while (<>) { | |
| 130 if (/^([^#\s]+)\s(\d+)/) { | |
| 131 my ($chr, $pos) = ($1, $2); | |
| 132 if (/NEIR=([\d\.]+)/) { | |
| 133 ++$n; | |
| 134 ++$y, $x += $pos - $last if ($lastchr eq $chr && $pos > $last && $1 > $cutoff); | |
| 135 } | |
| 136 $last = $pos; $lastchr = $chr; | |
| 137 } | |
| 138 } | |
| 139 print "Number of SNP intervals in strong LD (r > $opts{t}): $y\n"; | |
| 140 print "Fraction: ", $y/$n, "\n"; | |
| 141 print "Length: $x\n"; | |
| 142 } | |
| 143 | |
| 144 sub qstats { | |
| 145 my %opts = (r=>'', s=>0.02, v=>undef); | |
| 146 getopts('r:s:v', \%opts); | |
| 147 die("Usage: vcfutils.pl qstats [-r ref.vcf] <in.vcf>\n | |
| 148 Note: This command discards indels. Output: QUAL #non-indel #SNPs #transitions #joint ts/tv #joint/#ref #joint/#non-indel \n") if (@ARGV == 0 && -t STDIN); | |
| 149 my %ts = (AG=>1, GA=>1, CT=>1, TC=>1); | |
| 150 my %h = (); | |
| 151 my $is_vcf = defined($opts{v})? 1 : 0; | |
| 152 if ($opts{r}) { # read the reference positions | |
| 153 my $fh; | |
| 154 open($fh, $opts{r}) || die; | |
| 155 while (<$fh>) { | |
| 156 next if (/^#/); | |
| 157 if ($is_vcf) { | |
| 158 my @t = split; | |
| 159 $h{$t[0],$t[1]} = $t[4]; | |
| 160 } else { | |
| 161 $h{$1,$2} = 1 if (/^(\S+)\s+(\d+)/); | |
| 162 } | |
| 163 } | |
| 164 close($fh); | |
| 165 } | |
| 166 my $hsize = scalar(keys %h); | |
| 167 my @a; | |
| 168 while (<>) { | |
| 169 next if (/^#/); | |
| 170 my @t = split; | |
| 171 next if (length($t[3]) != 1 || uc($t[3]) eq 'N'); | |
| 172 $t[3] = uc($t[3]); $t[4] = uc($t[4]); | |
| 173 my @s = split(',', $t[4]); | |
| 174 $t[5] = 3 if ($t[5] eq '.' || $t[5] < 0); | |
| 175 next if (length($s[0]) != 1); | |
| 176 my $hit; | |
| 177 if ($is_vcf) { | |
| 178 $hit = 0; | |
| 179 my $aa = $h{$t[0],$t[1]}; | |
| 180 if (defined($aa)) { | |
| 181 my @aaa = split(",", $aa); | |
| 182 for (@aaa) { | |
| 183 $hit = 1 if ($_ eq $s[0]); | |
| 184 } | |
| 185 } | |
| 186 } else { | |
| 187 $hit = defined($h{$t[0],$t[1]})? 1 : 0; | |
| 188 } | |
| 189 push(@a, [$t[5], ($t[4] eq '.' || $t[4] eq $t[3])? 0 : 1, $ts{$t[3].$s[0]}? 1 : 0, $hit]); | |
| 190 } | |
| 191 push(@a, [-1, 0, 0, 0]); # end marker | |
| 192 die("[qstats] No SNP data!\n") if (@a == 0); | |
| 193 @a = sort {$b->[0]<=>$a->[0]} @a; | |
| 194 my $next = $opts{s}; | |
| 195 my $last = $a[0]; | |
| 196 my @c = (0, 0, 0, 0); | |
| 197 my @lc; | |
| 198 $lc[1] = $lc[2] = 0; | |
| 199 for my $p (@a) { | |
| 200 if ($p->[0] == -1 || ($p->[0] != $last && $c[0]/@a > $next)) { | |
| 201 my @x; | |
| 202 $x[0] = sprintf("%.4f", $c[1]-$c[2]? $c[2] / ($c[1] - $c[2]) : 100); | |
| 203 $x[1] = sprintf("%.4f", $hsize? $c[3] / $hsize : 0); | |
| 204 $x[2] = sprintf("%.4f", $c[3] / $c[1]); | |
| 205 my $a = $c[1] - $lc[1]; | |
| 206 my $b = $c[2] - $lc[2]; | |
| 207 $x[3] = sprintf("%.4f", $a-$b? $b / ($a-$b) : 100); | |
| 208 print join("\t", $last, @c, @x), "\n"; | |
| 209 $next = $c[0]/@a + $opts{s}; | |
| 210 $lc[1] = $c[1]; $lc[2] = $c[2]; | |
| 211 } | |
| 212 ++$c[0]; $c[1] += $p->[1]; $c[2] += $p->[2]; $c[3] += $p->[3]; | |
| 213 $last = $p->[0]; | |
| 214 } | |
| 215 } | |
| 216 | |
| 217 sub varFilter { | |
| 218 my %opts = (d=>2, D=>10000000, a=>2, W=>10, Q=>10, w=>3, p=>undef, 1=>1e-4, 2=>1e-100, 3=>0, 4=>1e-4, G=>0, S=>1000, e=>1e-4); | |
| 219 getopts('pd:D:W:Q:w:a:1:2:3:4:G:S:e:', \%opts); | |
| 220 die(qq/ | |
| 221 Usage: vcfutils.pl varFilter [options] <in.vcf> | |
| 222 | |
| 223 Options: -Q INT minimum RMS mapping quality for SNPs [$opts{Q}] | |
| 224 -d INT minimum read depth [$opts{d}] | |
| 225 -D INT maximum read depth [$opts{D}] | |
| 226 -a INT minimum number of alternate bases [$opts{a}] | |
| 227 -w INT SNP within INT bp around a gap to be filtered [$opts{w}] | |
| 228 -W INT window size for filtering adjacent gaps [$opts{W}] | |
| 229 -1 FLOAT min P-value for strand bias (given PV4) [$opts{1}] | |
| 230 -2 FLOAT min P-value for baseQ bias [$opts{2}] | |
| 231 -3 FLOAT min P-value for mapQ bias [$opts{3}] | |
| 232 -4 FLOAT min P-value for end distance bias [$opts{4}] | |
| 233 -e FLOAT min P-value for HWE (plus F<0) [$opts{e}] | |
| 234 -p print filtered variants | |
| 235 | |
| 236 Note: Some of the filters rely on annotations generated by SAMtools\/BCFtools. | |
| 237 \n/) if (@ARGV == 0 && -t STDIN); | |
| 238 | |
| 239 # calculate the window size | |
| 240 my ($ol, $ow) = ($opts{W}, $opts{w}); | |
| 241 my $max_dist = $ol > $ow? $ol : $ow; | |
| 242 # the core loop | |
| 243 my @staging; # (indel_filtering_score, flt_tag, indel_span; chr, pos, ...) | |
| 244 while (<>) { | |
| 245 my @t = split; | |
| 246 if (/^#/) { | |
| 247 print; next; | |
| 248 } | |
| 249 next if ($t[4] eq '.'); # skip non-var sites | |
| 250 next if ($t[3] eq 'N'); # skip sites with unknown ref ('N') | |
| 251 # check if the site is a SNP | |
| 252 my $type = 1; # SNP | |
| 253 if (length($t[3]) > 1) { | |
| 254 $type = 2; # MNP | |
| 255 my @s = split(',', $t[4]); | |
| 256 for (@s) { | |
| 257 $type = 3 if (length != length($t[3])); | |
| 258 } | |
| 259 } else { | |
| 260 my @s = split(',', $t[4]); | |
| 261 for (@s) { | |
| 262 $type = 3 if (length > 1); | |
| 263 } | |
| 264 } | |
| 265 # clear the out-of-range elements | |
| 266 while (@staging) { | |
| 267 # Still on the same chromosome and the first element's window still affects this position? | |
| 268 last if ($staging[0][3] eq $t[0] && $staging[0][4] + $staging[0][2] + $max_dist >= $t[1]); | |
| 269 varFilter_aux(shift(@staging), $opts{p}); # calling a function is a bit slower, not much | |
| 270 } | |
| 271 my $flt = 0; | |
| 272 # parse annotations | |
| 273 my ($dp, $mq, $dp_alt) = (-1, -1, -1); | |
| 274 if ($t[7] =~ /DP4=(\d+),(\d+),(\d+),(\d+)/i) { | |
| 275 $dp = $1 + $2 + $3 + $4; | |
| 276 $dp_alt = $3 + $4; | |
| 277 } | |
| 278 if ($t[7] =~ /DP=(\d+)/i) { | |
| 279 $dp = $1; | |
| 280 } | |
| 281 $mq = $1 if ($t[7] =~ /MQ=(\d+)/i); | |
| 282 # the depth and mapQ filter | |
| 283 if ($dp >= 0) { | |
| 284 if ($dp < $opts{d}) { | |
| 285 $flt = 2; | |
| 286 } elsif ($dp > $opts{D}) { | |
| 287 $flt = 3; | |
| 288 } | |
| 289 } | |
| 290 $flt = 4 if ($dp_alt >= 0 && $dp_alt < $opts{a}); | |
| 291 $flt = 1 if ($flt == 0 && $mq >= 0 && $mq < $opts{Q}); | |
| 292 $flt = 7 if ($flt == 0 && /PV4=([^,]+),([^,]+),([^,]+),([^,;\t]+)/ | |
| 293 && ($1<$opts{1} || $2<$opts{2} || $3<$opts{3} || $4<$opts{4})); | |
| 294 $flt = 8 if ($flt == 0 && ((/MXGQ=(\d+)/ && $1 < $opts{G}) || (/MXSP=(\d+)/ && $1 >= $opts{S}))); | |
| 295 # HWE filter | |
| 296 if ($t[7] =~ /G3=([^;,]+),([^;,]+),([^;,]+).*HWE=([^;,]+)/ && $4 < $opts{e}) { | |
| 297 my $p = 2*$1 + $2; | |
| 298 my $f = ($p > 0 && $p < 1)? 1 - $2 / ($p * (1-$p)) : 0; | |
| 299 $flt = 9 if ($f < 0); | |
| 300 } | |
| 301 | |
| 302 my $score = $t[5] * 100 + $dp_alt; | |
| 303 my $rlen = length($t[3]) - 1; # $indel_score<0 for SNPs | |
| 304 if ($flt == 0) { | |
| 305 if ($type == 3) { # an indel | |
| 306 # filtering SNPs and MNPs | |
| 307 for my $x (@staging) { | |
| 308 next if (($x->[0]&3) == 3 || $x->[1] || $x->[4] + $x->[2] + $ow < $t[1]); | |
| 309 $x->[1] = 5; | |
| 310 } | |
| 311 # check the staging list for indel filtering | |
| 312 for my $x (@staging) { | |
| 313 next if (($x->[0]&3) != 3 || $x->[1] || $x->[4] + $x->[2] + $ol < $t[1]); | |
| 314 if ($x->[0]>>2 < $score) { | |
| 315 $x->[1] = 6; | |
| 316 } else { | |
| 317 $flt = 6; last; | |
| 318 } | |
| 319 } | |
| 320 } else { # SNP or MNP | |
| 321 for my $x (@staging) { | |
| 322 next if (($x->[0]&3) != 3 || $x->[4] + $x->[2] + $ow < $t[1]); | |
| 323 if ($x->[4] + length($x->[7]) - 1 == $t[1] && substr($x->[7], -1, 1) eq substr($t[4], 0, 1) | |
| 324 && length($x->[7]) - length($x->[6]) == 1) { | |
| 325 $x->[1] = 5; | |
| 326 } else { $flt = 5; } | |
| 327 last; | |
| 328 } | |
| 329 # check MNP | |
| 330 for my $x (@staging) { | |
| 331 next if (($x->[0]&3) == 3 || $x->[4] + $x->[2] < $t[1]); | |
| 332 if ($x->[0]>>2 < $score) { | |
| 333 $x->[1] = 8; | |
| 334 } else { | |
| 335 $flt = 8; last; | |
| 336 } | |
| 337 } | |
| 338 } | |
| 339 } | |
| 340 push(@staging, [$score<<2|$type, $flt, $rlen, @t]); | |
| 341 } | |
| 342 # output the last few elements in the staging list | |
| 343 while (@staging) { | |
| 344 varFilter_aux(shift @staging, $opts{p}); | |
| 345 } | |
| 346 } | |
| 347 | |
| 348 sub varFilter_aux { | |
| 349 my ($first, $is_print) = @_; | |
| 350 if ($first->[1] == 0) { | |
| 351 print join("\t", @$first[3 .. @$first-1]), "\n"; | |
| 352 } elsif ($is_print) { | |
| 353 print STDERR join("\t", substr("UQdDaGgPMS", $first->[1], 1), @$first[3 .. @$first-1]), "\n"; | |
| 354 } | |
| 355 } | |
| 356 | |
| 357 sub gapstats { | |
| 358 my (@c0, @c1); | |
| 359 $c0[$_] = $c1[$_] = 0 for (0 .. 10000); | |
| 360 while (<>) { | |
| 361 next if (/^#/); | |
| 362 my @t = split; | |
| 363 next if (length($t[3]) == 1 && $t[4] =~ /^[A-Za-z](,[A-Za-z])*$/); # not an indel | |
| 364 my @s = split(',', $t[4]); | |
| 365 for my $x (@s) { | |
| 366 my $l = length($x) - length($t[3]) + 5000; | |
| 367 if ($x =~ /^-/) { | |
| 368 $l = -(length($x) - 1) + 5000; | |
| 369 } elsif ($x =~ /^\+/) { | |
| 370 $l = length($x) - 1 + 5000; | |
| 371 } | |
| 372 $c0[$l] += 1 / @s; | |
| 373 } | |
| 374 } | |
| 375 for (my $i = 0; $i < 10000; ++$i) { | |
| 376 next if ($c0[$i] == 0); | |
| 377 $c1[0] += $c0[$i]; | |
| 378 $c1[1] += $c0[$i] if (($i-5000)%3 == 0); | |
| 379 printf("C\t%d\t%.2f\n", ($i-5000), $c0[$i]); | |
| 380 } | |
| 381 printf("3\t%d\t%d\t%.3f\n", $c1[0], $c1[1], $c1[1]/$c1[0]); | |
| 382 } | |
| 383 | |
| 384 sub ucscsnp2vcf { | |
| 385 die("Usage: vcfutils.pl <in.ucsc.snp>\n") if (@ARGV == 0 && -t STDIN); | |
| 386 print "##fileformat=VCFv4.0\n"; | |
| 387 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"), "\n"; | |
| 388 while (<>) { | |
| 389 my @t = split("\t"); | |
| 390 my $indel = ($t[9] =~ /^[ACGT](\/[ACGT])+$/)? 0 : 1; | |
| 391 my $pos = $t[2] + 1; | |
| 392 my @alt; | |
| 393 push(@alt, $t[7]); | |
| 394 if ($t[6] eq '-') { | |
| 395 $t[9] = reverse($t[9]); | |
| 396 $t[9] =~ tr/ACGTRYMKWSNacgtrymkwsn/TGCAYRKMWSNtgcayrkmwsn/; | |
| 397 } | |
| 398 my @a = split("/", $t[9]); | |
| 399 for (@a) { | |
| 400 push(@alt, $_) if ($_ ne $alt[0]); | |
| 401 } | |
| 402 if ($indel) { | |
| 403 --$pos; | |
| 404 for (0 .. $#alt) { | |
| 405 $alt[$_] =~ tr/-//d; | |
| 406 $alt[$_] = "N$alt[$_]"; | |
| 407 } | |
| 408 } | |
| 409 my $ref = shift(@alt); | |
| 410 my $af = $t[13] > 0? ";AF=$t[13]" : ''; | |
| 411 my $valid = ($t[12] eq 'unknown')? '' : ";valid=$t[12]"; | |
| 412 my $info = "molType=$t[10];class=$t[11]$valid$af"; | |
| 413 print join("\t", $t[1], $pos, $t[4], $ref, join(",", @alt), 0, '.', $info), "\n"; | |
| 414 } | |
| 415 } | |
| 416 | |
| 417 sub hapmap2vcf { | |
| 418 die("Usage: vcfutils.pl <in.ucsc.snp> <in.hapmap>\n") if (@ARGV == 0); | |
| 419 my $fn = shift(@ARGV); | |
| 420 # parse UCSC SNP | |
| 421 warn("Parsing UCSC SNPs...\n"); | |
| 422 my ($fh, %map); | |
| 423 open($fh, ($fn =~ /\.gz$/)? "gzip -dc $fn |" : $fn) || die; | |
| 424 while (<$fh>) { | |
| 425 my @t = split; | |
| 426 next if ($t[3] - $t[2] != 1); # not SNP | |
| 427 @{$map{$t[4]}} = @t[1,3,7]; | |
| 428 } | |
| 429 close($fh); | |
| 430 # write VCF | |
| 431 warn("Writing VCF...\n"); | |
| 432 print "##fileformat=VCFv4.0\n"; | |
| 433 while (<>) { | |
| 434 my @t = split; | |
| 435 if ($t[0] eq 'rs#') { # the first line | |
| 436 print join("\t", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT", @t[11..$#t]), "\n"; | |
| 437 } else { | |
| 438 next unless ($map{$t[0]}); | |
| 439 next if (length($t[1]) != 3); # skip non-SNPs | |
| 440 my $a = \@{$map{$t[0]}}; | |
| 441 my $ref = $a->[2]; | |
| 442 my @u = split('/', $t[1]); | |
| 443 if ($u[1] eq $ref) { | |
| 444 $u[1] = $u[0]; $u[0] = $ref; | |
| 445 } elsif ($u[0] ne $ref) { next; } | |
| 446 my $alt = $u[1]; | |
| 447 my %w; | |
| 448 $w{$u[0]} = 0; $w{$u[1]} = 1; | |
| 449 my @s = (@$a[0,1], $t[0], $ref, $alt, 0, '.', '.', 'GT'); | |
| 450 my $is_tri = 0; | |
| 451 for (@t[11..$#t]) { | |
| 452 if ($_ eq 'NN') { | |
| 453 push(@s, './.'); | |
| 454 } else { | |
| 455 my @a = ($w{substr($_,0,1)}, $w{substr($_,1,1)}); | |
| 456 if (!defined($a[0]) || !defined($a[1])) { | |
| 457 $is_tri = 1; | |
| 458 last; | |
| 459 } | |
| 460 push(@s, "$a[0]/$a[1]"); | |
| 461 } | |
| 462 } | |
| 463 next if ($is_tri); | |
| 464 print join("\t", @s), "\n"; | |
| 465 } | |
| 466 } | |
| 467 } | |
| 468 | |
| 469 sub vcf2fq { | |
| 470 my %opts = (d=>3, D=>100000, Q=>10, l=>5); | |
| 471 getopts('d:D:Q:l:', \%opts); | |
| 472 die(qq/ | |
| 473 Usage: vcfutils.pl vcf2fq [options] <all-site.vcf> | |
| 474 | |
| 475 Options: -d INT minimum depth [$opts{d}] | |
| 476 -D INT maximum depth [$opts{D}] | |
| 477 -Q INT min RMS mapQ [$opts{Q}] | |
| 478 -l INT INDEL filtering window [$opts{l}] | |
| 479 \n/) if (@ARGV == 0 && -t STDIN); | |
| 480 | |
| 481 my ($last_chr, $seq, $qual, $last_pos, @gaps); | |
| 482 my $_Q = $opts{Q}; | |
| 483 my $_d = $opts{d}; | |
| 484 my $_D = $opts{D}; | |
| 485 | |
| 486 my %het = (AC=>'M', AG=>'R', AT=>'W', CA=>'M', CG=>'S', CT=>'Y', | |
| 487 GA=>'R', GC=>'S', GT=>'K', TA=>'W', TC=>'Y', TG=>'K'); | |
| 488 | |
| 489 $last_chr = ''; | |
| 490 while (<>) { | |
| 491 next if (/^#/); | |
| 492 my @t = split; | |
| 493 if ($last_chr ne $t[0]) { | |
| 494 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}) if ($last_chr); | |
| 495 ($last_chr, $last_pos) = ($t[0], 0); | |
| 496 $seq = $qual = ''; | |
| 497 @gaps = (); | |
| 498 } | |
| 499 die("[vcf2fq] unsorted input\n") if ($t[1] - $last_pos < 0); | |
| 500 if ($t[1] - $last_pos > 1) { | |
| 501 $seq .= 'n' x ($t[1] - $last_pos - 1); | |
| 502 $qual .= '!' x ($t[1] - $last_pos - 1); | |
| 503 } | |
| 504 if (length($t[3]) == 1 && $t[7] !~ /INDEL/ && $t[4] =~ /^([A-Za-z.])(,[A-Za-z])*$/) { # a SNP or reference | |
| 505 my ($ref, $alt) = ($t[3], $1); | |
| 506 my ($b, $q); | |
| 507 $q = $1 if ($t[7] =~ /FQ=(-?[\d\.]+)/); | |
| 508 if ($q < 0) { | |
| 509 $_ = ($t[7] =~ /AF1=([\d\.]+)/)? $1 : 0; | |
| 510 $b = ($_ < .5 || $alt eq '.')? $ref : $alt; | |
| 511 $q = -$q; | |
| 512 } else { | |
| 513 $b = $het{"$ref$alt"}; | |
| 514 $b ||= 'N'; | |
| 515 } | |
| 516 $b = lc($b); | |
| 517 $b = uc($b) if (($t[7] =~ /MQ=(\d+)/ && $1 >= $_Q) && ($t[7] =~ /DP=(\d+)/ && $1 >= $_d && $1 <= $_D)); | |
| 518 $q = int($q + 33 + .499); | |
| 519 $q = chr($q <= 126? $q : 126); | |
| 520 $seq .= $b; | |
| 521 $qual .= $q; | |
| 522 } elsif ($t[4] ne '.') { # an INDEL | |
| 523 push(@gaps, [$t[1], length($t[3])]); | |
| 524 } | |
| 525 $last_pos = $t[1]; | |
| 526 } | |
| 527 &v2q_post_process($last_chr, \$seq, \$qual, \@gaps, $opts{l}); | |
| 528 } | |
| 529 | |
| 530 sub v2q_post_process { | |
| 531 my ($chr, $seq, $qual, $gaps, $l) = @_; | |
| 532 for my $g (@$gaps) { | |
| 533 my $beg = $g->[0] > $l? $g->[0] - $l : 0; | |
| 534 my $end = $g->[0] + $g->[1] + $l; | |
| 535 $end = length($$seq) if ($end > length($$seq)); | |
| 536 substr($$seq, $beg, $end - $beg) = lc(substr($$seq, $beg, $end - $beg)); | |
| 537 } | |
| 538 print "\@$chr\n"; &v2q_print_str($seq); | |
| 539 print "+\n"; &v2q_print_str($qual); | |
| 540 } | |
| 541 | |
| 542 sub v2q_print_str { | |
| 543 my ($s) = @_; | |
| 544 my $l = length($$s); | |
| 545 for (my $i = 0; $i < $l; $i += 60) { | |
| 546 print substr($$s, $i, 60), "\n"; | |
| 547 } | |
| 548 } | |
| 549 | |
| 550 sub usage { | |
| 551 die(qq/ | |
| 552 Usage: vcfutils.pl <command> [<arguments>]\n | |
| 553 Command: subsam get a subset of samples | |
| 554 listsam list the samples | |
| 555 fillac fill the allele count field | |
| 556 qstats SNP stats stratified by QUAL | |
| 557 | |
| 558 hapmap2vcf convert the hapmap format to VCF | |
| 559 ucscsnp2vcf convert UCSC SNP SQL dump to VCF | |
| 560 | |
| 561 varFilter filtering short variants (*) | |
| 562 vcf2fq VCF->fastq (**) | |
| 563 | |
| 564 Notes: Commands with description endting with (*) may need bcftools | |
| 565 specific annotations. | |
| 566 \n/); | |
| 567 } |
