# HG changeset patch # User devteam # Date 1406562929 14400 # Node ID 5fca46616675648630d1cb1c3a1dc55d052108f6 Imported from capsule None diff -r 000000000000 -r 5fca46616675 test-data/vcf2pgSnp_input.vcf --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vcf2pgSnp_input.vcf Mon Jul 28 11:55:29 2014 -0400 @@ -0,0 +1,50 @@ +##fileformat=VCFv4.1 +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG00096 HG00097 HG00099 HG00100 +1 10523 . TCCG T 152 PASS VT=INDEL;RSQ=0.5246;ERATE=0.0023;AN=2184;AA=.;THETA=0.0172;AC=5;AVGPOST=0.9954;LDAF=0.0045;AF=0.00;AMR_AF=0.00;AFR_AF=0.01 GT:DS:GL 0|0:0.000:0.00,-1.50,-34.60 0|0:0.000:0.00,-0.90,-20.70 0|0:0.000:0.00,-1.50,-34.50 0|0:0.000:0.00,-0.30,-6.90 +1 10583 rs58108140 G A 100 PASS VT=SNP;SNPSOURCE=LOWCOV;AN=2184;AC=327;LDAF=0.2363;AVGPOST=0.7630;ERATE=0.0098;AA=.;RSQ=0.4161;THETA=0.0197;SNPSOURCE=LOWCOV;AF=0.15;ASN_AF=0.15;AMR_AF=0.18;AFR_AF=0.04;EUR_AF=0.21 GT:DS:GL 0|0:0.250:-0.18,-0.47,-2.42 0|0:0.200:-0.24,-0.44,-1.16 0|0:0.250:-0.15,-0.54,-3.12 0|1:0.750:-0.48,-0.48,-0.48 +1 10611 rs189107123 C G 100 PASS VT=SNP;SNPSOURCE=LOWCOV;THETA=0.0048;RSQ=0.3814;AN=2184;AVGPOST=0.9427;ERATE=0.0024;LDAF=0.0420;AA=.;SNPSOURCE=LOWCOV;AC=39;AF=0.02;ASN_AF=0.02;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.02 GT:DS:GL 0|0:0.050:-0.48,-0.48,-0.48 0|0:0.350:-0.24,-0.44,-1.16 0|0:0.100:-0.22,-0.42,-1.80 0|0:0.100:-0.48,-0.48,-0.48 +1 10616 . CCGCCGTTGCAAAGGCGCGCCG C 2781 PASS VT=INDEL;RSQ=0.1048;AN=2184;LDAF=0.7052;AVGPOST=0.5674;ERATE=0.0146;AA=.;THETA=0.0195;AC=1972;AF=0.90;ASN_AF=0.88;AMR_AF=0.92;AFR_AF=0.89;EUR_AF=0.92 GT:DS:GL 1|1:1.450:0,0,0 1|1:1.750:0,0,0 1|1:1.200:0,0,0 0|1:1.150:0,0,0 +1 11540 . T TA 263 PASS VT=INDEL;AVGPOST=0.5915;LDAF=0.2676;AN=2184;RSQ=0.0831;AC=93;THETA=0.0011;AA=.;ERATE=0.0222;AF=0.04;ASN_AF=0.05;AMR_AF=0.04;AFR_AF=0.05;EUR_AF=0.04 GT:DS:GL 0|0:0.700:0,0,0 0|1:0.800:0,0,0 0|0:0.300:0,0,0 0|0:0.450:0,0,0 +1 13302 rs180734498 C T 100 PASS VT=SNP;SNPSOURCE=LOWCOV;THETA=0.0159;LDAF=0.1562;AN=2184;AC=250;AA=.;RSQ=0.6272;ERATE=0.0116;SNPSOURCE=LOWCOV;AVGPOST=0.8877;AF=0.11;ASN_AF=0.02;AMR_AF=0.09;AFR_AF=0.21;EUR_AF=0.13 GT:DS:GL 0|0:0.250:-0.13,-0.58,-3.62 0|1:0.950:-2.45,-0.00,-5.00 0|0:0.450:-0.29,-0.32,-5.00 0|0:0.300:-0.48,-0.48,-0.48 +1 13327 rs144762171 G C 100 PASS VT=SNP;SNPSOURCE=LOWCOV;RSQ=0.6776;THETA=0.0189;AN=2184;AC=57;AA=.;LDAF=0.0342;AVGPOST=0.9733;SNPSOURCE=LOWCOV;ERATE=0.0007;AF=0.03;ASN_AF=0.01;AMR_AF=0.03;AFR_AF=0.02;EUR_AF=0.04 GT:DS:GL 0|0:0.000:-0.03,-1.11,-5.00 0|1:1.000:-1.97,-0.01,-2.51 0|0:0.050:-0.01,-1.69,-5.00 0|0:0.100:-0.48,-0.48,-0.48 +1 13417 . C CGAGA 2788 PASS VT=INDEL;AVGPOST=0.7560;THETA=0.0061;AC=52;AN=2184;AA=.;RSQ=0.1970;LDAF=0.1542;ERATE=0.0127;AF=0.02;ASN_AF=0.02;AMR_AF=0.04;AFR_AF=0.01;EUR_AF=0.03 GT:DS:GL 0|0:0.050:0.00,-0.30,-5.60 0|0:0.250:0,0,0 0|0:0.400:0,0,0 0|0:0.350:0.00,-0.30,-5.60 +1 13957 . TC T 226 PASS VT=INDEL;RSQ=0.2600;AN=2184;AA=.;LDAF=0.0542;THETA=0.0144;AVGPOST=0.9144;ERATE=0.0029;AC=27;AF=0.01;ASN_AF=0.01;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.01 GT:DS:GL 0|0:0.150:0,0,0 0|0:0.050:0,0,0 0|0:0.100:0,0,0 0|0:0.100:0,0,0 +1 13980 rs151276478 T C 100 PASS VT=SNP;SNPSOURCE=LOWCOV;ERATE=0.0021;THETA=0.0096;RSQ=0.3995;AN=2184;AC=55;LDAF=0.0628;AVGPOST=0.9128;AA=.;SNPSOURCE=LOWCOV;AF=0.03;ASN_AF=0.02;AMR_AF=0.03;AFR_AF=0.01;EUR_AF=0.03 GT:DS:GL 0|0:0.100:-0.48,-0.48,-0.48 0|1:0.950:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48 0|0:0.050:-0.48,-0.48,-0.48 +1 16226 . AG A 104 PASS VT=INDEL;THETA=0.0211;AN=2184;RSQ=0.6096;LDAF=0.0034;AVGPOST=0.9971;AA=A;ERATE=0.0008;AC=4;AF=0.00;AMR_AF=0.00;EUR_AF=0.00 GT:DS:GL 0|0:0.000:0.00,-1.50,-24.00 0|0:0.050:0,0,0 0|0:0.000:0.00,-0.30,-4.80 0|0:0.000:0.00,-0.90,-14.40 +1 17496 . AC A 209 PASS VT=INDEL;AC=7;AN=2184;ERATE=0.0009;AA=A;RSQ=0.7057;THETA=0.0221;AVGPOST=0.9968;LDAF=0.0045;AF=0.00;AFR_AF=0.01 GT:DS:GL 0|0:0.000:0.00,-3.30,-42.90 0|0:0.000:0,0,0 0|0:0.000:0,0,0 0|0:0.000:0,0,0 +1 18869 . ACC A 126 PASS VT=INDEL;LDAF=0.2562;AC=52;AN=2184;ERATE=0.0162;AVGPOST=0.5962;AA=A;RSQ=0.0889;THETA=0.0107;AF=0.02;ASN_AF=0.04;AMR_AF=0.02;AFR_AF=0.02;EUR_AF=0.01 GT:DS:GL 0|0:0.400:0,0,0 0|0:0.350:0,0,0 0|0:0.250:0.00,-0.30,-2.90 0|0:0.550:0,0,0 +1 19083 . G GCA 139 PASS VT=INDEL;AVGPOST=0.7121;AN=2184;AC=13;AA=G;THETA=0.0077;ERATE=0.0123;RSQ=0.0826;LDAF=0.1645;AF=0.01;ASN_AF=0.01;AMR_AF=0.01;AFR_AF=0.01;EUR_AF=0.00 GT:DS:GL 0|0:0.200:0,0,0 0|0:0.400:0,0,0 0|0:0.250:0,0,0 0|0:0.350:0,0,0 +1 19190 . GC G 815 PASS VT=INDEL;AC=131;LDAF=0.2531;AN=2184;AA=G;THETA=0.0161;AVGPOST=0.6408;RSQ=0.1696;ERATE=0.0188;AF=0.06;ASN_AF=0.05;AMR_AF=0.10;AFR_AF=0.07;EUR_AF=0.04 GT:DS:GL 0|0:0.650:0,0,0 0|0:0.200:0,0,0 0|0:0.650:0,0,0 0|0:0.500:0,0,0 +1 20862 . A AG 78 PASS VT=INDEL;ERATE=0.0067;AVGPOST=0.8007;RSQ=0.0827;LDAF=0.8907;AN=2184;THETA=0.0095;AA=A;AC=2178;AF=1.00;ASN_AF=1.00;AMR_AF=1.00;AFR_AF=0.99;EUR_AF=1.00 GT:DS:GL 1|1:1.750:0,0,0 1|1:1.800:0,0,0 1|1:1.850:0,0,0 1|1:1.750:0,0,0 +1 30923 rs140337953 G T 100 PASS VT=SNP;SNPSOURCE=LOWCOV;RSQ=0.5363;LDAF=0.6540;THETA=0.0078;AN=2184;AA=T;ERATE=0.0127;AC=1581;SNPSOURCE=LOWCOV;AVGPOST=0.7309;AF=0.72;ASN_AF=0.88;AMR_AF=0.81;AFR_AF=0.48;EUR_AF=0.72 GT:DS:GL 1|1:1.950:-5.00,-0.61,-0.12 0|0:0.450:-0.10,-0.69,-2.81 0|0:0.450:-0.11,-0.64,-3.49 1|1:1.500:-0.48,-0.48,-0.48 +1 46402 . C CTGT 104 PASS VT=INDEL;RSQ=0.0943;AA=C;AN=2184;AVGPOST=0.8697;THETA=0.0108;AC=5;LDAF=0.0697;ERATE=0.0056;AF=0.00;ASN_AF=0.00;AMR_AF=0.00;AFR_AF=0.01 GT:DS:GL 0|0:0.050:0,0,0 0|0:0.100:0,0,0 0|0:0.050:0,0,0 0|0:0.050:0,0,0 +1 47190 . G GA 868 PASS VT=INDEL;LDAF=0.2298;AN=2184;AC=45;AA=G;RSQ=0.1035;AVGPOST=0.6322;THETA=0.0141;ERATE=0.0127;AF=0.02;ASN_AF=0.00;AMR_AF=0.03;AFR_AF=0.05;EUR_AF=0.01 GT:DS:GL 0|0:0.350:0,0,0 0|0:0.500:0,0,0 0|0:0.550:0,0,0 0|0:0.400:0,0,0 +1 49514 . AG A 449 PASS VT=INDEL;THETA=0.0164;AC=52;AN=2184;RSQ=0.4892;AA=A;LDAF=0.0469;AVGPOST=0.9448;ERATE=0.0053;AF=0.02;ASN_AF=0.02;AMR_AF=0.02;AFR_AF=0.01;EUR_AF=0.03 GT:DS:GL 0|0:0.000:0.00,-0.90,-12.40 0|1:0.800:0,0,0 0|0:0.150:-0.10,0.00,0.00 0|0:0.000:0.00,-0.30,-4.20 +1 49515 . G GA 44 PASS VT=INDEL;THETA=0.0214;AN=2184;AC=13;AA=A;ERATE=0.0008;AVGPOST=0.9879;LDAF=0.0111;RSQ=0.5361;AF=0.01;ASN_AF=0.02;EUR_AF=0.01 GT:DS:GL 0|0:0.050:0.00,-0.90,-9.70 0|0:0.050:0,0,0 0|0:0.000:0,0,0 0|0:0.000:0.00,-0.30,-3.20 +1 50481 . GGT G 2370 PASS VT=INDEL;RSQ=0.3702;THETA=0.0038;AN=2184;AA=G;ERATE=0.0177;AVGPOST=0.7981;AC=142;LDAF=0.1660;AF=0.07;ASN_AF=0.05;AMR_AF=0.13;AFR_AF=0.07;EUR_AF=0.04 GT:DS:GL 0|0:0.250:0,0,0 0|0:0.350:0,0,0 1|0:1.150:-3.00,-0.30,0.00 0|0:0.250:0,0,0 diff -r 000000000000 -r 5fca46616675 test-data/vcf2pgSnp_output.pgSnp --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/vcf2pgSnp_output.pgSnp Mon Jul 28 11:55:29 2014 -0400 @@ -0,0 +1,22 @@ +chr1 10522 10523 TCCG 1 8 0 +chr1 10582 10583 G/A 2 7,1 0,0 +chr1 10610 10611 C 1 8 0 +chr1 10615 10616 CCGCCGTTGCAAAGGCGCGCCG/C 2 1,7 0,0 +chr1 11539 11540 T/T 2 7,1 0,0 +chr1 13301 13302 C/T 2 7,1 0,0 +chr1 13326 13327 G/C 2 7,1 0,0 +chr1 13416 13417 C 1 8 0 +chr1 13956 13957 TC 1 8 0 +chr1 13979 13980 T/C 2 7,1 0,0 +chr1 16225 16226 AG 1 8 0 +chr1 17495 17496 AC 1 8 0 +chr1 18868 18869 ACC 1 8 0 +chr1 19082 19083 G 1 8 0 +chr1 19189 19190 GC 1 8 0 +chr1 20861 20862 A 1 8 0 +chr1 30922 30923 G/T 2 4,4 0,0 +chr1 46401 46402 C 1 8 0 +chr1 47189 47190 G 1 8 0 +chr1 49513 49514 AG/A 2 7,1 0,0 +chr1 49514 49515 G 1 8 0 +chr1 50480 50481 GGT/G 2 7,1 0,0 diff -r 000000000000 -r 5fca46616675 vcf2pgSnp.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2pgSnp.pl Mon Jul 28 11:55:29 2014 -0400 @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +use strict; + +#convert from a vcf file to a pgSnp file. +#frequency count = chromosome count +#either a single column/individual +#or all columns as a population + +my $in; +my $stCol = 9; +my $endCol; +if (@ARGV && scalar @ARGV == 2) { + $stCol = shift @ARGV; + $in = shift @ARGV; + if ($stCol eq 'all') { $stCol = 10; } + else { $endCol = $stCol; } + $stCol--; #go from 1 based to zero based column number + if ($stCol < 9) { + print "ERROR genotype fields don't start until column 10\n"; + exit; + } +}elsif (@ARGV && scalar @ARGV == 1) { + $in = shift @ARGV; +}elsif (@ARGV) { + print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n"; + exit; +} + +open(FH, $in) or die "Couldn't open $in, $!\n"; +while () { + chomp; + if (/^\s*#/) { next; } #skip comments/headers + if (/^\s*$/) { next; } #skip blank lines + my @f = split(/\t/); + #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... + my $a; + my %nt; + my %all; + my $cnt = 0; + my $var; + if ($f[3] eq 'N') { next; } #ignore ref=N + if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite + #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate + if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason + my $ind = 0; + if ($f[8] ne 'GT') { #more than just genotype + my @t = split(/:/, $f[8]); + foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } + if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } + } + #count 0's, 1's, 2's + if (!$endCol) { $endCol = $#f; } + foreach my $col ($stCol .. $endCol) { + if ($ind > 0) { + my @t = split(/:/, $f[$col]); + $f[$col] = $t[$ind] . ":"; #only keep genotype part + } + if ($f[$col] =~ /^(0|1|2).(0|1|2)/) { + $nt{$1}++; + $nt{$2}++; + }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single + $nt{$1}++; + } #else ignore + } + if (%nt) { + if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } + print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info + my $cnt = scalar(keys %nt); + my $fr; + my $sc; + my $all; + if (exists $nt{0}) { + $all = uc($f[3]); + $fr = $nt{0}; + $sc = 0; + } + if (!exists $nt{0} && exists $nt{1}) { + if ($f[4] =~ /([ACTG]),?/) { + $all = $1; + $fr = $nt{1}; + $sc = 0; + }else { die "bad variant nt $f[4] for nt 1"; } + }elsif (exists $nt{1}) { + if ($f[4] =~ /([ACTG]),?/) { + $all .= '/' . $1; + $fr .= ",$nt{1}"; + $sc .= ",0"; + }else { die "bad variant nt $f[4] for nt 1"; } + } + if (exists $nt{2}) { + if ($f[4] =~ /^[ACTG],([ACTG]),?/) { + $all .= '/' . $1; + $fr .= ",$nt{2}"; + $sc .= ",0"; + }else { die "bad variant nt $f[4] for nt 2"; } + } + if (exists $nt{3}) { + if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) { + $all .= '/' . $1; + $fr .= ",$nt{3}"; + $sc .= ",0"; + }else { die "bad variant nt $f[4] for nt 3"; } + } + if (exists $nt{4}) { + if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) { + $all .= '/' . $1; + $fr .= ",$nt{4}"; + $sc .= ",0"; + }else { die "bad variant nt $f[4] for nt 4"; } + } + print "$all\t$cnt\t$fr\t$sc\n"; + } +} +close FH; + +exit; diff -r 000000000000 -r 5fca46616675 vcf2pgSnp.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/vcf2pgSnp.xml Mon Jul 28 11:55:29 2014 -0400 @@ -0,0 +1,79 @@ +