Mercurial > repos > miller-lab > snp_analysis_conversion
annotate vcf2pgSnp.pl @ 3:edf12470a1a6 default tip
Bugfix from Belinda, in vcf2pgSnp.pl
author | Cathy Riemer <cathy+hg@bx.psu.edu> |
---|---|
date | Thu, 19 Mar 2015 12:06:34 -0400 |
parents | 35c20b109be5 |
children |
rev | line source |
---|---|
2
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
1 #!/usr/bin/perl -w |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
2 use strict; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
3 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
4 #convert from a vcf file to a pgSnp file. |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
5 #frequency count = chromosome count |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
6 #either a single column/individual |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
7 #or all columns as a population |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
8 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
9 my $in; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
10 my $stCol = 9; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
11 my $endCol; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
12 if (@ARGV && scalar @ARGV == 2) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
13 $stCol = shift @ARGV; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
14 $in = shift @ARGV; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
15 if ($stCol eq 'all') { $stCol = 10; } |
3
edf12470a1a6
Bugfix from Belinda, in vcf2pgSnp.pl
Cathy Riemer <cathy+hg@bx.psu.edu>
parents:
2
diff
changeset
|
16 else { $endCol = $stCol - 1; } #want 0 based column number |
2
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
17 $stCol--; #go from 1 based to zero based column number |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
18 if ($stCol < 9) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
19 print "ERROR genotype fields don't start until column 10\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
20 exit; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
21 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
22 }elsif (@ARGV && scalar @ARGV == 1) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
23 $in = shift @ARGV; |
3
edf12470a1a6
Bugfix from Belinda, in vcf2pgSnp.pl
Cathy Riemer <cathy+hg@bx.psu.edu>
parents:
2
diff
changeset
|
24 }else { |
2
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
25 print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
26 exit; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
27 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
28 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
29 open(FH, $in) or die "Couldn't open $in, $!\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
30 while (<FH>) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
31 chomp; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
32 if (/^\s*#/) { next; } #skip comments/headers |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
33 if (/^\s*$/) { next; } #skip blank lines |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
34 my @f = split(/\t/); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
35 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ... |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
36 my $a; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
37 my %nt; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
38 my %all; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
39 my $cnt = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
40 my $var; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
41 if ($f[3] eq 'N') { next; } #ignore ref=N |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
42 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
43 #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
44 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
45 my $ind = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
46 if ($f[8] ne 'GT') { #more than just genotype |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
47 my @t = split(/:/, $f[8]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
48 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
49 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
50 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
51 #count 0's, 1's, 2's |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
52 if (!$endCol) { $endCol = $#f; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
53 foreach my $col ($stCol .. $endCol) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
54 if ($ind > 0) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
55 my @t = split(/:/, $f[$col]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
56 $f[$col] = $t[$ind] . ":"; #only keep genotype part |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
57 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
58 if ($f[$col] =~ /^(0|1|2).(0|1|2)/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
59 $nt{$1}++; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
60 $nt{$2}++; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
61 }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
62 $nt{$1}++; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
63 } #else ignore |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
64 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
65 if (%nt) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
66 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
67 print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
68 my $cnt = scalar(keys %nt); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
69 my $fr; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
70 my $sc; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
71 my $all; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
72 if (exists $nt{0}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
73 $all = uc($f[3]); |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
74 $fr = $nt{0}; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
75 $sc = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
76 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
77 if (!exists $nt{0} && exists $nt{1}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
78 if ($f[4] =~ /([ACTG]),?/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
79 $all = $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
80 $fr = $nt{1}; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
81 $sc = 0; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
82 }else { die "bad variant nt $f[4] for nt 1"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
83 }elsif (exists $nt{1}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
84 if ($f[4] =~ /([ACTG]),?/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
85 $all .= '/' . $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
86 $fr .= ",$nt{1}"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
87 $sc .= ",0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
88 }else { die "bad variant nt $f[4] for nt 1"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
89 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
90 if (exists $nt{2}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
91 if ($f[4] =~ /^[ACTG],([ACTG]),?/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
92 $all .= '/' . $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
93 $fr .= ",$nt{2}"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
94 $sc .= ",0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
95 }else { die "bad variant nt $f[4] for nt 2"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
96 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
97 if (exists $nt{3}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
98 if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
99 $all .= '/' . $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
100 $fr .= ",$nt{3}"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
101 $sc .= ",0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
102 }else { die "bad variant nt $f[4] for nt 3"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
103 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
104 if (exists $nt{4}) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
105 if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) { |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
106 $all .= '/' . $1; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
107 $fr .= ",$nt{4}"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
108 $sc .= ",0"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
109 }else { die "bad variant nt $f[4] for nt 4"; } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
110 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
111 print "$all\t$cnt\t$fr\t$sc\n"; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
112 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
113 } |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
114 close FH; |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
115 |
35c20b109be5
Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff
changeset
|
116 exit; |