annotate vcf2pgSnp.pl @ 3:edf12470a1a6 default tip

Bugfix from Belinda, in vcf2pgSnp.pl
author Cathy Riemer <cathy+hg@bx.psu.edu>
date Thu, 19 Mar 2015 12:06:34 -0400
parents 35c20b109be5
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
1 #!/usr/bin/perl -w
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
2 use strict;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
3
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
4 #convert from a vcf file to a pgSnp file.
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
5 #frequency count = chromosome count
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
6 #either a single column/individual
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
7 #or all columns as a population
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
8
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
9 my $in;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
10 my $stCol = 9;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
11 my $endCol;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
12 if (@ARGV && scalar @ARGV == 2) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
13 $stCol = shift @ARGV;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
14 $in = shift @ARGV;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
15 if ($stCol eq 'all') { $stCol = 10; }
3
edf12470a1a6 Bugfix from Belinda, in vcf2pgSnp.pl
Cathy Riemer <cathy+hg@bx.psu.edu>
parents: 2
diff changeset
16 else { $endCol = $stCol - 1; } #want 0 based column number
2
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
17 $stCol--; #go from 1 based to zero based column number
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
18 if ($stCol < 9) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
19 print "ERROR genotype fields don't start until column 10\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
20 exit;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
21 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
22 }elsif (@ARGV && scalar @ARGV == 1) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
23 $in = shift @ARGV;
3
edf12470a1a6 Bugfix from Belinda, in vcf2pgSnp.pl
Cathy Riemer <cathy+hg@bx.psu.edu>
parents: 2
diff changeset
24 }else {
2
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
25 print "usage: vcf2pgSnp.pl [indColNum default=all] file.vcf > file.pgSnp\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
26 exit;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
27 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
28
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
29 open(FH, $in) or die "Couldn't open $in, $!\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
30 while (<FH>) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
31 chomp;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
32 if (/^\s*#/) { next; } #skip comments/headers
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
33 if (/^\s*$/) { next; } #skip blank lines
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
34 my @f = split(/\t/);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
35 #chr pos1base ID refNt altNt[,|D#|Int] quality filter info format geno1 ...
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
36 my $a;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
37 my %nt;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
38 my %all;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
39 my $cnt = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
40 my $var;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
41 if ($f[3] eq 'N') { next; } #ignore ref=N
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
42 if ($f[4] =~ /[DI]/ or $f[3] =~ /[DI]/) { next; } #don't do microsatellite
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
43 #if ($f[4] =~ /[ACTG],[ACTG]/) { next; } #only do positions with single alternate
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
44 if ($f[6] && !($f[6] eq '.' or $f[6] eq 'PASS')) { next; } #filtered for some reason
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
45 my $ind = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
46 if ($f[8] ne 'GT') { #more than just genotype
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
47 my @t = split(/:/, $f[8]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
48 foreach (@t) { if ($_ eq 'GT') { last; } $ind++; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
49 if ($ind == 0 && $f[8] !~ /^GT/) { die "ERROR couldn't find genotype in format $f[8]\n"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
50 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
51 #count 0's, 1's, 2's
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
52 if (!$endCol) { $endCol = $#f; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
53 foreach my $col ($stCol .. $endCol) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
54 if ($ind > 0) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
55 my @t = split(/:/, $f[$col]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
56 $f[$col] = $t[$ind] . ":"; #only keep genotype part
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
57 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
58 if ($f[$col] =~ /^(0|1|2).(0|1|2)/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
59 $nt{$1}++;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
60 $nt{$2}++;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
61 }elsif ($f[$col] =~ /^(0|1|2):/) { #chrY or male chrX, single
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
62 $nt{$1}++;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
63 } #else ignore
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
64 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
65 if (%nt) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
66 if ($f[0] !~ /chr/) { $f[0] = "chr$f[0]"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
67 print "$f[0]\t", ($f[1]-1), "\t$f[1]\t"; #position info
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
68 my $cnt = scalar(keys %nt);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
69 my $fr;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
70 my $sc;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
71 my $all;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
72 if (exists $nt{0}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
73 $all = uc($f[3]);
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
74 $fr = $nt{0};
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
75 $sc = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
76 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
77 if (!exists $nt{0} && exists $nt{1}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
78 if ($f[4] =~ /([ACTG]),?/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
79 $all = $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
80 $fr = $nt{1};
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
81 $sc = 0;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
82 }else { die "bad variant nt $f[4] for nt 1"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
83 }elsif (exists $nt{1}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
84 if ($f[4] =~ /([ACTG]),?/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
85 $all .= '/' . $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
86 $fr .= ",$nt{1}";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
87 $sc .= ",0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
88 }else { die "bad variant nt $f[4] for nt 1"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
89 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
90 if (exists $nt{2}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
91 if ($f[4] =~ /^[ACTG],([ACTG]),?/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
92 $all .= '/' . $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
93 $fr .= ",$nt{2}";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
94 $sc .= ",0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
95 }else { die "bad variant nt $f[4] for nt 2"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
96 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
97 if (exists $nt{3}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
98 if ($f[4] =~ /^[ACTG],[ACTG],([ACTG])/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
99 $all .= '/' . $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
100 $fr .= ",$nt{3}";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
101 $sc .= ",0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
102 }else { die "bad variant nt $f[4] for nt 3"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
103 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
104 if (exists $nt{4}) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
105 if ($f[4] =~ /^[ACTG],[ACTG],[ACTG],([ACTG])/) {
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
106 $all .= '/' . $1;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
107 $fr .= ",$nt{4}";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
108 $sc .= ",0";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
109 }else { die "bad variant nt $f[4] for nt 4"; }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
110 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
111 print "$all\t$cnt\t$fr\t$sc\n";
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
112 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
113 }
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
114 close FH;
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
115
35c20b109be5 Retrying upload with "bare" tarball (i.e. one without a top containing directory).
cathy
parents:
diff changeset
116 exit;