Mercurial > repos > devteam > pgsnp2gd_snp
comparison pgSnp2gd_snp.pl @ 0:d281062566f9 draft
Imported from capsule None
author | devteam |
---|---|
date | Mon, 28 Jul 2014 11:55:36 -0400 |
parents | |
children | 4cd611004dc4 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:d281062566f9 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 | |
4 #convert from pgSnp file to snp table (Webb format?) | |
5 | |
6 #snp table format: | |
7 #1. chr | |
8 #2. position (0 based) | |
9 #3. ref allele | |
10 #4. second allele | |
11 #5. overall quality | |
12 #foreach individual (6-9, 10-13, ...) | |
13 #a. count of allele in 3 | |
14 #b. count of allele in 4 | |
15 #c. genotype call (-1, or count of ref allele) | |
16 #d. quality of genotype call (quality of non-ref allele from masterVar) | |
17 | |
18 if (!@ARGV) { | |
19 print "usage: pgSnp2gd_snp.pl file.pgSnp[.gz|.bz2] [-tab=snpTable.txt -addColsOnly -build=hg19 -name=na -ref=#1based -chr=#1based ] > newSnpTable.txt\n"; | |
20 exit; | |
21 } | |
22 | |
23 my $in = shift @ARGV; | |
24 my $tab; | |
25 my $tabOnly; | |
26 my $build; | |
27 my $name; | |
28 my $ref; | |
29 my $binChr = 1; #position of chrom column, indicates if bin is added | |
30 foreach (@ARGV) { | |
31 if (/-tab=(.*)/) { $tab = $1; } | |
32 elsif (/-addColsOnly/) { $tabOnly = 1; } | |
33 elsif (/-build=(.*)/) { $build = $1; } | |
34 elsif (/-name=(.*)/) { $name = $1; } | |
35 elsif (/-ref=(\d+)/) { $ref = $1 - 1; } #go to index | |
36 elsif (/-chr=(\d+)/) { $binChr = $1; } | |
37 } | |
38 | |
39 if ($binChr == 2 && $ref) { $ref--; } #shift over by 1, we will delete bin | |
40 if ((!$tab or !$tabOnly) && !$ref) { | |
41 print "Error the reference allele must be in a column in the file if not just adding to a previous SNP table.\n"; | |
42 exit; | |
43 } | |
44 | |
45 #WARNING loads snp table in memory, this could take > 1G ram | |
46 my %old; | |
47 my $colcnt = 0; | |
48 my @head; | |
49 if ($tab) { | |
50 open(FH, $tab) or die "Couldn't open $tab, $!\n"; | |
51 while (<FH>) { | |
52 chomp; | |
53 if (/^#/) { push(@head, $_); next; } | |
54 my @f = split(/\t/); | |
55 $old{"$f[0]:$f[1]"} = join("\t", @f); | |
56 $colcnt = scalar @f; | |
57 } | |
58 close FH or die "Couldn't close $tab, $!\n"; | |
59 } | |
60 | |
61 if ($in =~ /.gz$/) { | |
62 open(FH, "zcat $in |") or die "Couldn't open $in, $!\n"; | |
63 }elsif ($in =~ /.bz2$/) { | |
64 open(FH, "bzcat $in |") or die "Couldn't open $in, $!\n"; | |
65 }else { | |
66 open(FH, $in) or die "Couldn't open $in, $!\n"; | |
67 } | |
68 prepHeader(); | |
69 if (@head) { #keep old header, add new? | |
70 print join("\n", @head), "\n"; | |
71 } | |
72 while (<FH>) { | |
73 chomp; | |
74 if (/^#/) { next; } | |
75 if (/^\s*$/) { next; } | |
76 my @f = split(/\t/); | |
77 if ($binChr == 2) { #must have a bin column prepended on the beginning | |
78 shift @f; #delete it | |
79 } | |
80 if (!$f[3]) { next; } #WHAT? most likely still zipped? | |
81 if ($f[4] > 2) { next; } #can only do cases of 2 alleles | |
82 if ($f[2] == $f[1] or $f[2] - $f[1] != 1) { next; } #no indels | |
83 if ($f[3] =~ /-/) { next; } #no indels | |
84 #if creating a new table need the reference allele in a column | |
85 if (%old && $old{"$f[0]:$f[1]"}) { | |
86 my @o = split(/\t/, $old{"$f[0]:$f[1]"}); | |
87 my $freq = 0; | |
88 my $freq2 = 0; | |
89 my $sc; | |
90 my $g = 1; #genotype == ref allele count | |
91 if ($f[4] == 1) { #should be homozygous | |
92 if ($f[3] eq $o[2]) { $g = 2; $freq = $f[5]; } | |
93 elsif ($f[3] eq $o[3]) { $g = 0; $freq2 = $f[5]; } | |
94 else { next; } #doesn't match either allele, skip | |
95 $sc = $f[6]; | |
96 }else { | |
97 my $a = 0; #index of a alleles, freq, scores | |
98 my $b = 1; #same for b | |
99 my @all = split(/\//, $f[3]); | |
100 if ($o[2] ne $all[0] && $o[2] ne $all[1]) { next; } #must match one | |
101 if ($o[3] ne $all[0] && $o[3] ne $all[1]) { next; } | |
102 if ($o[2] eq $all[1]) { #switch indexes | |
103 $a = 1; | |
104 $b = 0; | |
105 } | |
106 my @fr = split(/,/, $f[5]); | |
107 $freq = $fr[$a]; | |
108 $freq2 = $fr[$b]; | |
109 my @s = split(/,/, $f[6]); | |
110 $sc = $s[$b]; | |
111 } | |
112 #print old | |
113 print $old{"$f[0]:$f[1]"}; | |
114 #add new columns | |
115 print "\t$freq\t$freq2\t$g\t$sc\n"; | |
116 $old{"$f[0]:$f[1]"} = ''; | |
117 }elsif (!$tabOnly) { #new table, or don't have this SNP | |
118 #need reference allele | |
119 if ($f[3] !~ /$f[$ref]/ && $f[4] == 2) { next; } #no reference allele | |
120 my $freq = 0; | |
121 my $freq2 = 0; | |
122 my $sc; | |
123 my $g = 1; #genotype == ref allele count | |
124 my $alt; | |
125 if ($f[4] == 1) { #should be homozygous | |
126 if ($f[3] eq $f[$ref]) { $g = 2; $freq = $f[5]; $alt = 'N'; } | |
127 else { $g = 0; $freq2 = $f[5]; $alt = $f[3]; } #matches alternate | |
128 $sc = $f[6]; | |
129 }else { | |
130 my $a = 0; #index of a alleles, freq, scores | |
131 my $b = 1; #same for b | |
132 my @all = split(/\//, $f[3]); | |
133 if ($f[$ref] ne $all[0] && $f[$ref] ne $all[1]) { next; } #must match one | |
134 if ($f[$ref] eq $all[1]) { #switch indexes | |
135 $a = 1; | |
136 $b = 0; | |
137 } | |
138 my @fr = split(/,/, $f[5]); | |
139 $freq = $fr[$a]; | |
140 $freq2 = $fr[$b]; | |
141 my @s = split(/,/, $f[6]); | |
142 $sc = $s[$b]; | |
143 $alt = $all[$b]; | |
144 } | |
145 #print initial columns | |
146 print "$f[0]\t$f[1]\t$f[$ref]\t$alt\t-1"; | |
147 #pad for other individuals if needed | |
148 my $i = 5; | |
149 while ($i < $colcnt) { | |
150 print "\t-1\t-1\t-1\t-1"; | |
151 $i += 4; | |
152 } | |
153 #add new columns | |
154 print "\t$freq\t$freq2\t$g\t$sc\n"; | |
155 } | |
156 } | |
157 close FH or die "Couldn't close $in, $!\n"; | |
158 | |
159 #if adding to a snp table, now we need to finish those not in the latest set | |
160 foreach my $k (keys %old) { | |
161 if ($old{$k} ne '') { #not printed yet | |
162 print $old{$k}, "\t-1\t-1\t-1\t-1\n"; #plus blank for this one | |
163 } | |
164 } | |
165 | |
166 exit; | |
167 | |
168 #parse old header and add or create new | |
169 sub prepHeader { | |
170 if (!$build) { $build = 'hg19'; } #set default | |
171 my @cnames; | |
172 my @ind; | |
173 my $n; | |
174 if (@head) { #parse previous header | |
175 my $h = join("", @head); #may split between lines | |
176 if ($h =~ /"column_names":\[(.*?)\]/) { | |
177 my @t = split(/,/, $1); | |
178 foreach (@t) { s/"//g; } | |
179 @cnames = @t; | |
180 $n = $cnames[$#cnames]; | |
181 $n =~ s/Q//; | |
182 $n++; | |
183 } | |
184 if ($h =~ /"dbkey":"(.*?)"/) { $build = $1; } | |
185 if ($h =~ /"individuals":\[(.*)\]/) { | |
186 my $t = $1; | |
187 $t =~ s/\]\].*/]/; #remove if there is more categories | |
188 @ind = split(/,/, $t); | |
189 } | |
190 }else { #start new header | |
191 @cnames = ("chr", "pos", "A", "B", "Q"); | |
192 $n = 1; | |
193 } | |
194 #add current | |
195 if (!$name) { $name= 'na'; } | |
196 my $stcol = $colcnt + 1; | |
197 if ($stcol == 1) { $stcol = 6; } #move past initial columns | |
198 push(@ind, "[\"$name\",$stcol]"); | |
199 push(@cnames, "${n}A", "${n}B", "${n}G", "${n}Q"); | |
200 #reassign head | |
201 undef @head; | |
202 foreach (@cnames) { $_ = "\"$_\""; } #quote name | |
203 $head[0] = "#{\"column_names\":[" . join(",", @cnames) . "],"; | |
204 $head[1] = "#\"individuals\":[" . join(",", @ind) . "],"; | |
205 $head[2] = "#\"dbkey\":\"$build\",\"pos\":2,\"rPos\":2,\"ref\":1,\"scaffold\":1,\"species\":\"$build\"}"; | |
206 } | |
207 ####End | |
208 |