Mercurial > repos > xuebing > sharplabtool
comparison tools/evolution/codingSnps.pl @ 0:9071e359b9a3
Uploaded
| author | xuebing | 
|---|---|
| date | Fri, 09 Mar 2012 19:37:19 -0500 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| -1:000000000000 | 0:9071e359b9a3 | 
|---|---|
| 1 #!/usr/bin/perl -w | |
| 2 use strict; | |
| 3 | |
| 4 ######################################################################### | |
| 5 # codingSnps.pl | |
| 6 # This takes a bed file with the names being / separated nts | |
| 7 # and a gene bed file with cds start and stop. | |
| 8 # It then checks for changes in coding regions, reporting | |
| 9 # those that cause a frameshift or substitution in the amino acid. | |
| 10 ######################################################################### | |
| 11 | |
| 12 my $seqFlag = "2bit"; #flag to set sequence type 2bit|nib | |
| 13 if (!@ARGV or scalar @ARGV < 3) { | |
| 14 print "Usage: codingSnps.pl snps.bed genes.bed (/dir/*$seqFlag|Galaxy build= loc=) [chr=# start=# end=# snp=# keepColumns=1] > codingSnps.txt\n"; | |
| 15 exit; | |
| 16 } | |
| 17 my $uniq = 0; #flag for whether want uniq positions | |
| 18 my $syn = 0; #flag for if want synonomous changes rather than non-syn | |
| 19 my $keep = 0; #keep old columns and append new ones | |
| 20 my $snpFile = shift @ARGV; | |
| 21 my $geneFile = shift @ARGV; | |
| 22 my $nibDir = shift @ARGV; #2bit or nib, depending on flag above | |
| 23 if ($nibDir eq 'Galaxy') { getGalaxyInfo(); } | |
| 24 my $col0 = 0; #bed like columns in default positions | |
| 25 my $col1 = 1; | |
| 26 my $col2 = 2; | |
| 27 my $col3 = 3; | |
| 28 #column positions 1 based coming in (for Galaxy) | |
| 29 foreach (@ARGV) { | |
| 30 if (/chr=(\d+)/) { $col0 = $1 -1; } | |
| 31 elsif (/start=(\d+)/) { $col1 = $1 -1; } | |
| 32 elsif (/end=(\d+)/) { $col2 = $1 -1; } | |
| 33 elsif (/snp=(\d+)/) { $col3 = $1 -1; } | |
| 34 elsif (/keepColumns=1/) { $keep = 1; } | |
| 35 } | |
| 36 if ($col0 < 0 || $col1 < 0 || $col2 < 0 || $col3 < 0) { | |
| 37 print STDERR "ERROR column numbers are given with origin 1\n"; | |
| 38 exit 1; | |
| 39 } | |
| 40 my @genes; #bed lines for genes, sorted by chrom and start | |
| 41 my %chrSt; #index in array where each chrom starts | |
| 42 my %codon; #hash of codon amino acid conversions | |
| 43 my $ends = 0; #ends vs sizes in bed 11 position, starts relative to chrom | |
| 44 my $ignoreN = 1; #skip N | |
| 45 | |
| 46 my %amb = ( | |
| 47 "R" => "A/G", | |
| 48 "Y" => "C/T", | |
| 49 "S" => "C/G", | |
| 50 "W" => "A/T", | |
| 51 "K" => "G/T", | |
| 52 "M" => "A/C", | |
| 53 "B" => "C/G/T", | |
| 54 "D" => "A/G/T", | |
| 55 "H" => "A/C/T", | |
| 56 "V" => "A/C/G", | |
| 57 "N" => "A/C/G/T" | |
| 58 ); | |
| 59 fill_codon(); | |
| 60 open(FH, "cat $geneFile | sort -k1,1 -k2,2n |") | |
| 61 or die "Couldn't open and sort $geneFile, $!\n"; | |
| 62 my $i = 0; | |
| 63 while(<FH>) { | |
| 64 chomp; | |
| 65 if (/refGene.cdsEnd|ccdsGene.exonEnds/) { $ends = 1; next; } | |
| 66 push(@genes, "$_"); | |
| 67 my @f = split(/\t/); | |
| 68 if (!exists $chrSt{$f[0]}) { $chrSt{$f[0]} = $i; } | |
| 69 $i++; | |
| 70 } | |
| 71 close FH or die "Couldn't close $geneFile, $!\n"; | |
| 72 | |
| 73 if ($ends) { print STDERR "WARNING using block ends rather than sizes\n"; } | |
| 74 | |
| 75 #open snps sorted as well | |
| 76 my $s1 = $col0 + 1; #sort order is origin 1 | |
| 77 my $s2 = $col1 + 1; | |
| 78 open(FH, "cat $snpFile | sort -k$s1,$s1 -k$s2,${s2}n |") | |
| 79 or die "Couldn't open and sort $snpFile, $!\n"; | |
| 80 $i = 0; | |
| 81 my @g; #one genes fields, should be used repeatedly | |
| 82 my %done; | |
| 83 while(<FH>) { | |
| 84 chomp; | |
| 85 if (/^\s*#/) { next; } #comment | |
| 86 my @s = split(/\t/); #SNP fields | |
| 87 if (!@s or !$s[$col0]) { die "ERROR missing SNP data, $_\n"; } | |
| 88 my $size = $#s; | |
| 89 if ($col0 > $size || $col1 > $size || $col2 > $size || $col3 > $size) { | |
| 90 print STDERR "ERROR file has fewer columns than requested, requested columns (0 based) $col0 $col1 $col2 $col3, file has $size\n"; | |
| 91 exit 1; | |
| 92 } | |
| 93 if ($s[$col1] =~ /\D/) { | |
| 94 print STDERR "ERROR the start point must be an integer not $s[$col1]\n"; | |
| 95 exit 1; | |
| 96 } | |
| 97 if ($s[$col2] =~ /\D/) { | |
| 98 print STDERR "ERROR the start point must be an integer not $s[$col2]\n"; | |
| 99 exit 1; | |
| 100 } | |
| 101 if ($s[$col3] eq 'N' && $ignoreN) { next; } | |
| 102 if (exists $amb{$s[$col3]}) { $s[$col3] = $amb{$s[$col3]}; } | |
| 103 if (!@g && exists $chrSt{$s[$col0]}) { #need to fetch first gene row | |
| 104 $i = $chrSt{$s[$col0]}; | |
| 105 @g = split(/\t/, $genes[$i]); | |
| 106 if (scalar @g < 12) { | |
| 107 print STDERR "ERROR the gene file must be the whole genes in BED format\n"; | |
| 108 exit 1; | |
| 109 } | |
| 110 }elsif (!@g) { | |
| 111 next; #no gene for this chrom | |
| 112 }elsif ($s[$col0] ne $g[0] && exists $chrSt{$s[$col0]}) { #new chrom | |
| 113 $i = $chrSt{$s[$col0]}; | |
| 114 @g = split(/\t/, $genes[$i]); | |
| 115 }elsif ($s[$col0] ne $g[0]) { | |
| 116 next; #no gene for this chrom | |
| 117 }elsif ($s[$col1] < $g[1] && $i == $chrSt{$s[$col0]}) { | |
| 118 next; #before any genes | |
| 119 }elsif ($s[$col1] > $g[2] && ($i == $#genes or $genes[$i+1] !~ $s[$col0])) { | |
| 120 next; #after all genes on chr | |
| 121 }else { | |
| 122 while ($s[$col1] > $g[2] && $i < $#genes) { | |
| 123 $i++; | |
| 124 @g = split(/\t/, $genes[$i]); | |
| 125 if ($s[$col0] ne $g[0]) { last; } #end of gene | |
| 126 } | |
| 127 if ($s[$col0] ne $g[0] or $s[$col1] < $g[1] or $s[$col1] > $g[2]) { | |
| 128 next; #no overlap with genes | |
| 129 } | |
| 130 } | |
| 131 | |
| 132 processSnp(\@s, \@g); | |
| 133 if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { next; } | |
| 134 | |
| 135 my $k = $i + 1; #check for more genes without losing data of first | |
| 136 if ($k <= $#genes) { | |
| 137 my @g2 = split(/\t/, $genes[$k]); | |
| 138 while (@g2 && $k <= $#genes) { | |
| 139 @g2 = split(/\t/, $genes[$k]); | |
| 140 if ($s[$col0] ne $g2[0]) { | |
| 141 undef @g2; | |
| 142 last; #not same chrom | |
| 143 }else { | |
| 144 while ($s[$col1] > $g2[2] && $k <= $#genes) { | |
| 145 $k++; | |
| 146 @g2 = split(/\t/, $genes[$k]); | |
| 147 if ($s[$col0] ne $g2[0]) { last; } #end of chrom | |
| 148 } | |
| 149 if ($s[$col0] ne $g2[0] or $s[$col1] < $g2[1] or $s[$col1] > $g2[2]) { | |
| 150 undef @g2; | |
| 151 last; #no overlap with more genes | |
| 152 } | |
| 153 processSnp(\@s, \@g2); | |
| 154 if ($uniq && exists $done{"$s[$col0] $s[$col1] $s[$col2]"}) { last; } | |
| 155 } | |
| 156 $k++; | |
| 157 } | |
| 158 } | |
| 159 } | |
| 160 close FH or die "Couldn't close $snpFile, $!\n"; | |
| 161 | |
| 162 exit; | |
| 163 | |
| 164 ######################################################################## | |
| 165 sub processSnp { | |
| 166 my $sref = shift; | |
| 167 my $gref = shift; | |
| 168 #overlaps gene, but maybe not coding seq | |
| 169 #inside cds | |
| 170 if ($sref->[$col1] + 1 < $gref->[6] or $sref->[$col2] > $gref->[7]) { | |
| 171 return; #outside of coding | |
| 172 } | |
| 173 #now check exon | |
| 174 my $i = 0; | |
| 175 my @st = split(/,/, $gref->[11]); | |
| 176 my @size = split(/,/, $gref->[10]); | |
| 177 if (scalar @st ne $gref->[9]) { return; } #cant do this gene #die "bad gene $gref->[3]\n"; } | |
| 178 my @pos; | |
| 179 my $in = 0; | |
| 180 for($i = 0; $i < $gref->[9]; $i++) { | |
| 181 my $sta = $gref->[1] + $st[$i] + 1; #1 based position | |
| 182 my $end = $sta + $size[$i] - 1; # | |
| 183 if ($ends) { $end = $size[$i]; $sta = $st[$i] + 1; } #ends instead of sizes | |
| 184 if ($end < $gref->[6]) { next; } #utr only | |
| 185 if ($sta > $gref->[7]) { next; } #utr only | |
| 186 #shorten to coding only | |
| 187 if ($sta < $gref->[6]) { $sta = $gref->[6] + 1; } | |
| 188 if ($end > $gref->[7]) { $end = $gref->[7]; } | |
| 189 if ($sref->[$col1] + 1 >= $sta && $sref->[$col2] <= $end) { $in = 1; } | |
| 190 elsif ($sref->[$col1] == $sref->[$col2] && $sref->[$col2] <= $end && $sref->[$col2] >= $sta) { $in = 1; } | |
| 191 push(@pos, ($sta .. $end)); #add exon worth of positions | |
| 192 } | |
| 193 #@pos has coding positions for whole gene (chr coors), | |
| 194 #and $in has whether we need to continue | |
| 195 if (!$in) { return; } #not in coding exon | |
| 196 if ((scalar @pos) % 3 != 0) { return; } #partial gene? not even codons | |
| 197 if ($sref->[$col3] =~ /^-+\/[ACTG]+$/ or $sref->[$col3] =~ /^[ACTG]+\/-+$/ or | |
| 198 $sref->[$col3] =~ /^-+$/) { #indel or del | |
| 199 my $copy = $sref->[$col3]; | |
| 200 my $c = ($copy =~ tr/-//); | |
| 201 if ($c % 3 == 0) { return; } #not frameshift | |
| 202 #handle bed4 or any interval file | |
| 203 if (!$keep) { | |
| 204 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; | |
| 205 print "\t$gref->[3]\tframeshift\n"; | |
| 206 }else { | |
| 207 my @s = @{$sref}; | |
| 208 print join("\t", @s), "\t$gref->[3]\tframeshift\n"; | |
| 209 } | |
| 210 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; | |
| 211 return; | |
| 212 }elsif ($sref->[$col1] == $sref->[$col2]) { #insertion | |
| 213 my $copy = $sref->[$col3]; | |
| 214 my $c = ($copy =~ tr/\[ACTG]+//); | |
| 215 if ($c % 3 == 0) { return; } #not frameshift | |
| 216 #handle bed4 or any interval file | |
| 217 if (!$keep) { | |
| 218 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; | |
| 219 print "\t$gref->[3]\tframeshift\n"; | |
| 220 }else { | |
| 221 my @s = @{$sref}; | |
| 222 print join("\t", @s), "\t$gref->[3]\tframeshift\n"; | |
| 223 } | |
| 224 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; | |
| 225 return; | |
| 226 }elsif ($sref->[$col3] =~ /-/) { #indel and sub? | |
| 227 return; #skip | |
| 228 } | |
| 229 #check for amino acid substitutions | |
| 230 my $s = $sref->[$col1] + 1; | |
| 231 my $e = $sref->[$col2]; | |
| 232 my $len = $sref->[$col2] - $sref->[$col1]; | |
| 233 if ($gref->[5] eq '-') { | |
| 234 @pos = reverse(@pos); | |
| 235 my $t = $s; | |
| 236 $s = $e; | |
| 237 $e = $t; | |
| 238 } | |
| 239 $i = 0; | |
| 240 my $found = 0; | |
| 241 foreach (@pos) { | |
| 242 if ($s == $_) { | |
| 243 $found = 1; | |
| 244 last; | |
| 245 } | |
| 246 $i++; | |
| 247 } | |
| 248 if ($found) { | |
| 249 my $fs = $i; #keep original start index | |
| 250 #have index where substitution starts | |
| 251 my $cp = $i % 3; | |
| 252 $i -= $cp; #i is now first position in codon | |
| 253 my $cdNum = int($i / 3) + 1; | |
| 254 my $ls = $i; | |
| 255 if (!defined $ls) { die "ERROR not defined ls for $fs $sref->[$col2]\n"; } | |
| 256 if (!@pos) { die "ERROR not defined array pos\n"; } | |
| 257 if (!defined $pos[$ls]) { die "ERROR not defined pos at $ls\n"; } | |
| 258 if (!defined $e) { die "ERROR not defined e for $pos[0] $pos[1] $pos[2]\n"; } | |
| 259 while ($ls <= $#pos && $pos[$ls] ne $e) { | |
| 260 $ls++; | |
| 261 } | |
| 262 my $i2 = $ls + (2 - ($ls % 3)); | |
| 263 if ($i2 > $#pos) { return; } #not a full codon, partial gene? | |
| 264 | |
| 265 if ($i2 - $i < 2) { die "not a full codon positions $i to $i2 for $sref->[3]\n"; } | |
| 266 my $oldnts = getnts($sref->[$col0], @pos[$i..$i2]); | |
| 267 if (!$oldnts) { die "Failed to get sequence for $sref->[$col0] $pos[$i] .. $pos[$i2]\n"; } | |
| 268 my @vars = split(/\//, $sref->[$col3]); | |
| 269 if ($gref->[5] eq '-') { #complement oldnts and revcomp vars | |
| 270 $oldnts = compl($oldnts); | |
| 271 if (!$oldnts) { return; } #skip this one | |
| 272 $oldnts = join('', (reverse(split(/ */, $oldnts)))); | |
| 273 foreach (@vars) { | |
| 274 $_ = reverse(split(/ */)); #needed for indels | |
| 275 $_ = compl($_); | |
| 276 } | |
| 277 } | |
| 278 my $r = $fs - $i; #difference in old indexes gives new index | |
| 279 my @newnts; | |
| 280 my $changed = ''; | |
| 281 foreach my $v (@vars) { | |
| 282 if (!$v or length($v) != 1) { return; } #only simple changes | |
| 283 my @new = split(/ */, $oldnts); | |
| 284 $changed = splice(@new, $r, $len, split(/ */, $v)); | |
| 285 #should only change single nt | |
| 286 push(@newnts, join("", @new)); | |
| 287 } | |
| 288 #now compute amino acids | |
| 289 my $oldaa = getaa($oldnts); | |
| 290 my @newaa; | |
| 291 my $change = 0; #flag for if there is a change | |
| 292 foreach my $v (@newnts) { | |
| 293 my $t = getaa($v); | |
| 294 if ($t ne $oldaa) { $change = 1; } | |
| 295 push(@newaa, $t); | |
| 296 } | |
| 297 if (!$change && $syn) { | |
| 298 if (!$keep) { | |
| 299 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; | |
| 300 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; | |
| 301 }else { | |
| 302 my @s = @{$sref}; | |
| 303 print join("\t", @s), | |
| 304 "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\n"; | |
| 305 } | |
| 306 return; | |
| 307 }elsif ($syn) { return; } #only want synonymous changes | |
| 308 if (!$change) { return; } #no change in amino acids | |
| 309 if (!$keep) { | |
| 310 print "$sref->[$col0]\t$sref->[$col1]\t$sref->[$col2]\t$sref->[$col3]"; | |
| 311 if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref | |
| 312 if (!$changed) { return; } #skip this one | |
| 313 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; | |
| 314 }else { | |
| 315 my @s = @{$sref}; | |
| 316 print join("\t", @s); | |
| 317 if ($gref->[5] eq '-') { $changed = compl($changed); } #use plus for ref | |
| 318 if (!$changed) { return; } #skip this one | |
| 319 print "\t$gref->[3]\t$oldaa:", join("/", @newaa), "\t$cdNum\t$changed\n"; | |
| 320 } | |
| 321 $done{"$sref->[$col0] $sref->[$col1] $sref->[$col2]"}++; | |
| 322 } | |
| 323 } | |
| 324 | |
| 325 sub getnts { | |
| 326 my $chr = shift; | |
| 327 my @pos = @_; #list of positions not necessarily in order | |
| 328 #list may be reversed or have gaps(introns), at least 3 bps | |
| 329 my $seq = ''; | |
| 330 if (scalar @pos < 3) { die "too small region for $chr $pos[0]\n"; } | |
| 331 if ($pos[0] < $pos[1]) { #not reversed | |
| 332 my $s = $pos[0]; | |
| 333 for(my $i = 1; $i <= $#pos; $i++) { | |
| 334 if ($pos[$i] == $pos[$i-1] + 1) { next; } | |
| 335 if ($seqFlag eq '2bit') { | |
| 336 $seq .= fetchSeq2bit($chr, $s, $pos[$i-1]); | |
| 337 }else { | |
| 338 $seq .= fetchSeqNib($chr, $s, $pos[$i-1]); | |
| 339 } | |
| 340 $s = $pos[$i]; | |
| 341 } | |
| 342 if (length $seq != scalar @pos) { #still need to fetch seq | |
| 343 if ($seqFlag eq '2bit') { | |
| 344 $seq .= fetchSeq2bit($chr, $s, $pos[$#pos]); | |
| 345 }else { | |
| 346 $seq .= fetchSeqNib($chr, $s, $pos[$#pos]); | |
| 347 } | |
| 348 } | |
| 349 }else { #reversed | |
| 350 my $s = $pos[$#pos]; | |
| 351 for(my $i = $#pos -1; $i >= 0; $i--) { | |
| 352 if ($pos[$i] == $pos[$i+1] + 1) { next; } | |
| 353 if ($seqFlag eq '2bit') { | |
| 354 $seq .= fetchSeq2bit($chr, $s, $pos[$i+1]); | |
| 355 }else { | |
| 356 $seq .= fetchSeqNib($chr, $s, $pos[$i+1]); | |
| 357 } | |
| 358 $s = $pos[$i]; | |
| 359 } | |
| 360 if (length $seq != scalar @pos) { #still need to fetch seq | |
| 361 if ($seqFlag eq '2bit') { | |
| 362 $seq .= fetchSeq2bit($chr, $s, $pos[0]); | |
| 363 }else { | |
| 364 $seq .= fetchSeqNib($chr, $s, $pos[0]); | |
| 365 } | |
| 366 } | |
| 367 } | |
| 368 } | |
| 369 | |
| 370 sub fetchSeq2bit { | |
| 371 my $chr = shift; | |
| 372 my $st = shift; | |
| 373 my $end = shift; | |
| 374 my $strand = '+'; | |
| 375 $st--; #change to UCSC numbering | |
| 376 open (BIT, "twoBitToFa -seq=$chr -start=$st -end=$end $nibDir stdout |") or | |
| 377 die "Couldn't run twoBitToFa, $!\n"; | |
| 378 my $seq = ''; | |
| 379 while (<BIT>) { | |
| 380 chomp; | |
| 381 if (/^>/) { next; } #header | |
| 382 $seq .= uc($_); | |
| 383 } | |
| 384 close BIT or die "Couldn't finish twoBitToFa on $chr $st $end, $!\n"; | |
| 385 return $seq; | |
| 386 } | |
| 387 | |
| 388 sub fetchSeqNib { | |
| 389 my $chr = shift; | |
| 390 my $st = shift; | |
| 391 my $end = shift; | |
| 392 my $strand = '+'; | |
| 393 $st--; #change to UCSC numbering | |
| 394 open (NIB, "nibFrag -upper $nibDir/${chr}.nib $st $end $strand stdout |") or die "Couldn't run nibFrag, $!\n"; | |
| 395 my $seq = ''; | |
| 396 while (<NIB>) { | |
| 397 chomp; | |
| 398 if (/^>/) { next; } #header | |
| 399 $seq .= $_; | |
| 400 } | |
| 401 close NIB or die "Couldn't finish nibFrag on $chr $st $end, $!\n"; | |
| 402 return $seq; | |
| 403 } | |
| 404 | |
| 405 sub compl { | |
| 406 my $nts = shift; | |
| 407 my $comp = ''; | |
| 408 if (!$nts) { die "ERROR called compl with nts undefined"; } | |
| 409 foreach my $n (split(/ */, $nts)) { | |
| 410 if ($n eq 'A') { $comp .= 'T'; } | |
| 411 elsif ($n eq 'T') { $comp .= 'A'; } | |
| 412 elsif ($n eq 'C') { $comp .= 'G'; } | |
| 413 elsif ($n eq 'G') { $comp .= 'C'; } | |
| 414 elsif ($n eq 'N') { $comp .= 'N'; } | |
| 415 elsif ($n eq '-') { $comp .= '-'; } #deletion | |
| 416 else { $comp = undef; } | |
| 417 } | |
| 418 return $comp; | |
| 419 } | |
| 420 | |
| 421 sub getaa { | |
| 422 my $nts = shift; #in multiples of 3 | |
| 423 my $aa = ''; | |
| 424 my @n = split(/ */, $nts); | |
| 425 while (@n) { | |
| 426 my @t = splice(@n, 0, 3); | |
| 427 my $n = uc(join("", @t)); | |
| 428 if (!exists $codon{$n}) { $aa .= 'N'; next; } | |
| 429 $aa .= $codon{$n}; | |
| 430 } | |
| 431 return $aa; | |
| 432 } | |
| 433 | |
| 434 sub fill_codon { | |
| 435 $codon{GCA} = 'Ala'; | |
| 436 $codon{GCC} = 'Ala'; | |
| 437 $codon{GCG} = 'Ala'; | |
| 438 $codon{GCT} = 'Ala'; | |
| 439 $codon{CGG} = 'Arg'; | |
| 440 $codon{CGT} = 'Arg'; | |
| 441 $codon{CGC} = 'Arg'; | |
| 442 $codon{AGA} = 'Arg'; | |
| 443 $codon{AGG} = 'Arg'; | |
| 444 $codon{CGA} = 'Arg'; | |
| 445 $codon{AAC} = 'Asn'; | |
| 446 $codon{AAT} = 'Asn'; | |
| 447 $codon{GAC} = 'Asp'; | |
| 448 $codon{GAT} = 'Asp'; | |
| 449 $codon{TGC} = 'Cys'; | |
| 450 $codon{TGT} = 'Cys'; | |
| 451 $codon{CAG} = 'Gln'; | |
| 452 $codon{CAA} = 'Gln'; | |
| 453 $codon{GAA} = 'Glu'; | |
| 454 $codon{GAG} = 'Glu'; | |
| 455 $codon{GGG} = 'Gly'; | |
| 456 $codon{GGA} = 'Gly'; | |
| 457 $codon{GGC} = 'Gly'; | |
| 458 $codon{GGT} = 'Gly'; | |
| 459 $codon{CAC} = 'His'; | |
| 460 $codon{CAT} = 'His'; | |
| 461 $codon{ATA} = 'Ile'; | |
| 462 $codon{ATT} = 'Ile'; | |
| 463 $codon{ATC} = 'Ile'; | |
| 464 $codon{CTA} = 'Leu'; | |
| 465 $codon{CTC} = 'Leu'; | |
| 466 $codon{CTG} = 'Leu'; | |
| 467 $codon{CTT} = 'Leu'; | |
| 468 $codon{TTG} = 'Leu'; | |
| 469 $codon{TTA} = 'Leu'; | |
| 470 $codon{AAA} = 'Lys'; | |
| 471 $codon{AAG} = 'Lys'; | |
| 472 $codon{ATG} = 'Met'; | |
| 473 $codon{TTC} = 'Phe'; | |
| 474 $codon{TTT} = 'Phe'; | |
| 475 $codon{CCT} = 'Pro'; | |
| 476 $codon{CCA} = 'Pro'; | |
| 477 $codon{CCC} = 'Pro'; | |
| 478 $codon{CCG} = 'Pro'; | |
| 479 $codon{TCA} = 'Ser'; | |
| 480 $codon{AGC} = 'Ser'; | |
| 481 $codon{AGT} = 'Ser'; | |
| 482 $codon{TCC} = 'Ser'; | |
| 483 $codon{TCT} = 'Ser'; | |
| 484 $codon{TCG} = 'Ser'; | |
| 485 $codon{TGA} = 'Stop'; | |
| 486 $codon{TAG} = 'Stop'; | |
| 487 $codon{TAA} = 'Stop'; | |
| 488 $codon{ACT} = 'Thr'; | |
| 489 $codon{ACA} = 'Thr'; | |
| 490 $codon{ACC} = 'Thr'; | |
| 491 $codon{ACG} = 'Thr'; | |
| 492 $codon{TGG} = 'Trp'; | |
| 493 $codon{TAT} = 'Tyr'; | |
| 494 $codon{TAC} = 'Tyr'; | |
| 495 $codon{GTC} = 'Val'; | |
| 496 $codon{GTA} = 'Val'; | |
| 497 $codon{GTG} = 'Val'; | |
| 498 $codon{GTT} = 'Val'; | |
| 499 } | |
| 500 | |
| 501 sub getGalaxyInfo { | |
| 502 my $build; | |
| 503 my $locFile; | |
| 504 foreach (@ARGV) { | |
| 505 if (/build=(.*)/) { $build = $1; } | |
| 506 elsif (/loc=(.*)/) { $locFile = $1; } | |
| 507 } | |
| 508 if (!$build or !$locFile) { | |
| 509 print STDERR "ERROR missing build or locfile for Galaxy input\n"; | |
| 510 exit 1; | |
| 511 } | |
| 512 # read $locFile to get $nibDir (ignoring commets) | |
| 513 open(LF, "< $locFile") || die "open($locFile): $!\n"; | |
| 514 while(<LF>) { | |
| 515 s/#.*$//; | |
| 516 s/(?:^\s+|\s+$)//g; | |
| 517 next if (/^$/); | |
| 518 | |
| 519 my @t = split(/\t/); | |
| 520 if ($t[0] eq $build) { $nibDir = $t[1]; } | |
| 521 } | |
| 522 close(LF); | |
| 523 if ($nibDir eq 'Galaxy') { | |
| 524 print STDERR "Failed to find sequence directory in locfile $locFile\n"; | |
| 525 } | |
| 526 $nibDir .= "/$build.2bit"; #we want full path and filename | |
| 527 } | |
| 528 | 
