Mercurial > repos > ryanmorin > nextgen_variant_identification
diff SNV/identify_nonsynonymous_mutations.pl @ 0:74f5ea818cea
Uploaded
| author | ryanmorin |
|---|---|
| date | Wed, 12 Oct 2011 19:50:38 -0400 |
| parents | |
| children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SNV/identify_nonsynonymous_mutations.pl Wed Oct 12 19:50:38 2011 -0400 @@ -0,0 +1,223 @@ +#!/usr/bin/perl +#Author: Ryan Morin (rmorin@bcgsc.ca) +#last modified February, 2010 to match improvements to codon lookup table +#Script to annotate files that have been joined with an appropriate 'codon lookup' table. +#example usage: sort -k 1 snp_file.txt | join codon_lookup.sort - | this_script.pl +#Transcript information and amino acid/codon position information is used to report the amino acid change in this format: N123M where N is the original amino acid at position 123 and M is the new amino acid. +#Transcripts with the same reading frame for position 123 are separated by commas. Distinct reading frames for the affected position are separated by a semi-colon (e.g. 123 = 155 in another isoform) +#an example of the codon lookup table: +#chr10:101658781 ENSG00000107554 -1 GAA 3 37;79;791; ENST00000393570;ENST00000370423;ENST00000324109,ENST00000342239; +#hence, chr10:101658781 corresponds to the third position in a GAA codon which is amino acid 791 in both ENST00000324109 ENST00000342239, amino acid 79 in ENST00000370423 and 37 in ENST00000393570 + +use strict; +use Data::Dumper; +use Getopt::Std; +use vars qw($opt_h); +getopts("h"); + +my %lookup; + +load_ambig(); +my %codons = codons(); + +my $requested_header = $opt_h; + +my $header_printed; +if($requested_header){ + $header_printed = 0; +} +else{ + $header_printed = 1; +} +while(<STDIN>){ + + chomp; + my $line = $_; + + my @cols = split /\s/, $line; + my $ncols = @cols; + unless($header_printed){ + print "\#CHR:POSITION GENE STRAND REF_CODON CODON_POSITION REFBASE IUPAC_BASE_CALL MINOR_ALLELE_REPRESENTATION NEW_CODON REF_AA NEW_AA STATUS"; + print "\n"; + $header_printed = 1; + } + + my $gene = $cols[1]; + my $strand = $cols[2]; + my $codon = $cols[3]; + unless ($codon =~ /[ACTG]{3}/){ + print "\n"; + next; + } + my $codon_pos = $cols[4]; + my $aa_pos = $cols[5]; + my @transcript_strings = split /;/, $cols[6]; + + my $ref = uc($cols[7]); + my $amb = uc($cols[8]); + my $mut = amb_lookup($amb,$ref,$cols[0]); + next if $mut == -1; + my $mut_cor = $mut; + + my $ref_cor = $ref; + if($strand < 0){ + $mut_cor =~ tr/ACTG/TGAC/; + $ref_cor =~ tr/ACTG/TGAC/; + } + my @three = split //, $codon; + #sanity check + + my $codon_ind = $codon_pos -1; + my $orig = $three[$codon_ind]; + if($orig ne $ref_cor){ + print STDERR "$cols[0]\tERROR, base $ref is not the same as codon position $codon_pos ($orig)\n"; + s/1\s\w{3}\s\d\s/1 UNKNOWN 0 /; + print "$line"; + print " UNKNOWN\n"; + next; + + } + else{ + #ok, print normal leader line and continue to downstream work + print "$line"; + } + $three[$codon_ind] = $mut_cor; + my $new_codon = join "", @three; + print " $new_codon "; + + my $old_aa = translate($codon); + my $new_aa = translate($new_codon); + + my $type = "CODING"; + if($old_aa eq $new_aa){ + $type = "SYNONYMOUS"; + } + + my $aa_string; + + my @aa_positions = split /;/, $aa_pos; + for(my $i = 0;$i<@aa_positions;$i++){ + my $string = $old_aa . $aa_positions[$i] . $new_aa; + $aa_string .= "$string;"; + } + chop($aa_string); + print "$old_aa $new_aa $type $aa_string\n"; +} + +sub amb_lookup{ + my $amb = shift; + my $ref = shift; + my $pos = shift; + if($amb =~ /[ACTG]/){ + #print "$amb\n"; + return($amb); + } + #print "LOOKUP: $amb\n"; + my @all = @{$lookup{$amb}}; + #print Dumper @all; + my @some; + for(@all){ + next if $_ eq $ref; + push @some, $_; + + } + if(@some > 1){ + #print Dumper @some; + print STDERR "$pos\t$ref\t$amb\ttwo bases at this position, neither is reference\n"; + return(-1); + } + return($some[0]); +} + +sub load_ambig{ + add('M',qw(A C)); + add('R',qw(A G)); + add('W',qw(A T)); + add('S',qw(C G)); + add('Y',qw(T C)); + add('K',qw(T G)); + add('V',qw(A C G)); + add('H',qw(A C T)); + add('D',qw(A G T)); + add('B',qw(C G T)); +} + +sub add{ + + my $amb = shift; + my @copy = @_; + $lookup{$amb} = \@copy; +} +sub translate{ + my $codon = shift; + return($codons{$codon}); +} +sub codons{ + my %codons = ( + 'TCA' => 'S', # Serine + 'TCC' => 'S', # Serine + 'TCG' => 'S', # Serine + 'TCT' => 'S', # Serine + 'TTC' => 'F', # Phenylalanine + 'TTT' => 'F', # Phenylalanine + 'TTA' => 'L', # Leucine + 'TTG' => 'L', # Leucine + 'TAC' => 'Y', # Tyrosine + 'TAT' => 'Y', # Tyrosine + 'TAA' => '*', # Stop + 'TAG' => '*', # Stop + 'TGC' => 'C', # Cysteine + 'TGT' => 'C', # Cysteine + 'TGA' => '*', # Stop + 'TGG' => 'W', # Tryptophan + 'CTA' => 'L', # Leucine + 'CTC' => 'L', # Leucine + 'CTG' => 'L', # Leucine + 'CTT' => 'L', # Leucine + 'CCA' => 'P', # Proline + 'CCC' => 'P', # Proline + 'CCG' => 'P', # Proline + 'CCT' => 'P', # Proline + 'CAC' => 'H', # Histidine + 'CAT' => 'H', # Histidine + 'CAA' => 'Q', # Glutamine + 'CAG' => 'Q', # Glutamine + 'CGA' => 'R', # Arginine + 'CGC' => 'R', # Arginine + 'CGG' => 'R', # Arginine + 'CGT' => 'R', # Arginine + 'ATA' => 'I', # Isoleucine + 'ATC' => 'I', # Isoleucine + 'ATT' => 'I', # Isoleucine + 'ATG' => 'M', # Methionine + 'ACA' => 'T', # Threonine + 'ACC' => 'T', # Threonine + 'ACG' => 'T', # Threonine + 'ACT' => 'T', # Threonine + 'AAC' => 'N', # Asparagine + 'AAT' => 'N', # Asparagine + 'AAA' => 'K', # Lysine + 'AAG' => 'K', # Lysine + 'AGC' => 'S', # Serine + 'AGT' => 'S', # Serine + 'AGA' => 'R', # Arginine + 'AGG' => 'R', # Arginine + 'GTA' => 'V', # Valine + 'GTC' => 'V', # Valine + 'GTG' => 'V', # Valine + 'GTT' => 'V', # Valine + 'GCA' => 'A', # Alanine + 'GCC' => 'A', # Alanine + 'GCG' => 'A', # Alanine + 'GCT' => 'A', # Alanine + 'GAC' => 'D', # Aspartic Acid + 'GAT' => 'D', # Aspartic Acid + 'GAA' => 'E', # Glutamic Acid + 'GAG' => 'E', # Glutamic Acid + 'GGA' => 'G', # Glycine + 'GGC' => 'G', # Glycine + 'GGG' => 'G', # Glycine + 'GGT' => 'G', # Glycine + ); + return(%codons); +}
