diff SNV/identify_nonsynonymous_mutations.pl @ 0:74f5ea818cea

Uploaded
author ryanmorin
date Wed, 12 Oct 2011 19:50:38 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/SNV/identify_nonsynonymous_mutations.pl	Wed Oct 12 19:50:38 2011 -0400
@@ -0,0 +1,223 @@
+#!/usr/bin/perl
+#Author: Ryan Morin (rmorin@bcgsc.ca)
+#last modified February, 2010 to match improvements to codon lookup table
+#Script to annotate files that have been joined with an appropriate 'codon lookup' table.  
+#example usage: sort -k 1 snp_file.txt | join codon_lookup.sort - | this_script.pl
+#Transcript information and amino acid/codon position information is used to report the amino acid change in this format:  N123M where N is the original amino acid at position 123 and M is the new amino acid.
+#Transcripts with the same reading frame for position 123 are separated by commas.  Distinct reading frames for the affected position are separated by a semi-colon (e.g. 123 = 155 in another isoform)
+#an example of the codon lookup table:
+#chr10:101658781 ENSG00000107554 -1      GAA     3       37;79;791;      ENST00000393570;ENST00000370423;ENST00000324109,ENST00000342239;
+#hence, chr10:101658781 corresponds to the third position in a GAA codon which is amino acid 791 in both ENST00000324109 ENST00000342239, amino acid 79 in ENST00000370423 and 37 in ENST00000393570
+
+use strict;
+use Data::Dumper;
+use Getopt::Std;
+use vars qw($opt_h);
+getopts("h");
+
+my %lookup;
+
+load_ambig();
+my %codons = codons();
+
+my $requested_header = $opt_h;
+
+my $header_printed;
+if($requested_header){
+    $header_printed = 0;
+}
+else{
+    $header_printed = 1;
+}
+while(<STDIN>){
+    
+    chomp;
+    my $line = $_;
+
+    my @cols = split /\s/, $line;
+    my $ncols = @cols;
+    unless($header_printed){
+	print "\#CHR:POSITION GENE STRAND REF_CODON CODON_POSITION REFBASE IUPAC_BASE_CALL MINOR_ALLELE_REPRESENTATION NEW_CODON REF_AA NEW_AA STATUS";
+	print "\n";
+	$header_printed = 1;
+    }
+   
+    my $gene = $cols[1];
+    my $strand = $cols[2];
+    my $codon = $cols[3];
+    unless ($codon =~ /[ACTG]{3}/){
+	print "\n";
+	next;
+    }
+    my $codon_pos = $cols[4];
+    my $aa_pos = $cols[5];
+    my @transcript_strings = split /;/, $cols[6];
+    
+    my $ref = uc($cols[7]);
+    my $amb = uc($cols[8]);
+    my $mut = amb_lookup($amb,$ref,$cols[0]);
+    next if $mut == -1;
+    my $mut_cor = $mut;
+    
+    my $ref_cor = $ref;
+    if($strand < 0){
+	$mut_cor =~ tr/ACTG/TGAC/;
+	$ref_cor =~ tr/ACTG/TGAC/;
+    }
+    my @three = split //, $codon;
+    #sanity check
+  
+    my $codon_ind = $codon_pos -1;
+    my $orig = $three[$codon_ind];
+    if($orig ne $ref_cor){
+	print STDERR "$cols[0]\tERROR, base $ref is not the same as codon position $codon_pos ($orig)\n";
+	s/1\s\w{3}\s\d\s/1 UNKNOWN 0 /;
+	print "$line";
+	print " UNKNOWN\n";
+	next;
+	
+    }
+    else{
+	#ok, print normal leader line and continue to downstream work
+	print "$line";
+    }
+    $three[$codon_ind] = $mut_cor;
+    my $new_codon = join "", @three;
+    print " $new_codon ";
+    
+    my $old_aa = translate($codon);
+    my $new_aa = translate($new_codon);
+
+    my $type = "CODING";
+    if($old_aa eq $new_aa){
+	$type = "SYNONYMOUS";
+    }
+
+    my $aa_string;
+    
+    my @aa_positions = split /;/, $aa_pos;
+    for(my $i = 0;$i<@aa_positions;$i++){
+	my $string = $old_aa . $aa_positions[$i] . $new_aa;
+	$aa_string .= "$string;";
+    }
+    chop($aa_string);
+    print "$old_aa $new_aa $type $aa_string\n";
+}
+
+sub amb_lookup{
+    my $amb = shift;
+    my $ref = shift;
+    my $pos = shift;
+    if($amb =~ /[ACTG]/){
+	#print "$amb\n";
+	return($amb);
+    }
+    #print "LOOKUP: $amb\n";
+    my @all = @{$lookup{$amb}};
+    #print Dumper @all;
+    my @some;
+    for(@all){
+	next if $_ eq $ref;
+	push @some, $_;
+
+    }
+    if(@some > 1){
+	#print Dumper @some;
+	print STDERR "$pos\t$ref\t$amb\ttwo bases at this position, neither is reference\n";
+	return(-1);
+    }
+    return($some[0]);
+}
+
+sub load_ambig{
+    add('M',qw(A C));
+    add('R',qw(A G));
+    add('W',qw(A T));
+    add('S',qw(C G));
+    add('Y',qw(T C));
+    add('K',qw(T G));
+    add('V',qw(A C G));
+    add('H',qw(A C T));
+    add('D',qw(A G T));
+    add('B',qw(C G T));
+}
+
+sub add{
+
+    my $amb = shift;
+    my @copy = @_;
+    $lookup{$amb} = \@copy;
+}
+sub translate{
+    my $codon = shift;
+    return($codons{$codon});
+}
+sub codons{
+    my %codons = (
+    'TCA' => 'S',    # Serine
+    'TCC' => 'S',    # Serine
+    'TCG' => 'S',    # Serine
+    'TCT' => 'S',    # Serine
+    'TTC' => 'F',    # Phenylalanine
+    'TTT' => 'F',    # Phenylalanine
+    'TTA' => 'L',    # Leucine
+    'TTG' => 'L',    # Leucine
+    'TAC' => 'Y',    # Tyrosine
+    'TAT' => 'Y',    # Tyrosine
+    'TAA' => '*',    # Stop
+    'TAG' => '*',    # Stop
+    'TGC' => 'C',    # Cysteine
+    'TGT' => 'C',    # Cysteine
+    'TGA' => '*',    # Stop
+    'TGG' => 'W',    # Tryptophan
+    'CTA' => 'L',    # Leucine
+    'CTC' => 'L',    # Leucine
+    'CTG' => 'L',    # Leucine
+    'CTT' => 'L',    # Leucine
+    'CCA' => 'P',    # Proline
+    'CCC' => 'P',    # Proline
+    'CCG' => 'P',    # Proline
+    'CCT' => 'P',    # Proline
+    'CAC' => 'H',    # Histidine
+    'CAT' => 'H',    # Histidine
+    'CAA' => 'Q',    # Glutamine
+    'CAG' => 'Q',    # Glutamine
+    'CGA' => 'R',    # Arginine
+    'CGC' => 'R',    # Arginine
+    'CGG' => 'R',    # Arginine
+    'CGT' => 'R',    # Arginine
+    'ATA' => 'I',    # Isoleucine
+    'ATC' => 'I',    # Isoleucine
+    'ATT' => 'I',    # Isoleucine
+    'ATG' => 'M',    # Methionine
+    'ACA' => 'T',    # Threonine
+    'ACC' => 'T',    # Threonine
+    'ACG' => 'T',    # Threonine
+    'ACT' => 'T',    # Threonine
+    'AAC' => 'N',    # Asparagine
+    'AAT' => 'N',    # Asparagine
+    'AAA' => 'K',    # Lysine
+    'AAG' => 'K',    # Lysine
+    'AGC' => 'S',    # Serine
+    'AGT' => 'S',    # Serine
+    'AGA' => 'R',    # Arginine
+    'AGG' => 'R',    # Arginine
+    'GTA' => 'V',    # Valine
+    'GTC' => 'V',    # Valine
+    'GTG' => 'V',    # Valine
+    'GTT' => 'V',    # Valine
+    'GCA' => 'A',    # Alanine
+    'GCC' => 'A',    # Alanine
+    'GCG' => 'A',    # Alanine
+    'GCT' => 'A',    # Alanine
+    'GAC' => 'D',    # Aspartic Acid
+    'GAT' => 'D',    # Aspartic Acid
+    'GAA' => 'E',    # Glutamic Acid
+    'GAG' => 'E',    # Glutamic Acid
+    'GGA' => 'G',    # Glycine
+    'GGC' => 'G',    # Glycine
+    'GGG' => 'G',    # Glycine
+    'GGT' => 'G',    # Glycine
+    );
+    return(%codons);
+}