diff hgvs_collapse_transcripts @ 0:7cdd13ff182a default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 15:49:28 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_collapse_transcripts	Wed Mar 25 15:49:28 2015 -0600
@@ -0,0 +1,201 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
+@ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
+
+my %ftype_rank = ( protein_coding => 100,
+              processed_transcript => 90,
+              antisense => 80,
+              retained_intron => 70,
+              lincRNA => 60,
+              nonsense_mediated_decay => 50,
+              misc_enrichment_kit_target => 0); 
+
+my %mtype_rank = ( nonsense => 100,
+                      frameshift => 99,
+                      nonstop => 90,
+                      missense => 80,
+                      silent => 50,
+                      "non-coding" => 40);
+open(IN, $ARGV[0])
+  or die "Cannot open $ARGV[0] for reading: $!\n";
+open(OUT, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+my $succinct = (@ARGV == 3);
+my @lines;
+my $last_chr = "";
+my $last_pos = "";
+my $last_alt = "";
+my %buffered_F;
+my %buffered_id_rank;
+my $header = <IN>;
+print OUT $header;
+
+my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
+chomp $header;
+my @headers = split /\t/, $header;
+for(my $i = 0; $i <= $#headers; $i++){
+  if($headers[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($headers[$i] eq "Feature type"){
+    $ftype_column = $i;
+  }
+  elsif($headers[$i] eq "Variant type"){
+    $mtype_column = $i;
+  }
+  elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
+    $pos_column = $i;
+  }
+  elsif($headers[$i] eq "Obs base"){
+    $alt_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript HGVS"){
+    $cdna_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Protein HGVS"){
+    $aa_hgvs_column = $i;
+  }
+  elsif($headers[$i] eq "Phase"){
+    $phase_column = $i;
+  }
+  elsif($headers[$i] eq "Selected transcript"){
+    $transcript_column = $i;
+  }
+  elsif($headers[$i] eq "Transcript length"){
+    $transcript_length_column = $i;
+  }
+  elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
+    $exon_dist_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration potential"){
+    $splicing_score_column = $i;
+  }
+  elsif($headers[$i] eq "Splicing alteration details"){
+    $splicing_effect_column = $i;
+  }
+  elsif($headers[$i] eq "Sources"){
+    $sources_column = $i;
+  }
+}
+if(not defined $chr_column){
+  die "Could not find Chr header in $ARGV[0], aborting.\n";
+}
+if(not defined $pos_column){
+  die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
+}
+if(not defined $alt_column){
+  die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
+}
+if(not defined $cdna_hgvs_column){
+  die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $aa_hgvs_column){
+  die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
+}
+if(not defined $phase_column){
+  die "Could not find Phase header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_column){
+  die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
+}
+if(not defined $transcript_length_column){
+  die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
+}
+#if(not defined $mtype_column){
+#  die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
+#}
+if(not defined $ftype_column){
+  die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
+}
+if(not defined $exon_dist_column){
+  die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
+}
+
+while(<IN>){
+  chomp;
+  my @F = split /\t/, $_, -1;
+  my $chr = $F[$chr_column];
+  my $pos = $F[$pos_column];
+  my $alt = $F[$alt_column];
+  my $cdna_hgvs = $F[$cdna_hgvs_column];
+  my $hgvs = $F[$aa_hgvs_column];
+  my $ftype = $F[$ftype_column];
+  my $mtype;
+  $mtype = $F[$mtype_column] if defined $mtype_column;
+  $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
+  my $exon_edge_distance = $F[$exon_dist_column];
+  my $phase = $F[$phase_column] || "";
+  # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
+  if($phase =~ /CNV-\S+?:(\S+)/){
+    $pos = $1;
+  }
+  my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
+  my $id_rank = $F[$transcript_column];
+  $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
+
+  my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
+  # Same variant
+  if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
+    # same AA effect
+    if(not exists $buffered_F{$collapse_key}){
+      $buffered_F{$collapse_key} = \@F;
+      $buffered_id_rank{$collapse_key} = $id_rank;
+    }
+    else{
+      $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
+      $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
+      $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
+      my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
+      $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
+      my $buf_mtype;
+      $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
+      $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
+      # see if this transcript is "earlier" than the other, based on ID #
+      if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or 
+         (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
+         $preferred_id or
+          ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) 
+         or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
+        # make this the first one reported (and use its HGVS syntax)
+        $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
+        $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
+        $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
+        $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
+        $buffered_id_rank{$collapse_key} = $id_rank;
+      }
+      else{
+        # just append it to the list of IDs
+        $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
+        $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
+      }
+      if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
+        $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
+        $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
+      }
+    }
+  }
+  # Different variant from the last line, dump any buffered data
+  else{
+    for my $collapse_key (keys %buffered_F){
+      if(defined $sources_column){
+        my %seen;
+        $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
+      }
+      print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+    }
+    undef %buffered_F;
+    $last_chr = $chr;
+    $last_pos = $pos;
+    $last_alt = $alt;
+    $buffered_F{$collapse_key} = \@F;
+    $buffered_id_rank{$collapse_key} = $id_rank;
+  }
+}
+for my $collapse_key (keys %buffered_F){
+  print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; 
+}
+close(IN);