| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
 | 
| 
 | 
     7 @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
 | 
| 
 | 
     8 
 | 
| 
 | 
     9 my %ftype_rank = ( protein_coding => 100,
 | 
| 
 | 
    10               processed_transcript => 90,
 | 
| 
 | 
    11               antisense => 80,
 | 
| 
 | 
    12               retained_intron => 70,
 | 
| 
 | 
    13               lincRNA => 60,
 | 
| 
 | 
    14               nonsense_mediated_decay => 50,
 | 
| 
 | 
    15               misc_enrichment_kit_target => 0); 
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 my %mtype_rank = ( nonsense => 100,
 | 
| 
 | 
    18                       frameshift => 99,
 | 
| 
 | 
    19                       nonstop => 90,
 | 
| 
 | 
    20                       missense => 80,
 | 
| 
 | 
    21                       silent => 50,
 | 
| 
 | 
    22                       "non-coding" => 40);
 | 
| 
 | 
    23 open(IN, $ARGV[0])
 | 
| 
 | 
    24   or die "Cannot open $ARGV[0] for reading: $!\n";
 | 
| 
 | 
    25 open(OUT, ">$ARGV[1]")
 | 
| 
 | 
    26   or die "Cannot open $ARGV[1] for writing: $!\n";
 | 
| 
 | 
    27 my $succinct = (@ARGV == 3);
 | 
| 
 | 
    28 my @lines;
 | 
| 
 | 
    29 my $last_chr = "";
 | 
| 
 | 
    30 my $last_pos = "";
 | 
| 
 | 
    31 my $last_alt = "";
 | 
| 
 | 
    32 my %buffered_F;
 | 
| 
 | 
    33 my %buffered_id_rank;
 | 
| 
 | 
    34 my $header = <IN>;
 | 
| 
 | 
    35 print OUT $header;
 | 
| 
 | 
    36 
 | 
| 
 | 
    37 my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
 | 
| 
 | 
    38 chomp $header;
 | 
| 
 | 
    39 my @headers = split /\t/, $header;
 | 
| 
 | 
    40 for(my $i = 0; $i <= $#headers; $i++){
 | 
| 
 | 
    41   if($headers[$i] eq "Chr"){
 | 
| 
 | 
    42     $chr_column = $i;
 | 
| 
 | 
    43   }
 | 
| 
 | 
    44   elsif($headers[$i] eq "Feature type"){
 | 
| 
 | 
    45     $ftype_column = $i;
 | 
| 
 | 
    46   }
 | 
| 
 | 
    47   elsif($headers[$i] eq "Variant type"){
 | 
| 
 | 
    48     $mtype_column = $i;
 | 
| 
 | 
    49   }
 | 
| 
 | 
    50   elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
 | 
| 
 | 
    51     $pos_column = $i;
 | 
| 
 | 
    52   }
 | 
| 
 | 
    53   elsif($headers[$i] eq "Obs base"){
 | 
| 
 | 
    54     $alt_column = $i;
 | 
| 
 | 
    55   }
 | 
| 
 | 
    56   elsif($headers[$i] eq "Transcript HGVS"){
 | 
| 
 | 
    57     $cdna_hgvs_column = $i;
 | 
| 
 | 
    58   }
 | 
| 
 | 
    59   elsif($headers[$i] eq "Protein HGVS"){
 | 
| 
 | 
    60     $aa_hgvs_column = $i;
 | 
| 
 | 
    61   }
 | 
| 
 | 
    62   elsif($headers[$i] eq "Phase"){
 | 
| 
 | 
    63     $phase_column = $i;
 | 
| 
 | 
    64   }
 | 
| 
 | 
    65   elsif($headers[$i] eq "Selected transcript"){
 | 
| 
 | 
    66     $transcript_column = $i;
 | 
| 
 | 
    67   }
 | 
| 
 | 
    68   elsif($headers[$i] eq "Transcript length"){
 | 
| 
 | 
    69     $transcript_length_column = $i;
 | 
| 
 | 
    70   }
 | 
| 
 | 
    71   elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
 | 
| 
 | 
    72     $exon_dist_column = $i;
 | 
| 
 | 
    73   }
 | 
| 
 | 
    74   elsif($headers[$i] eq "Splicing alteration potential"){
 | 
| 
 | 
    75     $splicing_score_column = $i;
 | 
| 
 | 
    76   }
 | 
| 
 | 
    77   elsif($headers[$i] eq "Splicing alteration details"){
 | 
| 
 | 
    78     $splicing_effect_column = $i;
 | 
| 
 | 
    79   }
 | 
| 
 | 
    80   elsif($headers[$i] eq "Sources"){
 | 
| 
 | 
    81     $sources_column = $i;
 | 
| 
 | 
    82   }
 | 
| 
 | 
    83 }
 | 
| 
 | 
    84 if(not defined $chr_column){
 | 
| 
 | 
    85   die "Could not find Chr header in $ARGV[0], aborting.\n";
 | 
| 
 | 
    86 }
 | 
| 
 | 
    87 if(not defined $pos_column){
 | 
| 
 | 
    88   die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
    89 }
 | 
| 
 | 
    90 if(not defined $alt_column){
 | 
| 
 | 
    91   die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
    92 }
 | 
| 
 | 
    93 if(not defined $cdna_hgvs_column){
 | 
| 
 | 
    94   die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
    95 }
 | 
| 
 | 
    96 if(not defined $aa_hgvs_column){
 | 
| 
 | 
    97   die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
    98 }
 | 
| 
 | 
    99 if(not defined $phase_column){
 | 
| 
 | 
   100   die "Could not find Phase header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   101 }
 | 
| 
 | 
   102 if(not defined $transcript_column){
 | 
| 
 | 
   103   die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   104 }
 | 
| 
 | 
   105 if(not defined $transcript_length_column){
 | 
| 
 | 
   106   die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   107 }
 | 
| 
 | 
   108 #if(not defined $mtype_column){
 | 
| 
 | 
   109 #  die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   110 #}
 | 
| 
 | 
   111 if(not defined $ftype_column){
 | 
| 
 | 
   112   die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   113 }
 | 
| 
 | 
   114 if(not defined $exon_dist_column){
 | 
| 
 | 
   115   die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
 | 
| 
 | 
   116 }
 | 
| 
 | 
   117 
 | 
| 
 | 
   118 while(<IN>){
 | 
| 
 | 
   119   chomp;
 | 
| 
 | 
   120   my @F = split /\t/, $_, -1;
 | 
| 
 | 
   121   my $chr = $F[$chr_column];
 | 
| 
 | 
   122   my $pos = $F[$pos_column];
 | 
| 
 | 
   123   my $alt = $F[$alt_column];
 | 
| 
 | 
   124   my $cdna_hgvs = $F[$cdna_hgvs_column];
 | 
| 
 | 
   125   my $hgvs = $F[$aa_hgvs_column];
 | 
| 
 | 
   126   my $ftype = $F[$ftype_column];
 | 
| 
 | 
   127   my $mtype;
 | 
| 
 | 
   128   $mtype = $F[$mtype_column] if defined $mtype_column;
 | 
| 
 | 
   129   $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
 | 
| 
 | 
   130   my $exon_edge_distance = $F[$exon_dist_column];
 | 
| 
 | 
   131   my $phase = $F[$phase_column] || "";
 | 
| 
 | 
   132   # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
 | 
| 
 | 
   133   if($phase =~ /CNV-\S+?:(\S+)/){
 | 
| 
 | 
   134     $pos = $1;
 | 
| 
 | 
   135   }
 | 
| 
 | 
   136   my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
 | 
| 
 | 
   137   my $id_rank = $F[$transcript_column];
 | 
| 
 | 
   138   $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
 | 
| 
 | 
   139 
 | 
| 
 | 
   140   my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
 | 
| 
 | 
   141   # Same variant
 | 
| 
 | 
   142   if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
 | 
| 
 | 
   143     # same AA effect
 | 
| 
 | 
   144     if(not exists $buffered_F{$collapse_key}){
 | 
| 
 | 
   145       $buffered_F{$collapse_key} = \@F;
 | 
| 
 | 
   146       $buffered_id_rank{$collapse_key} = $id_rank;
 | 
| 
 | 
   147     }
 | 
| 
 | 
   148     else{
 | 
| 
 | 
   149       $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
 | 
| 
 | 
   150       $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
 | 
| 
 | 
   151       $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
 | 
| 
 | 
   152       my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
 | 
| 
 | 
   153       $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
 | 
| 
 | 
   154       my $buf_mtype;
 | 
| 
 | 
   155       $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
 | 
| 
 | 
   156       $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
 | 
| 
 | 
   157       # see if this transcript is "earlier" than the other, based on ID #
 | 
| 
 | 
   158       if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or 
 | 
| 
 | 
   159          (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
 | 
| 
 | 
   160          $preferred_id or
 | 
| 
 | 
   161           ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) 
 | 
| 
 | 
   162          or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
 | 
| 
 | 
   163         # make this the first one reported (and use its HGVS syntax)
 | 
| 
 | 
   164         $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
 | 
| 
 | 
   165         $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
 | 
| 
 | 
   166         $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
 | 
| 
 | 
   167         $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
 | 
| 
 | 
   168         $buffered_id_rank{$collapse_key} = $id_rank;
 | 
| 
 | 
   169       }
 | 
| 
 | 
   170       else{
 | 
| 
 | 
   171         # just append it to the list of IDs
 | 
| 
 | 
   172         $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
 | 
| 
 | 
   173         $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
 | 
| 
 | 
   174       }
 | 
| 
 | 
   175       if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
 | 
| 
 | 
   176         $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
 | 
| 
 | 
   177         $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
 | 
| 
 | 
   178       }
 | 
| 
 | 
   179     }
 | 
| 
 | 
   180   }
 | 
| 
 | 
   181   # Different variant from the last line, dump any buffered data
 | 
| 
 | 
   182   else{
 | 
| 
 | 
   183     for my $collapse_key (keys %buffered_F){
 | 
| 
 | 
   184       if(defined $sources_column){
 | 
| 
 | 
   185         my %seen;
 | 
| 
 | 
   186         $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
 | 
| 
 | 
   187       }
 | 
| 
 | 
   188       print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
 | 
| 
 | 
   189     }
 | 
| 
 | 
   190     undef %buffered_F;
 | 
| 
 | 
   191     $last_chr = $chr;
 | 
| 
 | 
   192     $last_pos = $pos;
 | 
| 
 | 
   193     $last_alt = $alt;
 | 
| 
 | 
   194     $buffered_F{$collapse_key} = \@F;
 | 
| 
 | 
   195     $buffered_id_rank{$collapse_key} = $id_rank;
 | 
| 
 | 
   196   }
 | 
| 
 | 
   197 }
 | 
| 
 | 
   198 for my $collapse_key (keys %buffered_F){
 | 
| 
 | 
   199   print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; 
 | 
| 
 | 
   200 }
 | 
| 
 | 
   201 close(IN);
 |