| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 | 
|  | 6 # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different | 
|  | 7 @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n"; | 
|  | 8 | 
|  | 9 my %ftype_rank = ( protein_coding => 100, | 
|  | 10               processed_transcript => 90, | 
|  | 11               antisense => 80, | 
|  | 12               retained_intron => 70, | 
|  | 13               lincRNA => 60, | 
|  | 14               nonsense_mediated_decay => 50, | 
|  | 15               misc_enrichment_kit_target => 0); | 
|  | 16 | 
|  | 17 my %mtype_rank = ( nonsense => 100, | 
|  | 18                       frameshift => 99, | 
|  | 19                       nonstop => 90, | 
|  | 20                       missense => 80, | 
|  | 21                       silent => 50, | 
|  | 22                       "non-coding" => 40); | 
|  | 23 open(IN, $ARGV[0]) | 
|  | 24   or die "Cannot open $ARGV[0] for reading: $!\n"; | 
|  | 25 open(OUT, ">$ARGV[1]") | 
|  | 26   or die "Cannot open $ARGV[1] for writing: $!\n"; | 
|  | 27 my $succinct = (@ARGV == 3); | 
|  | 28 my @lines; | 
|  | 29 my $last_chr = ""; | 
|  | 30 my $last_pos = ""; | 
|  | 31 my $last_alt = ""; | 
|  | 32 my %buffered_F; | 
|  | 33 my %buffered_id_rank; | 
|  | 34 my $header = <IN>; | 
|  | 35 print OUT $header; | 
|  | 36 | 
|  | 37 my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column); | 
|  | 38 chomp $header; | 
|  | 39 my @headers = split /\t/, $header; | 
|  | 40 for(my $i = 0; $i <= $#headers; $i++){ | 
|  | 41   if($headers[$i] eq "Chr"){ | 
|  | 42     $chr_column = $i; | 
|  | 43   } | 
|  | 44   elsif($headers[$i] eq "Feature type"){ | 
|  | 45     $ftype_column = $i; | 
|  | 46   } | 
|  | 47   elsif($headers[$i] eq "Variant type"){ | 
|  | 48     $mtype_column = $i; | 
|  | 49   } | 
|  | 50   elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){ | 
|  | 51     $pos_column = $i; | 
|  | 52   } | 
|  | 53   elsif($headers[$i] eq "Obs base"){ | 
|  | 54     $alt_column = $i; | 
|  | 55   } | 
|  | 56   elsif($headers[$i] eq "Transcript HGVS"){ | 
|  | 57     $cdna_hgvs_column = $i; | 
|  | 58   } | 
|  | 59   elsif($headers[$i] eq "Protein HGVS"){ | 
|  | 60     $aa_hgvs_column = $i; | 
|  | 61   } | 
|  | 62   elsif($headers[$i] eq "Phase"){ | 
|  | 63     $phase_column = $i; | 
|  | 64   } | 
|  | 65   elsif($headers[$i] eq "Selected transcript"){ | 
|  | 66     $transcript_column = $i; | 
|  | 67   } | 
|  | 68   elsif($headers[$i] eq "Transcript length"){ | 
|  | 69     $transcript_length_column = $i; | 
|  | 70   } | 
|  | 71   elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){ | 
|  | 72     $exon_dist_column = $i; | 
|  | 73   } | 
|  | 74   elsif($headers[$i] eq "Splicing alteration potential"){ | 
|  | 75     $splicing_score_column = $i; | 
|  | 76   } | 
|  | 77   elsif($headers[$i] eq "Splicing alteration details"){ | 
|  | 78     $splicing_effect_column = $i; | 
|  | 79   } | 
|  | 80   elsif($headers[$i] eq "Sources"){ | 
|  | 81     $sources_column = $i; | 
|  | 82   } | 
|  | 83 } | 
|  | 84 if(not defined $chr_column){ | 
|  | 85   die "Could not find Chr header in $ARGV[0], aborting.\n"; | 
|  | 86 } | 
|  | 87 if(not defined $pos_column){ | 
|  | 88   die "Could not find 'DNA From' header in $ARGV[0], aborting.\n"; | 
|  | 89 } | 
|  | 90 if(not defined $alt_column){ | 
|  | 91   die "Could not find 'Obs base' header in $ARGV[0], aborting.\n"; | 
|  | 92 } | 
|  | 93 if(not defined $cdna_hgvs_column){ | 
|  | 94   die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n"; | 
|  | 95 } | 
|  | 96 if(not defined $aa_hgvs_column){ | 
|  | 97   die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n"; | 
|  | 98 } | 
|  | 99 if(not defined $phase_column){ | 
|  | 100   die "Could not find Phase header in $ARGV[0], aborting.\n"; | 
|  | 101 } | 
|  | 102 if(not defined $transcript_column){ | 
|  | 103   die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n"; | 
|  | 104 } | 
|  | 105 if(not defined $transcript_length_column){ | 
|  | 106   die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n"; | 
|  | 107 } | 
|  | 108 #if(not defined $mtype_column){ | 
|  | 109 #  die "Could not find 'Variant type' header in $ARGV[0], aborting.\n"; | 
|  | 110 #} | 
|  | 111 if(not defined $ftype_column){ | 
|  | 112   die "Could not find 'Feature type' header in $ARGV[0], aborting.\n"; | 
|  | 113 } | 
|  | 114 if(not defined $exon_dist_column){ | 
|  | 115   die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n"; | 
|  | 116 } | 
|  | 117 | 
|  | 118 while(<IN>){ | 
|  | 119   chomp; | 
|  | 120   my @F = split /\t/, $_, -1; | 
|  | 121   my $chr = $F[$chr_column]; | 
|  | 122   my $pos = $F[$pos_column]; | 
|  | 123   my $alt = $F[$alt_column]; | 
|  | 124   my $cdna_hgvs = $F[$cdna_hgvs_column]; | 
|  | 125   my $hgvs = $F[$aa_hgvs_column]; | 
|  | 126   my $ftype = $F[$ftype_column]; | 
|  | 127   my $mtype; | 
|  | 128   $mtype = $F[$mtype_column] if defined $mtype_column; | 
|  | 129   $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax | 
|  | 130   my $exon_edge_distance = $F[$exon_dist_column]; | 
|  | 131   my $phase = $F[$phase_column] || ""; | 
|  | 132   # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such | 
|  | 133   if($phase =~ /CNV-\S+?:(\S+)/){ | 
|  | 134     $pos = $1; | 
|  | 135   } | 
|  | 136   my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0; | 
|  | 137   my $id_rank = $F[$transcript_column]; | 
|  | 138   $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix) | 
|  | 139 | 
|  | 140   my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase"; | 
|  | 141   # Same variant | 
|  | 142   if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){ | 
|  | 143     # same AA effect | 
|  | 144     if(not exists $buffered_F{$collapse_key}){ | 
|  | 145       $buffered_F{$collapse_key} = \@F; | 
|  | 146       $buffered_id_rank{$collapse_key} = $id_rank; | 
|  | 147     } | 
|  | 148     else{ | 
|  | 149       $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column; | 
|  | 150       $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype}; | 
|  | 151       $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype}; | 
|  | 152       my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column]; | 
|  | 153       $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype}; | 
|  | 154       my $buf_mtype; | 
|  | 155       $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column; | 
|  | 156       $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype}; | 
|  | 157       # see if this transcript is "earlier" than the other, based on ID # | 
|  | 158       if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or | 
|  | 159          (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or | 
|  | 160          $preferred_id or | 
|  | 161           ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) | 
|  | 162          or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup | 
|  | 163         # make this the first one reported (and use its HGVS syntax) | 
|  | 164         $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column]; | 
|  | 165         $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column]; | 
|  | 166         $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column]; | 
|  | 167         $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column]; | 
|  | 168         $buffered_id_rank{$collapse_key} = $id_rank; | 
|  | 169       } | 
|  | 170       else{ | 
|  | 171         # just append it to the list of IDs | 
|  | 172         $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]"; | 
|  | 173         $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]"; | 
|  | 174       } | 
|  | 175       if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){ | 
|  | 176         $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column]; | 
|  | 177         $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")"; | 
|  | 178       } | 
|  | 179     } | 
|  | 180   } | 
|  | 181   # Different variant from the last line, dump any buffered data | 
|  | 182   else{ | 
|  | 183     for my $collapse_key (keys %buffered_F){ | 
|  | 184       if(defined $sources_column){ | 
|  | 185         my %seen; | 
|  | 186         $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column])); | 
|  | 187       } | 
|  | 188       print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; | 
|  | 189     } | 
|  | 190     undef %buffered_F; | 
|  | 191     $last_chr = $chr; | 
|  | 192     $last_pos = $pos; | 
|  | 193     $last_alt = $alt; | 
|  | 194     $buffered_F{$collapse_key} = \@F; | 
|  | 195     $buffered_id_rank{$collapse_key} = $id_rank; | 
|  | 196   } | 
|  | 197 } | 
|  | 198 for my $collapse_key (keys %buffered_F){ | 
|  | 199   print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; | 
|  | 200 } | 
|  | 201 close(IN); |