0
+ − 1 #!/usr/bin/env perl
+ − 2
+ − 3 use strict;
+ − 4 use warnings;
+ − 5
+ − 6 # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
+ − 7 @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
+ − 8
+ − 9 my %ftype_rank = ( protein_coding => 100,
+ − 10 processed_transcript => 90,
+ − 11 antisense => 80,
+ − 12 retained_intron => 70,
+ − 13 lincRNA => 60,
+ − 14 nonsense_mediated_decay => 50,
+ − 15 misc_enrichment_kit_target => 0);
+ − 16
+ − 17 my %mtype_rank = ( nonsense => 100,
+ − 18 frameshift => 99,
+ − 19 nonstop => 90,
+ − 20 missense => 80,
+ − 21 silent => 50,
+ − 22 "non-coding" => 40);
+ − 23 open(IN, $ARGV[0])
+ − 24 or die "Cannot open $ARGV[0] for reading: $!\n";
+ − 25 open(OUT, ">$ARGV[1]")
+ − 26 or die "Cannot open $ARGV[1] for writing: $!\n";
+ − 27 my $succinct = (@ARGV == 3);
+ − 28 my @lines;
+ − 29 my $last_chr = "";
+ − 30 my $last_pos = "";
+ − 31 my $last_alt = "";
+ − 32 my %buffered_F;
+ − 33 my %buffered_id_rank;
+ − 34 my $header = <IN>;
+ − 35 print OUT $header;
+ − 36
+ − 37 my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
+ − 38 chomp $header;
+ − 39 my @headers = split /\t/, $header;
+ − 40 for(my $i = 0; $i <= $#headers; $i++){
+ − 41 if($headers[$i] eq "Chr"){
+ − 42 $chr_column = $i;
+ − 43 }
+ − 44 elsif($headers[$i] eq "Feature type"){
+ − 45 $ftype_column = $i;
+ − 46 }
+ − 47 elsif($headers[$i] eq "Variant type"){
+ − 48 $mtype_column = $i;
+ − 49 }
+ − 50 elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
+ − 51 $pos_column = $i;
+ − 52 }
+ − 53 elsif($headers[$i] eq "Obs base"){
+ − 54 $alt_column = $i;
+ − 55 }
+ − 56 elsif($headers[$i] eq "Transcript HGVS"){
+ − 57 $cdna_hgvs_column = $i;
+ − 58 }
+ − 59 elsif($headers[$i] eq "Protein HGVS"){
+ − 60 $aa_hgvs_column = $i;
+ − 61 }
+ − 62 elsif($headers[$i] eq "Phase"){
+ − 63 $phase_column = $i;
+ − 64 }
+ − 65 elsif($headers[$i] eq "Selected transcript"){
+ − 66 $transcript_column = $i;
+ − 67 }
+ − 68 elsif($headers[$i] eq "Transcript length"){
+ − 69 $transcript_length_column = $i;
+ − 70 }
+ − 71 elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
+ − 72 $exon_dist_column = $i;
+ − 73 }
+ − 74 elsif($headers[$i] eq "Splicing alteration potential"){
+ − 75 $splicing_score_column = $i;
+ − 76 }
+ − 77 elsif($headers[$i] eq "Splicing alteration details"){
+ − 78 $splicing_effect_column = $i;
+ − 79 }
+ − 80 elsif($headers[$i] eq "Sources"){
+ − 81 $sources_column = $i;
+ − 82 }
+ − 83 }
+ − 84 if(not defined $chr_column){
+ − 85 die "Could not find Chr header in $ARGV[0], aborting.\n";
+ − 86 }
+ − 87 if(not defined $pos_column){
+ − 88 die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
+ − 89 }
+ − 90 if(not defined $alt_column){
+ − 91 die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
+ − 92 }
+ − 93 if(not defined $cdna_hgvs_column){
+ − 94 die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
+ − 95 }
+ − 96 if(not defined $aa_hgvs_column){
+ − 97 die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
+ − 98 }
+ − 99 if(not defined $phase_column){
+ − 100 die "Could not find Phase header in $ARGV[0], aborting.\n";
+ − 101 }
+ − 102 if(not defined $transcript_column){
+ − 103 die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
+ − 104 }
+ − 105 if(not defined $transcript_length_column){
+ − 106 die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
+ − 107 }
+ − 108 #if(not defined $mtype_column){
+ − 109 # die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
+ − 110 #}
+ − 111 if(not defined $ftype_column){
+ − 112 die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
+ − 113 }
+ − 114 if(not defined $exon_dist_column){
+ − 115 die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
+ − 116 }
+ − 117
+ − 118 while(<IN>){
+ − 119 chomp;
+ − 120 my @F = split /\t/, $_, -1;
+ − 121 my $chr = $F[$chr_column];
+ − 122 my $pos = $F[$pos_column];
+ − 123 my $alt = $F[$alt_column];
+ − 124 my $cdna_hgvs = $F[$cdna_hgvs_column];
+ − 125 my $hgvs = $F[$aa_hgvs_column];
+ − 126 my $ftype = $F[$ftype_column];
+ − 127 my $mtype;
+ − 128 $mtype = $F[$mtype_column] if defined $mtype_column;
+ − 129 $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
+ − 130 my $exon_edge_distance = $F[$exon_dist_column];
+ − 131 my $phase = $F[$phase_column] || "";
+ − 132 # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
+ − 133 if($phase =~ /CNV-\S+?:(\S+)/){
+ − 134 $pos = $1;
+ − 135 }
+ − 136 my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
+ − 137 my $id_rank = $F[$transcript_column];
+ − 138 $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
+ − 139
+ − 140 my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
+ − 141 # Same variant
+ − 142 if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
+ − 143 # same AA effect
+ − 144 if(not exists $buffered_F{$collapse_key}){
+ − 145 $buffered_F{$collapse_key} = \@F;
+ − 146 $buffered_id_rank{$collapse_key} = $id_rank;
+ − 147 }
+ − 148 else{
+ − 149 $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
+ − 150 $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
+ − 151 $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
+ − 152 my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
+ − 153 $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
+ − 154 my $buf_mtype;
+ − 155 $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
+ − 156 $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
+ − 157 # see if this transcript is "earlier" than the other, based on ID #
+ − 158 if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or
+ − 159 (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
+ − 160 $preferred_id or
+ − 161 ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key})
+ − 162 or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
+ − 163 # make this the first one reported (and use its HGVS syntax)
+ − 164 $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
+ − 165 $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
+ − 166 $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
+ − 167 $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
+ − 168 $buffered_id_rank{$collapse_key} = $id_rank;
+ − 169 }
+ − 170 else{
+ − 171 # just append it to the list of IDs
+ − 172 $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
+ − 173 $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
+ − 174 }
+ − 175 if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
+ − 176 $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
+ − 177 $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
+ − 178 }
+ − 179 }
+ − 180 }
+ − 181 # Different variant from the last line, dump any buffered data
+ − 182 else{
+ − 183 for my $collapse_key (keys %buffered_F){
+ − 184 if(defined $sources_column){
+ − 185 my %seen;
+ − 186 $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
+ − 187 }
+ − 188 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+ − 189 }
+ − 190 undef %buffered_F;
+ − 191 $last_chr = $chr;
+ − 192 $last_pos = $pos;
+ − 193 $last_alt = $alt;
+ − 194 $buffered_F{$collapse_key} = \@F;
+ − 195 $buffered_id_rank{$collapse_key} = $id_rank;
+ − 196 }
+ − 197 }
+ − 198 for my $collapse_key (keys %buffered_F){
+ − 199 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
+ − 200 }
+ − 201 close(IN);