0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different
|
|
7 @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n";
|
|
8
|
|
9 my %ftype_rank = ( protein_coding => 100,
|
|
10 processed_transcript => 90,
|
|
11 antisense => 80,
|
|
12 retained_intron => 70,
|
|
13 lincRNA => 60,
|
|
14 nonsense_mediated_decay => 50,
|
|
15 misc_enrichment_kit_target => 0);
|
|
16
|
|
17 my %mtype_rank = ( nonsense => 100,
|
|
18 frameshift => 99,
|
|
19 nonstop => 90,
|
|
20 missense => 80,
|
|
21 silent => 50,
|
|
22 "non-coding" => 40);
|
|
23 open(IN, $ARGV[0])
|
|
24 or die "Cannot open $ARGV[0] for reading: $!\n";
|
|
25 open(OUT, ">$ARGV[1]")
|
|
26 or die "Cannot open $ARGV[1] for writing: $!\n";
|
|
27 my $succinct = (@ARGV == 3);
|
|
28 my @lines;
|
|
29 my $last_chr = "";
|
|
30 my $last_pos = "";
|
|
31 my $last_alt = "";
|
|
32 my %buffered_F;
|
|
33 my %buffered_id_rank;
|
|
34 my $header = <IN>;
|
|
35 print OUT $header;
|
|
36
|
|
37 my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column);
|
|
38 chomp $header;
|
|
39 my @headers = split /\t/, $header;
|
|
40 for(my $i = 0; $i <= $#headers; $i++){
|
|
41 if($headers[$i] eq "Chr"){
|
|
42 $chr_column = $i;
|
|
43 }
|
|
44 elsif($headers[$i] eq "Feature type"){
|
|
45 $ftype_column = $i;
|
|
46 }
|
|
47 elsif($headers[$i] eq "Variant type"){
|
|
48 $mtype_column = $i;
|
|
49 }
|
|
50 elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){
|
|
51 $pos_column = $i;
|
|
52 }
|
|
53 elsif($headers[$i] eq "Obs base"){
|
|
54 $alt_column = $i;
|
|
55 }
|
|
56 elsif($headers[$i] eq "Transcript HGVS"){
|
|
57 $cdna_hgvs_column = $i;
|
|
58 }
|
|
59 elsif($headers[$i] eq "Protein HGVS"){
|
|
60 $aa_hgvs_column = $i;
|
|
61 }
|
|
62 elsif($headers[$i] eq "Phase"){
|
|
63 $phase_column = $i;
|
|
64 }
|
|
65 elsif($headers[$i] eq "Selected transcript"){
|
|
66 $transcript_column = $i;
|
|
67 }
|
|
68 elsif($headers[$i] eq "Transcript length"){
|
|
69 $transcript_length_column = $i;
|
|
70 }
|
|
71 elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){
|
|
72 $exon_dist_column = $i;
|
|
73 }
|
|
74 elsif($headers[$i] eq "Splicing alteration potential"){
|
|
75 $splicing_score_column = $i;
|
|
76 }
|
|
77 elsif($headers[$i] eq "Splicing alteration details"){
|
|
78 $splicing_effect_column = $i;
|
|
79 }
|
|
80 elsif($headers[$i] eq "Sources"){
|
|
81 $sources_column = $i;
|
|
82 }
|
|
83 }
|
|
84 if(not defined $chr_column){
|
|
85 die "Could not find Chr header in $ARGV[0], aborting.\n";
|
|
86 }
|
|
87 if(not defined $pos_column){
|
|
88 die "Could not find 'DNA From' header in $ARGV[0], aborting.\n";
|
|
89 }
|
|
90 if(not defined $alt_column){
|
|
91 die "Could not find 'Obs base' header in $ARGV[0], aborting.\n";
|
|
92 }
|
|
93 if(not defined $cdna_hgvs_column){
|
|
94 die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n";
|
|
95 }
|
|
96 if(not defined $aa_hgvs_column){
|
|
97 die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n";
|
|
98 }
|
|
99 if(not defined $phase_column){
|
|
100 die "Could not find Phase header in $ARGV[0], aborting.\n";
|
|
101 }
|
|
102 if(not defined $transcript_column){
|
|
103 die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n";
|
|
104 }
|
|
105 if(not defined $transcript_length_column){
|
|
106 die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n";
|
|
107 }
|
|
108 #if(not defined $mtype_column){
|
|
109 # die "Could not find 'Variant type' header in $ARGV[0], aborting.\n";
|
|
110 #}
|
|
111 if(not defined $ftype_column){
|
|
112 die "Could not find 'Feature type' header in $ARGV[0], aborting.\n";
|
|
113 }
|
|
114 if(not defined $exon_dist_column){
|
|
115 die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n";
|
|
116 }
|
|
117
|
|
118 while(<IN>){
|
|
119 chomp;
|
|
120 my @F = split /\t/, $_, -1;
|
|
121 my $chr = $F[$chr_column];
|
|
122 my $pos = $F[$pos_column];
|
|
123 my $alt = $F[$alt_column];
|
|
124 my $cdna_hgvs = $F[$cdna_hgvs_column];
|
|
125 my $hgvs = $F[$aa_hgvs_column];
|
|
126 my $ftype = $F[$ftype_column];
|
|
127 my $mtype;
|
|
128 $mtype = $F[$mtype_column] if defined $mtype_column;
|
|
129 $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax
|
|
130 my $exon_edge_distance = $F[$exon_dist_column];
|
|
131 my $phase = $F[$phase_column] || "";
|
|
132 # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such
|
|
133 if($phase =~ /CNV-\S+?:(\S+)/){
|
|
134 $pos = $1;
|
|
135 }
|
|
136 my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0;
|
|
137 my $id_rank = $F[$transcript_column];
|
|
138 $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix)
|
|
139
|
|
140 my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase";
|
|
141 # Same variant
|
|
142 if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){
|
|
143 # same AA effect
|
|
144 if(not exists $buffered_F{$collapse_key}){
|
|
145 $buffered_F{$collapse_key} = \@F;
|
|
146 $buffered_id_rank{$collapse_key} = $id_rank;
|
|
147 }
|
|
148 else{
|
|
149 $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column;
|
|
150 $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype};
|
|
151 $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype};
|
|
152 my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column];
|
|
153 $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype};
|
|
154 my $buf_mtype;
|
|
155 $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column;
|
|
156 $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype};
|
|
157 # see if this transcript is "earlier" than the other, based on ID #
|
|
158 if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or
|
|
159 (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or
|
|
160 $preferred_id or
|
|
161 ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key})
|
|
162 or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup
|
|
163 # make this the first one reported (and use its HGVS syntax)
|
|
164 $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column];
|
|
165 $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column];
|
|
166 $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column];
|
|
167 $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column];
|
|
168 $buffered_id_rank{$collapse_key} = $id_rank;
|
|
169 }
|
|
170 else{
|
|
171 # just append it to the list of IDs
|
|
172 $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]";
|
|
173 $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]";
|
|
174 }
|
|
175 if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){
|
|
176 $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column];
|
|
177 $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")";
|
|
178 }
|
|
179 }
|
|
180 }
|
|
181 # Different variant from the last line, dump any buffered data
|
|
182 else{
|
|
183 for my $collapse_key (keys %buffered_F){
|
|
184 if(defined $sources_column){
|
|
185 my %seen;
|
|
186 $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column]));
|
|
187 }
|
|
188 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
|
|
189 }
|
|
190 undef %buffered_F;
|
|
191 $last_chr = $chr;
|
|
192 $last_pos = $pos;
|
|
193 $last_alt = $alt;
|
|
194 $buffered_F{$collapse_key} = \@F;
|
|
195 $buffered_id_rank{$collapse_key} = $id_rank;
|
|
196 }
|
|
197 }
|
|
198 for my $collapse_key (keys %buffered_F){
|
|
199 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n";
|
|
200 }
|
|
201 close(IN);
|