Mercurial > repos > yusuf > group_alt_variants
comparison hgvs_collapse_transcripts @ 0:7c94246126b0 default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:35:53 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:7c94246126b0 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different | |
| 7 @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n"; | |
| 8 | |
| 9 my %ftype_rank = ( protein_coding => 100, | |
| 10 processed_transcript => 90, | |
| 11 antisense => 80, | |
| 12 retained_intron => 70, | |
| 13 lincRNA => 60, | |
| 14 nonsense_mediated_decay => 50, | |
| 15 misc_enrichment_kit_target => 0); | |
| 16 | |
| 17 my %mtype_rank = ( nonsense => 100, | |
| 18 frameshift => 99, | |
| 19 nonstop => 90, | |
| 20 missense => 80, | |
| 21 silent => 50, | |
| 22 "non-coding" => 40); | |
| 23 open(IN, $ARGV[0]) | |
| 24 or die "Cannot open $ARGV[0] for reading: $!\n"; | |
| 25 open(OUT, ">$ARGV[1]") | |
| 26 or die "Cannot open $ARGV[1] for writing: $!\n"; | |
| 27 my $succinct = (@ARGV == 3); | |
| 28 my @lines; | |
| 29 my $last_chr = ""; | |
| 30 my $last_pos = ""; | |
| 31 my $last_alt = ""; | |
| 32 my %buffered_F; | |
| 33 my %buffered_id_rank; | |
| 34 my $header = <IN>; | |
| 35 print OUT $header; | |
| 36 | |
| 37 my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column); | |
| 38 chomp $header; | |
| 39 my @headers = split /\t/, $header; | |
| 40 for(my $i = 0; $i <= $#headers; $i++){ | |
| 41 if($headers[$i] eq "Chr"){ | |
| 42 $chr_column = $i; | |
| 43 } | |
| 44 elsif($headers[$i] eq "Feature type"){ | |
| 45 $ftype_column = $i; | |
| 46 } | |
| 47 elsif($headers[$i] eq "Variant type"){ | |
| 48 $mtype_column = $i; | |
| 49 } | |
| 50 elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){ | |
| 51 $pos_column = $i; | |
| 52 } | |
| 53 elsif($headers[$i] eq "Obs base"){ | |
| 54 $alt_column = $i; | |
| 55 } | |
| 56 elsif($headers[$i] eq "Transcript HGVS"){ | |
| 57 $cdna_hgvs_column = $i; | |
| 58 } | |
| 59 elsif($headers[$i] eq "Protein HGVS"){ | |
| 60 $aa_hgvs_column = $i; | |
| 61 } | |
| 62 elsif($headers[$i] eq "Phase"){ | |
| 63 $phase_column = $i; | |
| 64 } | |
| 65 elsif($headers[$i] eq "Selected transcript"){ | |
| 66 $transcript_column = $i; | |
| 67 } | |
| 68 elsif($headers[$i] eq "Transcript length"){ | |
| 69 $transcript_length_column = $i; | |
| 70 } | |
| 71 elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){ | |
| 72 $exon_dist_column = $i; | |
| 73 } | |
| 74 elsif($headers[$i] eq "Splicing alteration potential"){ | |
| 75 $splicing_score_column = $i; | |
| 76 } | |
| 77 elsif($headers[$i] eq "Splicing alteration details"){ | |
| 78 $splicing_effect_column = $i; | |
| 79 } | |
| 80 elsif($headers[$i] eq "Sources"){ | |
| 81 $sources_column = $i; | |
| 82 } | |
| 83 } | |
| 84 if(not defined $chr_column){ | |
| 85 die "Could not find Chr header in $ARGV[0], aborting.\n"; | |
| 86 } | |
| 87 if(not defined $pos_column){ | |
| 88 die "Could not find 'DNA From' header in $ARGV[0], aborting.\n"; | |
| 89 } | |
| 90 if(not defined $alt_column){ | |
| 91 die "Could not find 'Obs base' header in $ARGV[0], aborting.\n"; | |
| 92 } | |
| 93 if(not defined $cdna_hgvs_column){ | |
| 94 die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n"; | |
| 95 } | |
| 96 if(not defined $aa_hgvs_column){ | |
| 97 die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n"; | |
| 98 } | |
| 99 if(not defined $phase_column){ | |
| 100 die "Could not find Phase header in $ARGV[0], aborting.\n"; | |
| 101 } | |
| 102 if(not defined $transcript_column){ | |
| 103 die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n"; | |
| 104 } | |
| 105 if(not defined $transcript_length_column){ | |
| 106 die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n"; | |
| 107 } | |
| 108 #if(not defined $mtype_column){ | |
| 109 # die "Could not find 'Variant type' header in $ARGV[0], aborting.\n"; | |
| 110 #} | |
| 111 if(not defined $ftype_column){ | |
| 112 die "Could not find 'Feature type' header in $ARGV[0], aborting.\n"; | |
| 113 } | |
| 114 if(not defined $exon_dist_column){ | |
| 115 die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n"; | |
| 116 } | |
| 117 | |
| 118 while(<IN>){ | |
| 119 chomp; | |
| 120 my @F = split /\t/, $_, -1; | |
| 121 my $chr = $F[$chr_column]; | |
| 122 my $pos = $F[$pos_column]; | |
| 123 my $alt = $F[$alt_column]; | |
| 124 my $cdna_hgvs = $F[$cdna_hgvs_column]; | |
| 125 my $hgvs = $F[$aa_hgvs_column]; | |
| 126 my $ftype = $F[$ftype_column]; | |
| 127 my $mtype; | |
| 128 $mtype = $F[$mtype_column] if defined $mtype_column; | |
| 129 $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax | |
| 130 my $exon_edge_distance = $F[$exon_dist_column]; | |
| 131 my $phase = $F[$phase_column] || ""; | |
| 132 # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such | |
| 133 if($phase =~ /CNV-\S+?:(\S+)/){ | |
| 134 $pos = $1; | |
| 135 } | |
| 136 my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0; | |
| 137 my $id_rank = $F[$transcript_column]; | |
| 138 $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix) | |
| 139 | |
| 140 my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase"; | |
| 141 # Same variant | |
| 142 if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){ | |
| 143 # same AA effect | |
| 144 if(not exists $buffered_F{$collapse_key}){ | |
| 145 $buffered_F{$collapse_key} = \@F; | |
| 146 $buffered_id_rank{$collapse_key} = $id_rank; | |
| 147 } | |
| 148 else{ | |
| 149 $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column; | |
| 150 $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype}; | |
| 151 $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype}; | |
| 152 my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column]; | |
| 153 $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype}; | |
| 154 my $buf_mtype; | |
| 155 $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column; | |
| 156 $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype}; | |
| 157 # see if this transcript is "earlier" than the other, based on ID # | |
| 158 if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or | |
| 159 (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or | |
| 160 $preferred_id or | |
| 161 ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) | |
| 162 or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup | |
| 163 # make this the first one reported (and use its HGVS syntax) | |
| 164 $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column]; | |
| 165 $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column]; | |
| 166 $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column]; | |
| 167 $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column]; | |
| 168 $buffered_id_rank{$collapse_key} = $id_rank; | |
| 169 } | |
| 170 else{ | |
| 171 # just append it to the list of IDs | |
| 172 $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]"; | |
| 173 $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]"; | |
| 174 } | |
| 175 if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){ | |
| 176 $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column]; | |
| 177 $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")"; | |
| 178 } | |
| 179 } | |
| 180 } | |
| 181 # Different variant from the last line, dump any buffered data | |
| 182 else{ | |
| 183 for my $collapse_key (keys %buffered_F){ | |
| 184 if(defined $sources_column){ | |
| 185 my %seen; | |
| 186 $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column])); | |
| 187 } | |
| 188 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; | |
| 189 } | |
| 190 undef %buffered_F; | |
| 191 $last_chr = $chr; | |
| 192 $last_pos = $pos; | |
| 193 $last_alt = $alt; | |
| 194 $buffered_F{$collapse_key} = \@F; | |
| 195 $buffered_id_rank{$collapse_key} = $id_rank; | |
| 196 } | |
| 197 } | |
| 198 for my $collapse_key (keys %buffered_F){ | |
| 199 print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; | |
| 200 } | |
| 201 close(IN); |
