Mercurial > repos > yusuf > miseq_bam_variants
view hgvs_collapse_transcripts @ 0:1a23ea467feb default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Thu, 26 Mar 2015 09:36:17 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; # reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different @ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n"; my %ftype_rank = ( protein_coding => 100, processed_transcript => 90, antisense => 80, retained_intron => 70, lincRNA => 60, nonsense_mediated_decay => 50, misc_enrichment_kit_target => 0); my %mtype_rank = ( nonsense => 100, frameshift => 99, nonstop => 90, missense => 80, silent => 50, "non-coding" => 40); open(IN, $ARGV[0]) or die "Cannot open $ARGV[0] for reading: $!\n"; open(OUT, ">$ARGV[1]") or die "Cannot open $ARGV[1] for writing: $!\n"; my $succinct = (@ARGV == 3); my @lines; my $last_chr = ""; my $last_pos = ""; my $last_alt = ""; my %buffered_F; my %buffered_id_rank; my $header = <IN>; print OUT $header; my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column); chomp $header; my @headers = split /\t/, $header; for(my $i = 0; $i <= $#headers; $i++){ if($headers[$i] eq "Chr"){ $chr_column = $i; } elsif($headers[$i] eq "Feature type"){ $ftype_column = $i; } elsif($headers[$i] eq "Variant type"){ $mtype_column = $i; } elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){ $pos_column = $i; } elsif($headers[$i] eq "Obs base"){ $alt_column = $i; } elsif($headers[$i] eq "Transcript HGVS"){ $cdna_hgvs_column = $i; } elsif($headers[$i] eq "Protein HGVS"){ $aa_hgvs_column = $i; } elsif($headers[$i] eq "Phase"){ $phase_column = $i; } elsif($headers[$i] eq "Selected transcript"){ $transcript_column = $i; } elsif($headers[$i] eq "Transcript length"){ $transcript_length_column = $i; } elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){ $exon_dist_column = $i; } elsif($headers[$i] eq "Splicing alteration potential"){ $splicing_score_column = $i; } elsif($headers[$i] eq "Splicing alteration details"){ $splicing_effect_column = $i; } elsif($headers[$i] eq "Sources"){ $sources_column = $i; } } if(not defined $chr_column){ die "Could not find Chr header in $ARGV[0], aborting.\n"; } if(not defined $pos_column){ die "Could not find 'DNA From' header in $ARGV[0], aborting.\n"; } if(not defined $alt_column){ die "Could not find 'Obs base' header in $ARGV[0], aborting.\n"; } if(not defined $cdna_hgvs_column){ die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n"; } if(not defined $aa_hgvs_column){ die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n"; } if(not defined $phase_column){ die "Could not find Phase header in $ARGV[0], aborting.\n"; } if(not defined $transcript_column){ die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n"; } if(not defined $transcript_length_column){ die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n"; } #if(not defined $mtype_column){ # die "Could not find 'Variant type' header in $ARGV[0], aborting.\n"; #} if(not defined $ftype_column){ die "Could not find 'Feature type' header in $ARGV[0], aborting.\n"; } if(not defined $exon_dist_column){ die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n"; } while(<IN>){ chomp; my @F = split /\t/, $_, -1; my $chr = $F[$chr_column]; my $pos = $F[$pos_column]; my $alt = $F[$alt_column]; my $cdna_hgvs = $F[$cdna_hgvs_column]; my $hgvs = $F[$aa_hgvs_column]; my $ftype = $F[$ftype_column]; my $mtype; $mtype = $F[$mtype_column] if defined $mtype_column; $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax my $exon_edge_distance = $F[$exon_dist_column]; my $phase = $F[$phase_column] || ""; # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such if($phase =~ /CNV-\S+?:(\S+)/){ $pos = $1; } my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0; my $id_rank = $F[$transcript_column]; $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix) my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase"; # Same variant if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){ # same AA effect if(not exists $buffered_F{$collapse_key}){ $buffered_F{$collapse_key} = \@F; $buffered_id_rank{$collapse_key} = $id_rank; } else{ $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column; $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype}; $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype}; my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column]; $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype}; my $buf_mtype; $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column; $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype}; # see if this transcript is "earlier" than the other, based on ID # if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or $preferred_id or ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup # make this the first one reported (and use its HGVS syntax) $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column]; $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column]; $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column]; $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column]; $buffered_id_rank{$collapse_key} = $id_rank; } else{ # just append it to the list of IDs $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]"; $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]"; } if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){ $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column]; $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")"; } } } # Different variant from the last line, dump any buffered data else{ for my $collapse_key (keys %buffered_F){ if(defined $sources_column){ my %seen; $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column])); } print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; } undef %buffered_F; $last_chr = $chr; $last_pos = $pos; $last_alt = $alt; $buffered_F{$collapse_key} = \@F; $buffered_id_rank{$collapse_key} = $id_rank; } } for my $collapse_key (keys %buffered_F){ print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; } close(IN);