Mercurial > repos > yusuf > poor_gene_coverage
diff hgvs_collapse_transcripts @ 0:7cdd13ff182a default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 15:49:28 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgvs_collapse_transcripts Wed Mar 25 15:49:28 2015 -0600 @@ -0,0 +1,201 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +# reports the variant in transcripts separately only if the AA change is different, or distance from splicing site is different +@ARGV == 2 or @ARGV == 3 or die "Usage: $0 <hgvs input.txt> <output.txt> [ignore splice distance diff]\n"; + +my %ftype_rank = ( protein_coding => 100, + processed_transcript => 90, + antisense => 80, + retained_intron => 70, + lincRNA => 60, + nonsense_mediated_decay => 50, + misc_enrichment_kit_target => 0); + +my %mtype_rank = ( nonsense => 100, + frameshift => 99, + nonstop => 90, + missense => 80, + silent => 50, + "non-coding" => 40); +open(IN, $ARGV[0]) + or die "Cannot open $ARGV[0] for reading: $!\n"; +open(OUT, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; +my $succinct = (@ARGV == 3); +my @lines; +my $last_chr = ""; +my $last_pos = ""; +my $last_alt = ""; +my %buffered_F; +my %buffered_id_rank; +my $header = <IN>; +print OUT $header; + +my ($chr_column, $pos_column, $alt_column, $cdna_hgvs_column, $aa_hgvs_column, $phase_column, $transcript_column, $transcript_length_column, $exon_dist_column, $ftype_column, $mtype_column, $splicing_score_column, $splicing_effect_column, $sources_column); +chomp $header; +my @headers = split /\t/, $header; +for(my $i = 0; $i <= $#headers; $i++){ + if($headers[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($headers[$i] eq "Feature type"){ + $ftype_column = $i; + } + elsif($headers[$i] eq "Variant type"){ + $mtype_column = $i; + } + elsif($headers[$i] eq "DNA From" or $headers[$i] eq "DNA Pos"){ + $pos_column = $i; + } + elsif($headers[$i] eq "Obs base"){ + $alt_column = $i; + } + elsif($headers[$i] eq "Transcript HGVS"){ + $cdna_hgvs_column = $i; + } + elsif($headers[$i] eq "Protein HGVS"){ + $aa_hgvs_column = $i; + } + elsif($headers[$i] eq "Phase"){ + $phase_column = $i; + } + elsif($headers[$i] eq "Selected transcript"){ + $transcript_column = $i; + } + elsif($headers[$i] eq "Transcript length"){ + $transcript_length_column = $i; + } + elsif($headers[$i] eq "Closest exon junction (AA coding variants)"){ + $exon_dist_column = $i; + } + elsif($headers[$i] eq "Splicing alteration potential"){ + $splicing_score_column = $i; + } + elsif($headers[$i] eq "Splicing alteration details"){ + $splicing_effect_column = $i; + } + elsif($headers[$i] eq "Sources"){ + $sources_column = $i; + } +} +if(not defined $chr_column){ + die "Could not find Chr header in $ARGV[0], aborting.\n"; +} +if(not defined $pos_column){ + die "Could not find 'DNA From' header in $ARGV[0], aborting.\n"; +} +if(not defined $alt_column){ + die "Could not find 'Obs base' header in $ARGV[0], aborting.\n"; +} +if(not defined $cdna_hgvs_column){ + die "Could not find 'Transcript HGVS' header in $ARGV[0], aborting.\n"; +} +if(not defined $aa_hgvs_column){ + die "Could not find 'Protein HGVS' header in $ARGV[0], aborting.\n"; +} +if(not defined $phase_column){ + die "Could not find Phase header in $ARGV[0], aborting.\n"; +} +if(not defined $transcript_column){ + die "Could not find 'Selected transcript' header in $ARGV[0], aborting.\n"; +} +if(not defined $transcript_length_column){ + die "Could not find 'Transcript length' header in $ARGV[0], aborting.\n"; +} +#if(not defined $mtype_column){ +# die "Could not find 'Variant type' header in $ARGV[0], aborting.\n"; +#} +if(not defined $ftype_column){ + die "Could not find 'Feature type' header in $ARGV[0], aborting.\n"; +} +if(not defined $exon_dist_column){ + die "Could not find 'Closest exon junction (AA coding variants)' header in $ARGV[0], aborting.\n"; +} + +while(<IN>){ + chomp; + my @F = split /\t/, $_, -1; + my $chr = $F[$chr_column]; + my $pos = $F[$pos_column]; + my $alt = $F[$alt_column]; + my $cdna_hgvs = $F[$cdna_hgvs_column]; + my $hgvs = $F[$aa_hgvs_column]; + my $ftype = $F[$ftype_column]; + my $mtype; + $mtype = $F[$mtype_column] if defined $mtype_column; + $hgvs =~ s/\d+//g; # look only at the non-position parts of the AA syntax + my $exon_edge_distance = $F[$exon_dist_column]; + my $phase = $F[$phase_column] || ""; + # in the case of large indels (i.e. CNVs), their positions may not be the same, but effectively they should be reported as such + if($phase =~ /CNV-\S+?:(\S+)/){ + $pos = $1; + } + my $preferred_id = $succinct ? ($F[$transcript_column] =~ /^$succinct/o) : 0; + my $id_rank = $F[$transcript_column]; + $id_rank =~ s/^.*?0*(\d+)(\.\d+)?$/$1/; # look at only trailing non-padded number (and no .version suffix) + + my $collapse_key = $succinct ? "$chr:$pos:$alt" : "$chr:$pos:$hgvs:$exon_edge_distance:$phase"; + # Same variant + if($chr eq $last_chr and $pos eq $last_pos and $alt eq $last_alt){ + # same AA effect + if(not exists $buffered_F{$collapse_key}){ + $buffered_F{$collapse_key} = \@F; + $buffered_id_rank{$collapse_key} = $id_rank; + } + else{ + $buffered_F{$collapse_key}->[$sources_column] .= "; $F[$sources_column]" if defined $sources_column; + $ftype_rank{$ftype} = 0 if not defined $ftype_rank{$ftype}; + $mtype_rank{$mtype} = 0 if defined $mtype and not exists $mtype_rank{$mtype}; + my $buf_ftype = $buffered_F{$collapse_key}->[$ftype_column]; + $ftype_rank{$buf_ftype} = 0 if not defined $ftype_rank{$buf_ftype}; + my $buf_mtype; + $buf_mtype = $buffered_F{$collapse_key}->[$mtype_column] if defined $mtype_column; + $mtype_rank{$buf_mtype} = 0 if defined $buf_mtype and not exists $mtype_rank{$buf_mtype}; + # see if this transcript is "earlier" than the other, based on ID # + if($ftype_rank{$ftype} > $ftype_rank{$buf_ftype} or + (defined $mtype and $mtype_rank{$mtype} > $mtype_rank{$buf_mtype}) or + $preferred_id or + ($id_rank =~ /^\d+$/ and $buffered_id_rank{$collapse_key} =~ /^\d+$/ and $id_rank < $buffered_id_rank{$collapse_key}) + or $F[$transcript_column] lt $buffered_F{$collapse_key}->[$transcript_column]){ # alphabetical as backup + # make this the first one reported (and use its HGVS syntax) + $buffered_F{$collapse_key}->[$transcript_length_column] = $F[$transcript_length_column]."; ".$buffered_F{$collapse_key}->[$transcript_length_column]; + $buffered_F{$collapse_key}->[$transcript_column] = $F[$transcript_column]."; ".$buffered_F{$collapse_key}->[$transcript_column]; + $buffered_F{$collapse_key}->[$cdna_hgvs_column] = $F[$cdna_hgvs_column]; + $buffered_F{$collapse_key}->[$aa_hgvs_column] = $F[$aa_hgvs_column]; + $buffered_id_rank{$collapse_key} = $id_rank; + } + else{ + # just append it to the list of IDs + $buffered_F{$collapse_key}->[$transcript_length_column] .= "; $F[$transcript_length_column]"; + $buffered_F{$collapse_key}->[$transcript_column] .= "; $F[$transcript_column]"; + } + if($succinct and defined $splicing_score_column and $F[$splicing_score_column] ne "NA" and $F[$splicing_score_column] > $buffered_F{$collapse_key}->[$splicing_score_column]){ + $buffered_F{$collapse_key}->[$splicing_score_column] = $F[$splicing_score_column]; + $buffered_F{$collapse_key}->[$splicing_effect_column] = $F[$splicing_effect_column] ." (transcript model ".$F[$transcript_column].")"; + } + } + } + # Different variant from the last line, dump any buffered data + else{ + for my $collapse_key (keys %buffered_F){ + if(defined $sources_column){ + my %seen; + $buffered_F{$collapse_key}->[$sources_column] = join("; ", grep {not $seen{$_}++} split(/; /, $buffered_F{$collapse_key}->[$sources_column])); + } + print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; + } + undef %buffered_F; + $last_chr = $chr; + $last_pos = $pos; + $last_alt = $alt; + $buffered_F{$collapse_key} = \@F; + $buffered_id_rank{$collapse_key} = $id_rank; + } +} +for my $collapse_key (keys %buffered_F){ + print OUT join("\t", @{$buffered_F{$collapse_key}}), "\n"; +} +close(IN);