view hgvs_add_missing_sample_coverage @ 0:a3129cb0af43 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:02:25 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;

# This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
@ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
my $hgvs_in_file = shift @ARGV;
my $hgvs_out_file = shift @ARGV;
my $sample_name = shift @ARGV;
my $bam_file = shift @ARGV;
my $homo_fn = shift @ARGV;
my $het_fn = shift @ARGV;
my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)

open(HGVS, $hgvs_in_file)
  or die "Cannot open $hgvs_in_file for reading: $!\n";
my $header_line = <HGVS>;
chomp $header_line;
my @headers = split /\t/, $header_line;
my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
for(my $i = 0; $i <= $#headers; $i++){
  if($headers[$i] eq "Chr"){
    $chr_column = $i;
  }
  elsif($headers[$i] eq "DNA From"){
    $pos_column = $i;
  }
  elsif($headers[$i] eq "Ref base"){
    $ref_column = $i;
  }
  elsif($headers[$i] eq "Obs base"){
    $alt_column = $i;
  }
  elsif($headers[$i] eq "Zygosity"){
    $zygosity_column = $i;
  }
  elsif($headers[$i] eq "Variant Reads"){
    $alt_cnt_column = $i;
  }
  elsif($headers[$i] eq "Total Reads"){
    $tot_cnt_column = $i;
  }
  elsif($headers[$i] eq "Sources"){
    $srcs_column = $i;
  }
}
die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
open(OUT, ">$hgvs_out_file")
  or die "Cannot open $hgvs_out_file for writing: $!\n";
print OUT $header_line, "\n"; # header line
my ($last_mpileup, $last_key);
while(<HGVS>){
  chomp;
  my @F = split /\t/, $_;
  if(index($F[$#F], $sample_name) != -1){
    print OUT "$_\n";  # keep line as-is
    next;
  }
  else{
    # Not in a call right now
    # If there are reads supporting the call, add the sample name to the list (last column)
    # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
    my $chr = $F[$chr_column];
    my $pos = $F[$pos_column];
    my $ref = $F[$ref_column];
    my $alt = $F[$alt_column];
    
    # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary 
    if(length($ref) > 1 and length($ref) == length($alt)){
      while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
        substr($ref, 0, 1) = "";
        substr($alt, 0, 1) = "";
        $pos++;
      }
      while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
        substr($ref, -1) = "";
        substr($alt, -1) = "";
      }
    }
    
    # (todo: what about compound het?)
    my $zygosity = $F[$zygosity_column];
    # see if the coverage is below a called zygosity threshold
    my $end = $pos;
    # comment out next bit if we assume that reads have all indels left aligned 
    #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
    #  $end += length($alt) - length($ref); 
    #}
    my $mpileup;
    if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
      $mpileup = $last_mpileup;
    }
    else{
      $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
      $last_mpileup = $mpileup;
      $last_key = "$chr:$pos:$ref:$alt";
    }

    $ref = "" if $ref eq "NA";
    $alt = "" if $alt eq "NA";
    # lines look something like chr10	74100774	N	164	G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
    # deletions are -1N, while insertions are +1g
    for my $line (split /\n/, $mpileup){
      my @M = split /\t/, $line;
      next if $M[0] ne $chr; # header or message stuff
      my $depth = $M[3];
      if($depth == 0){ # No reads at all
        $F[$srcs_column] .= "; ~$sample_name";  # add this sample to the last column
        $F[$zygosity_column] .= "; low"; # zygosity unknown
        $F[$alt_cnt_column] .= "; 0"; # alt count
        $F[$tot_cnt_column] .= "; 0"; # tot count
        #print OUT join("\t", @F), "\n";
        last;
      }
      my %base_count;
      while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
        my $call = uc($&);  # upper case version of letter
        if($1){ # call is -1n, or +3gcc, etc.
           if(substr($1, 0, 1) eq "+"){ # insertion
             my $offset = substr($1, 1);
             $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
             pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
           }
        }
        elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
           next; # ignore
        }
        $base_count{$call}++;
      } 
      my $base_count = 0;
      if(length($ref) == 1 and length($alt) == 1){ #SNP
        if(exists $base_count{$alt}){
          $base_count = $base_count{$alt};
        }
      }
      elsif(length($ref) > length($alt)){ #del
        my $del_length = length($alt)-length($ref); #  e.g. -2
        if(exists $base_count{$del_length}){
          $base_count = $base_count{$del_length};
        }
      }
      elsif(length($ref) < length($alt)){ #ins
        my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref));  # e.g. +3AGT
        if(exists $base_count{$ins_call}){
          $base_count = $base_count{$ins_call};
        }
      }
      else{
        # MNP variant?
        #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
        next;
      }
      if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
          $F[$srcs_column] .= "; +$sample_name";  # add this sample to the last column
          $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
          $F[$alt_cnt_column] .= "; ".$base_count; # alt count
          $F[$tot_cnt_column] .= "; $depth"; # tot count
      }
      elsif($zygosity =~ /het/ and $depth < $het_fn){
          $F[$srcs_column].= "; ~$sample_name";
          $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
          $F[$alt_cnt_column] .= "; $base_count";
          $F[$tot_cnt_column] .= "; $depth"; # tot count
      }
      elsif($depth < $homo_fn){
          $F[$srcs_column].= "; ~$sample_name";
          $F[$zygosity_column] .= "; low"; # zygosity
          $F[$alt_cnt_column] .= "; $base_count";
          $F[$tot_cnt_column] .= "; $depth"; # tot count
      }
    }
    print OUT join("\t", @F), "\n";
  }
}
close(HGVS);
close(OUT);