| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 
 | 
| 
 | 
     6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
 | 
| 
 | 
     7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
 | 
| 
 | 
     8 my $hgvs_in_file = shift @ARGV;
 | 
| 
 | 
     9 my $hgvs_out_file = shift @ARGV;
 | 
| 
 | 
    10 my $sample_name = shift @ARGV;
 | 
| 
 | 
    11 my $bam_file = shift @ARGV;
 | 
| 
 | 
    12 my $homo_fn = shift @ARGV;
 | 
| 
 | 
    13 my $het_fn = shift @ARGV;
 | 
| 
 | 
    14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
 | 
| 
 | 
    15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 open(HGVS, $hgvs_in_file)
 | 
| 
 | 
    18   or die "Cannot open $hgvs_in_file for reading: $!\n";
 | 
| 
 | 
    19 my $header_line = <HGVS>;
 | 
| 
 | 
    20 chomp $header_line;
 | 
| 
 | 
    21 my @headers = split /\t/, $header_line;
 | 
| 
 | 
    22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
 | 
| 
 | 
    23 for(my $i = 0; $i <= $#headers; $i++){
 | 
| 
 | 
    24   if($headers[$i] eq "Chr"){
 | 
| 
 | 
    25     $chr_column = $i;
 | 
| 
 | 
    26   }
 | 
| 
 | 
    27   elsif($headers[$i] eq "DNA From"){
 | 
| 
 | 
    28     $pos_column = $i;
 | 
| 
 | 
    29   }
 | 
| 
 | 
    30   elsif($headers[$i] eq "Ref base"){
 | 
| 
 | 
    31     $ref_column = $i;
 | 
| 
 | 
    32   }
 | 
| 
 | 
    33   elsif($headers[$i] eq "Obs base"){
 | 
| 
 | 
    34     $alt_column = $i;
 | 
| 
 | 
    35   }
 | 
| 
 | 
    36   elsif($headers[$i] eq "Zygosity"){
 | 
| 
 | 
    37     $zygosity_column = $i;
 | 
| 
 | 
    38   }
 | 
| 
 | 
    39   elsif($headers[$i] eq "Variant Reads"){
 | 
| 
 | 
    40     $alt_cnt_column = $i;
 | 
| 
 | 
    41   }
 | 
| 
 | 
    42   elsif($headers[$i] eq "Total Reads"){
 | 
| 
 | 
    43     $tot_cnt_column = $i;
 | 
| 
 | 
    44   }
 | 
| 
 | 
    45   elsif($headers[$i] eq "Sources"){
 | 
| 
 | 
    46     $srcs_column = $i;
 | 
| 
 | 
    47   }
 | 
| 
 | 
    48 }
 | 
| 
 | 
    49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
 | 
| 
 | 
    50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
 | 
| 
 | 
    51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
 | 
| 
 | 
    52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
 | 
| 
 | 
    53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
 | 
| 
 | 
    54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
 | 
| 
 | 
    55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
 | 
| 
 | 
    56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
 | 
| 
 | 
    57 open(OUT, ">$hgvs_out_file")
 | 
| 
 | 
    58   or die "Cannot open $hgvs_out_file for writing: $!\n";
 | 
| 
 | 
    59 print OUT $header_line, "\n"; # header line
 | 
| 
 | 
    60 my ($last_mpileup, $last_key);
 | 
| 
 | 
    61 while(<HGVS>){
 | 
| 
 | 
    62   chomp;
 | 
| 
 | 
    63   my @F = split /\t/, $_;
 | 
| 
 | 
    64   if(index($F[$#F], $sample_name) != -1){
 | 
| 
 | 
    65     print OUT "$_\n";  # keep line as-is
 | 
| 
 | 
    66     next;
 | 
| 
 | 
    67   }
 | 
| 
 | 
    68   else{
 | 
| 
 | 
    69     # Not in a call right now
 | 
| 
 | 
    70     # If there are reads supporting the call, add the sample name to the list (last column)
 | 
| 
 | 
    71     # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
 | 
| 
 | 
    72     my $chr = $F[$chr_column];
 | 
| 
 | 
    73     my $pos = $F[$pos_column];
 | 
| 
 | 
    74     my $ref = $F[$ref_column];
 | 
| 
 | 
    75     my $alt = $F[$alt_column];
 | 
| 
 | 
    76     
 | 
| 
 | 
    77     # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary 
 | 
| 
 | 
    78     if(length($ref) > 1 and length($ref) == length($alt)){
 | 
| 
 | 
    79       while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
 | 
| 
 | 
    80         substr($ref, 0, 1) = "";
 | 
| 
 | 
    81         substr($alt, 0, 1) = "";
 | 
| 
 | 
    82         $pos++;
 | 
| 
 | 
    83       }
 | 
| 
 | 
    84       while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
 | 
| 
 | 
    85         substr($ref, -1) = "";
 | 
| 
 | 
    86         substr($alt, -1) = "";
 | 
| 
 | 
    87       }
 | 
| 
 | 
    88     }
 | 
| 
 | 
    89     
 | 
| 
 | 
    90     # (todo: what about compound het?)
 | 
| 
 | 
    91     my $zygosity = $F[$zygosity_column];
 | 
| 
 | 
    92     # see if the coverage is below a called zygosity threshold
 | 
| 
 | 
    93     my $end = $pos;
 | 
| 
 | 
    94     # comment out next bit if we assume that reads have all indels left aligned 
 | 
| 
 | 
    95     #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
 | 
| 
 | 
    96     #  $end += length($alt) - length($ref); 
 | 
| 
 | 
    97     #}
 | 
| 
 | 
    98     my $mpileup;
 | 
| 
 | 
    99     if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
 | 
| 
 | 
   100       $mpileup = $last_mpileup;
 | 
| 
 | 
   101     }
 | 
| 
 | 
   102     else{
 | 
| 
 | 
   103       $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
 | 
| 
 | 
   104       $last_mpileup = $mpileup;
 | 
| 
 | 
   105       $last_key = "$chr:$pos:$ref:$alt";
 | 
| 
 | 
   106     }
 | 
| 
 | 
   107 
 | 
| 
 | 
   108     $ref = "" if $ref eq "NA";
 | 
| 
 | 
   109     $alt = "" if $alt eq "NA";
 | 
| 
 | 
   110     # lines look something like chr10	74100774	N	164	G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
 | 
| 
 | 
   111     # deletions are -1N, while insertions are +1g
 | 
| 
 | 
   112     for my $line (split /\n/, $mpileup){
 | 
| 
 | 
   113       my @M = split /\t/, $line;
 | 
| 
 | 
   114       next if $M[0] ne $chr; # header or message stuff
 | 
| 
 | 
   115       my $depth = $M[3];
 | 
| 
 | 
   116       if($depth == 0){ # No reads at all
 | 
| 
 | 
   117         $F[$srcs_column] .= "; ~$sample_name";  # add this sample to the last column
 | 
| 
 | 
   118         $F[$zygosity_column] .= "; low"; # zygosity unknown
 | 
| 
 | 
   119         $F[$alt_cnt_column] .= "; 0"; # alt count
 | 
| 
 | 
   120         $F[$tot_cnt_column] .= "; 0"; # tot count
 | 
| 
 | 
   121         #print OUT join("\t", @F), "\n";
 | 
| 
 | 
   122         last;
 | 
| 
 | 
   123       }
 | 
| 
 | 
   124       my %base_count;
 | 
| 
 | 
   125       while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
 | 
| 
 | 
   126         my $call = uc($&);  # upper case version of letter
 | 
| 
 | 
   127         if($1){ # call is -1n, or +3gcc, etc.
 | 
| 
 | 
   128            if(substr($1, 0, 1) eq "+"){ # insertion
 | 
| 
 | 
   129              my $offset = substr($1, 1);
 | 
| 
 | 
   130              $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
 | 
| 
 | 
   131              pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
 | 
| 
 | 
   132            }
 | 
| 
 | 
   133         }
 | 
| 
 | 
   134         elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
 | 
| 
 | 
   135            next; # ignore
 | 
| 
 | 
   136         }
 | 
| 
 | 
   137         $base_count{$call}++;
 | 
| 
 | 
   138       } 
 | 
| 
 | 
   139       my $base_count = 0;
 | 
| 
 | 
   140       if(length($ref) == 1 and length($alt) == 1){ #SNP
 | 
| 
 | 
   141         if(exists $base_count{$alt}){
 | 
| 
 | 
   142           $base_count = $base_count{$alt};
 | 
| 
 | 
   143         }
 | 
| 
 | 
   144       }
 | 
| 
 | 
   145       elsif(length($ref) > length($alt)){ #del
 | 
| 
 | 
   146         my $del_length = length($alt)-length($ref); #  e.g. -2
 | 
| 
 | 
   147         if(exists $base_count{$del_length}){
 | 
| 
 | 
   148           $base_count = $base_count{$del_length};
 | 
| 
 | 
   149         }
 | 
| 
 | 
   150       }
 | 
| 
 | 
   151       elsif(length($ref) < length($alt)){ #ins
 | 
| 
 | 
   152         my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref));  # e.g. +3AGT
 | 
| 
 | 
   153         if(exists $base_count{$ins_call}){
 | 
| 
 | 
   154           $base_count = $base_count{$ins_call};
 | 
| 
 | 
   155         }
 | 
| 
 | 
   156       }
 | 
| 
 | 
   157       else{
 | 
| 
 | 
   158         # MNP variant?
 | 
| 
 | 
   159         #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
 | 
| 
 | 
   160         next;
 | 
| 
 | 
   161       }
 | 
| 
 | 
   162       if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
 | 
| 
 | 
   163           $F[$srcs_column] .= "; +$sample_name";  # add this sample to the last column
 | 
| 
 | 
   164           $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
 | 
| 
 | 
   165           $F[$alt_cnt_column] .= "; ".$base_count; # alt count
 | 
| 
 | 
   166           $F[$tot_cnt_column] .= "; $depth"; # tot count
 | 
| 
 | 
   167       }
 | 
| 
 | 
   168       elsif($zygosity =~ /het/ and $depth < $het_fn){
 | 
| 
 | 
   169           $F[$srcs_column].= "; ~$sample_name";
 | 
| 
 | 
   170           $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
 | 
| 
 | 
   171           $F[$alt_cnt_column] .= "; $base_count";
 | 
| 
 | 
   172           $F[$tot_cnt_column] .= "; $depth"; # tot count
 | 
| 
 | 
   173       }
 | 
| 
 | 
   174       elsif($depth < $homo_fn){
 | 
| 
 | 
   175           $F[$srcs_column].= "; ~$sample_name";
 | 
| 
 | 
   176           $F[$zygosity_column] .= "; low"; # zygosity
 | 
| 
 | 
   177           $F[$alt_cnt_column] .= "; $base_count";
 | 
| 
 | 
   178           $F[$tot_cnt_column] .= "; $depth"; # tot count
 | 
| 
 | 
   179       }
 | 
| 
 | 
   180     }
 | 
| 
 | 
   181     print OUT join("\t", @F), "\n";
 | 
| 
 | 
   182   }
 | 
| 
 | 
   183 }
 | 
| 
 | 
   184 close(HGVS);
 | 
| 
 | 
   185 close(OUT);
 |