changeset 0:a3129cb0af43 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:02:25 -0600
parents
children
files AddMissingCoverage.xml hgvs_add_missing_sample_coverage
diffstat 2 files changed, 217 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/AddMissingCoverage.xml	Wed Mar 25 13:02:25 2015 -0600
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<tool id="hgvs_add_coverage" name="Add poor/missing coverage info">
+  <description>to an annotated HGVS table</description>
+  <version_string>hgvs_add_missing_sample_coverage -v</version_string>
+  <command interpreter="perl">hgvs_add_missing_sample_coverage $input_hgvs_annotated_table $out_hgvs_annotated_table $sample_name $sample_bam $max_homo_fn_coverage $max_het_fn_coverage $min_mapq $min_baq</command>
+  <inputs>
+    <param format="achri_annotated_snp_table" name="input_hgvs_annotated_table" type="data" label="Annotated variant calls table with HGVS syntax"/>
+    <param name="sample_name" type="text" label="Name of sample to check for read coverage"/>
+    <param name="sample_bam" type="data" format="bam" label="Mapped reads to be checked (BAM format)" help="For each variant in the input table that does NOT include the specified sample name in the Sources column, this BAM file will be checked for concordant reads"/>
+    <param name="max_homo_fn_coverage" label="Minimum read coverage to exclude false negative homozygous variants" type="integer" min="1" value="3" />
+    <param name="max_het_fn_coverage" label="Minimum read coverage to exclude false negative heterozygous variants" type="integer" min="1" value="20" />
+    <param name="min_mapq" label="Minimum mapping quality score to consider a read" type="integer" min="1" value="20" help="(10=p-value 0.1, 20=p-value 0.01, etc.)"/>
+    <param name="min_baq" label="Minimum base quality score to consider a read" type="integer" min="1" value="20" help="(10=p-value 0.1, 20=p-value 0.01, etc.)"/>
+  </inputs>
+  <outputs>
+    <data format="achri_annotated_snp_table" name="out_hgvs_annotated_table" type="data" label="Variant table with poor/missing coverage data for ${sample_name}"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+  Once this tools is run, lack of the given sample name in a table row means the sample almost definitely does not contain the variant that row reports.
+  This tool adds a sample to the Sources column of the variant table if it is not already there, but the given BAM file either:
+
+1. has at least 1/max_het_fn_coverage read proportion supporting the variant (in which case the sample name is added with a "+"), or
+
+2. has fewer than max_homo_fn_coverage or max_het_fn_coverage reads mapped in total, and those reads neither conflict with nor support the variant called (in which case the sample name is added with a "~")
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/hgvs_add_missing_sample_coverage	Wed Mar 25 13:02:25 2015 -0600
@@ -0,0 +1,185 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+# This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
+@ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
+my $hgvs_in_file = shift @ARGV;
+my $hgvs_out_file = shift @ARGV;
+my $sample_name = shift @ARGV;
+my $bam_file = shift @ARGV;
+my $homo_fn = shift @ARGV;
+my $het_fn = shift @ARGV;
+my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
+my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)
+
+open(HGVS, $hgvs_in_file)
+  or die "Cannot open $hgvs_in_file for reading: $!\n";
+my $header_line = <HGVS>;
+chomp $header_line;
+my @headers = split /\t/, $header_line;
+my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
+for(my $i = 0; $i <= $#headers; $i++){
+  if($headers[$i] eq "Chr"){
+    $chr_column = $i;
+  }
+  elsif($headers[$i] eq "DNA From"){
+    $pos_column = $i;
+  }
+  elsif($headers[$i] eq "Ref base"){
+    $ref_column = $i;
+  }
+  elsif($headers[$i] eq "Obs base"){
+    $alt_column = $i;
+  }
+  elsif($headers[$i] eq "Zygosity"){
+    $zygosity_column = $i;
+  }
+  elsif($headers[$i] eq "Variant Reads"){
+    $alt_cnt_column = $i;
+  }
+  elsif($headers[$i] eq "Total Reads"){
+    $tot_cnt_column = $i;
+  }
+  elsif($headers[$i] eq "Sources"){
+    $srcs_column = $i;
+  }
+}
+die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
+die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
+die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
+die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
+die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
+die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
+die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
+die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
+open(OUT, ">$hgvs_out_file")
+  or die "Cannot open $hgvs_out_file for writing: $!\n";
+print OUT $header_line, "\n"; # header line
+my ($last_mpileup, $last_key);
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_;
+  if(index($F[$#F], $sample_name) != -1){
+    print OUT "$_\n";  # keep line as-is
+    next;
+  }
+  else{
+    # Not in a call right now
+    # If there are reads supporting the call, add the sample name to the list (last column)
+    # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
+    my $chr = $F[$chr_column];
+    my $pos = $F[$pos_column];
+    my $ref = $F[$ref_column];
+    my $alt = $F[$alt_column];
+    
+    # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary 
+    if(length($ref) > 1 and length($ref) == length($alt)){
+      while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
+        substr($ref, 0, 1) = "";
+        substr($alt, 0, 1) = "";
+        $pos++;
+      }
+      while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
+        substr($ref, -1) = "";
+        substr($alt, -1) = "";
+      }
+    }
+    
+    # (todo: what about compound het?)
+    my $zygosity = $F[$zygosity_column];
+    # see if the coverage is below a called zygosity threshold
+    my $end = $pos;
+    # comment out next bit if we assume that reads have all indels left aligned 
+    #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
+    #  $end += length($alt) - length($ref); 
+    #}
+    my $mpileup;
+    if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
+      $mpileup = $last_mpileup;
+    }
+    else{
+      $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
+      $last_mpileup = $mpileup;
+      $last_key = "$chr:$pos:$ref:$alt";
+    }
+
+    $ref = "" if $ref eq "NA";
+    $alt = "" if $alt eq "NA";
+    # lines look something like chr10	74100774	N	164	G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
+    # deletions are -1N, while insertions are +1g
+    for my $line (split /\n/, $mpileup){
+      my @M = split /\t/, $line;
+      next if $M[0] ne $chr; # header or message stuff
+      my $depth = $M[3];
+      if($depth == 0){ # No reads at all
+        $F[$srcs_column] .= "; ~$sample_name";  # add this sample to the last column
+        $F[$zygosity_column] .= "; low"; # zygosity unknown
+        $F[$alt_cnt_column] .= "; 0"; # alt count
+        $F[$tot_cnt_column] .= "; 0"; # tot count
+        #print OUT join("\t", @F), "\n";
+        last;
+      }
+      my %base_count;
+      while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
+        my $call = uc($&);  # upper case version of letter
+        if($1){ # call is -1n, or +3gcc, etc.
+           if(substr($1, 0, 1) eq "+"){ # insertion
+             my $offset = substr($1, 1);
+             $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
+             pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
+           }
+        }
+        elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
+           next; # ignore
+        }
+        $base_count{$call}++;
+      } 
+      my $base_count = 0;
+      if(length($ref) == 1 and length($alt) == 1){ #SNP
+        if(exists $base_count{$alt}){
+          $base_count = $base_count{$alt};
+        }
+      }
+      elsif(length($ref) > length($alt)){ #del
+        my $del_length = length($alt)-length($ref); #  e.g. -2
+        if(exists $base_count{$del_length}){
+          $base_count = $base_count{$del_length};
+        }
+      }
+      elsif(length($ref) < length($alt)){ #ins
+        my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref));  # e.g. +3AGT
+        if(exists $base_count{$ins_call}){
+          $base_count = $base_count{$ins_call};
+        }
+      }
+      else{
+        # MNP variant?
+        #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
+        next;
+      }
+      if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
+          $F[$srcs_column] .= "; +$sample_name";  # add this sample to the last column
+          $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
+          $F[$alt_cnt_column] .= "; ".$base_count; # alt count
+          $F[$tot_cnt_column] .= "; $depth"; # tot count
+      }
+      elsif($zygosity =~ /het/ and $depth < $het_fn){
+          $F[$srcs_column].= "; ~$sample_name";
+          $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
+          $F[$alt_cnt_column] .= "; $base_count";
+          $F[$tot_cnt_column] .= "; $depth"; # tot count
+      }
+      elsif($depth < $homo_fn){
+          $F[$srcs_column].= "; ~$sample_name";
+          $F[$zygosity_column] .= "; low"; # zygosity
+          $F[$alt_cnt_column] .= "; $base_count";
+          $F[$tot_cnt_column] .= "; $depth"; # tot count
+      }
+    }
+    print OUT join("\t", @F), "\n";
+  }
+}
+close(HGVS);
+close(OUT);