Mercurial > repos > yusuf > add_missing_coverage
diff hgvs_add_missing_sample_coverage @ 0:a3129cb0af43 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:02:25 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/hgvs_add_missing_sample_coverage Wed Mar 25 13:02:25 2015 -0600 @@ -0,0 +1,185 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +# This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had. +@ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n"; +my $hgvs_in_file = shift @ARGV; +my $hgvs_out_file = shift @ARGV; +my $sample_name = shift @ARGV; +my $bam_file = shift @ARGV; +my $homo_fn = shift @ARGV; +my $het_fn = shift @ARGV; +my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001) +my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01) + +open(HGVS, $hgvs_in_file) + or die "Cannot open $hgvs_in_file for reading: $!\n"; +my $header_line = <HGVS>; +chomp $header_line; +my @headers = split /\t/, $header_line; +my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column); +for(my $i = 0; $i <= $#headers; $i++){ + if($headers[$i] eq "Chr"){ + $chr_column = $i; + } + elsif($headers[$i] eq "DNA From"){ + $pos_column = $i; + } + elsif($headers[$i] eq "Ref base"){ + $ref_column = $i; + } + elsif($headers[$i] eq "Obs base"){ + $alt_column = $i; + } + elsif($headers[$i] eq "Zygosity"){ + $zygosity_column = $i; + } + elsif($headers[$i] eq "Variant Reads"){ + $alt_cnt_column = $i; + } + elsif($headers[$i] eq "Total Reads"){ + $tot_cnt_column = $i; + } + elsif($headers[$i] eq "Sources"){ + $srcs_column = $i; + } +} +die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column; +die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column; +die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column; +die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column; +die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column; +die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column; +die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column; +die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column; +open(OUT, ">$hgvs_out_file") + or die "Cannot open $hgvs_out_file for writing: $!\n"; +print OUT $header_line, "\n"; # header line +my ($last_mpileup, $last_key); +while(<HGVS>){ + chomp; + my @F = split /\t/, $_; + if(index($F[$#F], $sample_name) != -1){ + print OUT "$_\n"; # keep line as-is + next; + } + else{ + # Not in a call right now + # If there are reads supporting the call, add the sample name to the list (last column) + # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name + my $chr = $F[$chr_column]; + my $pos = $F[$pos_column]; + my $ref = $F[$ref_column]; + my $alt = $F[$alt_column]; + + # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary + if(length($ref) > 1 and length($ref) == length($alt)){ + while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){ + substr($ref, 0, 1) = ""; + substr($alt, 0, 1) = ""; + $pos++; + } + while(length($ref) and substr($ref, -1) eq substr($alt, -1)){ + substr($ref, -1) = ""; + substr($alt, -1) = ""; + } + } + + # (todo: what about compound het?) + my $zygosity = $F[$zygosity_column]; + # see if the coverage is below a called zygosity threshold + my $end = $pos; + # comment out next bit if we assume that reads have all indels left aligned + #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){ + # $end += length($alt) - length($ref); + #} + my $mpileup; + if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line + $mpileup = $last_mpileup; + } + else{ + $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`; + $last_mpileup = $mpileup; + $last_key = "$chr:$pos:$ref:$alt"; + } + + $ref = "" if $ref eq "NA"; + $alt = "" if $alt eq "NA"; + # lines look something like chr10 74100774 N 164 G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg... + # deletions are -1N, while insertions are +1g + for my $line (split /\n/, $mpileup){ + my @M = split /\t/, $line; + next if $M[0] ne $chr; # header or message stuff + my $depth = $M[3]; + if($depth == 0){ # No reads at all + $F[$srcs_column] .= "; ~$sample_name"; # add this sample to the last column + $F[$zygosity_column] .= "; low"; # zygosity unknown + $F[$alt_cnt_column] .= "; 0"; # alt count + $F[$tot_cnt_column] .= "; 0"; # tot count + #print OUT join("\t", @F), "\n"; + last; + } + my %base_count; + while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){ + my $call = uc($&); # upper case version of letter + if($1){ # call is -1n, or +3gcc, etc. + if(substr($1, 0, 1) eq "+"){ # insertion + my $offset = substr($1, 1); + $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number + pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed + } + } + elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call + next; # ignore + } + $base_count{$call}++; + } + my $base_count = 0; + if(length($ref) == 1 and length($alt) == 1){ #SNP + if(exists $base_count{$alt}){ + $base_count = $base_count{$alt}; + } + } + elsif(length($ref) > length($alt)){ #del + my $del_length = length($alt)-length($ref); # e.g. -2 + if(exists $base_count{$del_length}){ + $base_count = $base_count{$del_length}; + } + } + elsif(length($ref) < length($alt)){ #ins + my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref)); # e.g. +3AGT + if(exists $base_count{$ins_call}){ + $base_count = $base_count{$ins_call}; + } + } + else{ + # MNP variant? + #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n"; + next; + } + if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call + $F[$srcs_column] .= "; +$sample_name"; # add this sample to the last column + $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity + $F[$alt_cnt_column] .= "; ".$base_count; # alt count + $F[$tot_cnt_column] .= "; $depth"; # tot count + } + elsif($zygosity =~ /het/ and $depth < $het_fn){ + $F[$srcs_column].= "; ~$sample_name"; + $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity + $F[$alt_cnt_column] .= "; $base_count"; + $F[$tot_cnt_column] .= "; $depth"; # tot count + } + elsif($depth < $homo_fn){ + $F[$srcs_column].= "; ~$sample_name"; + $F[$zygosity_column] .= "; low"; # zygosity + $F[$alt_cnt_column] .= "; $base_count"; + $F[$tot_cnt_column] .= "; $depth"; # tot count + } + } + print OUT join("\t", @F), "\n"; + } +} +close(HGVS); +close(OUT);