Mercurial > repos > yusuf > add_missing_coverage
comparison hgvs_add_missing_sample_coverage @ 0:a3129cb0af43 default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:02:25 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:a3129cb0af43 |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 | |
| 6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had. | |
| 7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n"; | |
| 8 my $hgvs_in_file = shift @ARGV; | |
| 9 my $hgvs_out_file = shift @ARGV; | |
| 10 my $sample_name = shift @ARGV; | |
| 11 my $bam_file = shift @ARGV; | |
| 12 my $homo_fn = shift @ARGV; | |
| 13 my $het_fn = shift @ARGV; | |
| 14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001) | |
| 15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01) | |
| 16 | |
| 17 open(HGVS, $hgvs_in_file) | |
| 18 or die "Cannot open $hgvs_in_file for reading: $!\n"; | |
| 19 my $header_line = <HGVS>; | |
| 20 chomp $header_line; | |
| 21 my @headers = split /\t/, $header_line; | |
| 22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column); | |
| 23 for(my $i = 0; $i <= $#headers; $i++){ | |
| 24 if($headers[$i] eq "Chr"){ | |
| 25 $chr_column = $i; | |
| 26 } | |
| 27 elsif($headers[$i] eq "DNA From"){ | |
| 28 $pos_column = $i; | |
| 29 } | |
| 30 elsif($headers[$i] eq "Ref base"){ | |
| 31 $ref_column = $i; | |
| 32 } | |
| 33 elsif($headers[$i] eq "Obs base"){ | |
| 34 $alt_column = $i; | |
| 35 } | |
| 36 elsif($headers[$i] eq "Zygosity"){ | |
| 37 $zygosity_column = $i; | |
| 38 } | |
| 39 elsif($headers[$i] eq "Variant Reads"){ | |
| 40 $alt_cnt_column = $i; | |
| 41 } | |
| 42 elsif($headers[$i] eq "Total Reads"){ | |
| 43 $tot_cnt_column = $i; | |
| 44 } | |
| 45 elsif($headers[$i] eq "Sources"){ | |
| 46 $srcs_column = $i; | |
| 47 } | |
| 48 } | |
| 49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column; | |
| 50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column; | |
| 51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column; | |
| 52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column; | |
| 53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column; | |
| 54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column; | |
| 55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column; | |
| 56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column; | |
| 57 open(OUT, ">$hgvs_out_file") | |
| 58 or die "Cannot open $hgvs_out_file for writing: $!\n"; | |
| 59 print OUT $header_line, "\n"; # header line | |
| 60 my ($last_mpileup, $last_key); | |
| 61 while(<HGVS>){ | |
| 62 chomp; | |
| 63 my @F = split /\t/, $_; | |
| 64 if(index($F[$#F], $sample_name) != -1){ | |
| 65 print OUT "$_\n"; # keep line as-is | |
| 66 next; | |
| 67 } | |
| 68 else{ | |
| 69 # Not in a call right now | |
| 70 # If there are reads supporting the call, add the sample name to the list (last column) | |
| 71 # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name | |
| 72 my $chr = $F[$chr_column]; | |
| 73 my $pos = $F[$pos_column]; | |
| 74 my $ref = $F[$ref_column]; | |
| 75 my $alt = $F[$alt_column]; | |
| 76 | |
| 77 # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary | |
| 78 if(length($ref) > 1 and length($ref) == length($alt)){ | |
| 79 while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){ | |
| 80 substr($ref, 0, 1) = ""; | |
| 81 substr($alt, 0, 1) = ""; | |
| 82 $pos++; | |
| 83 } | |
| 84 while(length($ref) and substr($ref, -1) eq substr($alt, -1)){ | |
| 85 substr($ref, -1) = ""; | |
| 86 substr($alt, -1) = ""; | |
| 87 } | |
| 88 } | |
| 89 | |
| 90 # (todo: what about compound het?) | |
| 91 my $zygosity = $F[$zygosity_column]; | |
| 92 # see if the coverage is below a called zygosity threshold | |
| 93 my $end = $pos; | |
| 94 # comment out next bit if we assume that reads have all indels left aligned | |
| 95 #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){ | |
| 96 # $end += length($alt) - length($ref); | |
| 97 #} | |
| 98 my $mpileup; | |
| 99 if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line | |
| 100 $mpileup = $last_mpileup; | |
| 101 } | |
| 102 else{ | |
| 103 $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`; | |
| 104 $last_mpileup = $mpileup; | |
| 105 $last_key = "$chr:$pos:$ref:$alt"; | |
| 106 } | |
| 107 | |
| 108 $ref = "" if $ref eq "NA"; | |
| 109 $alt = "" if $alt eq "NA"; | |
| 110 # lines look something like chr10 74100774 N 164 G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg... | |
| 111 # deletions are -1N, while insertions are +1g | |
| 112 for my $line (split /\n/, $mpileup){ | |
| 113 my @M = split /\t/, $line; | |
| 114 next if $M[0] ne $chr; # header or message stuff | |
| 115 my $depth = $M[3]; | |
| 116 if($depth == 0){ # No reads at all | |
| 117 $F[$srcs_column] .= "; ~$sample_name"; # add this sample to the last column | |
| 118 $F[$zygosity_column] .= "; low"; # zygosity unknown | |
| 119 $F[$alt_cnt_column] .= "; 0"; # alt count | |
| 120 $F[$tot_cnt_column] .= "; 0"; # tot count | |
| 121 #print OUT join("\t", @F), "\n"; | |
| 122 last; | |
| 123 } | |
| 124 my %base_count; | |
| 125 while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){ | |
| 126 my $call = uc($&); # upper case version of letter | |
| 127 if($1){ # call is -1n, or +3gcc, etc. | |
| 128 if(substr($1, 0, 1) eq "+"){ # insertion | |
| 129 my $offset = substr($1, 1); | |
| 130 $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number | |
| 131 pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed | |
| 132 } | |
| 133 } | |
| 134 elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call | |
| 135 next; # ignore | |
| 136 } | |
| 137 $base_count{$call}++; | |
| 138 } | |
| 139 my $base_count = 0; | |
| 140 if(length($ref) == 1 and length($alt) == 1){ #SNP | |
| 141 if(exists $base_count{$alt}){ | |
| 142 $base_count = $base_count{$alt}; | |
| 143 } | |
| 144 } | |
| 145 elsif(length($ref) > length($alt)){ #del | |
| 146 my $del_length = length($alt)-length($ref); # e.g. -2 | |
| 147 if(exists $base_count{$del_length}){ | |
| 148 $base_count = $base_count{$del_length}; | |
| 149 } | |
| 150 } | |
| 151 elsif(length($ref) < length($alt)){ #ins | |
| 152 my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref)); # e.g. +3AGT | |
| 153 if(exists $base_count{$ins_call}){ | |
| 154 $base_count = $base_count{$ins_call}; | |
| 155 } | |
| 156 } | |
| 157 else{ | |
| 158 # MNP variant? | |
| 159 #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n"; | |
| 160 next; | |
| 161 } | |
| 162 if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call | |
| 163 $F[$srcs_column] .= "; +$sample_name"; # add this sample to the last column | |
| 164 $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity | |
| 165 $F[$alt_cnt_column] .= "; ".$base_count; # alt count | |
| 166 $F[$tot_cnt_column] .= "; $depth"; # tot count | |
| 167 } | |
| 168 elsif($zygosity =~ /het/ and $depth < $het_fn){ | |
| 169 $F[$srcs_column].= "; ~$sample_name"; | |
| 170 $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity | |
| 171 $F[$alt_cnt_column] .= "; $base_count"; | |
| 172 $F[$tot_cnt_column] .= "; $depth"; # tot count | |
| 173 } | |
| 174 elsif($depth < $homo_fn){ | |
| 175 $F[$srcs_column].= "; ~$sample_name"; | |
| 176 $F[$zygosity_column] .= "; low"; # zygosity | |
| 177 $F[$alt_cnt_column] .= "; $base_count"; | |
| 178 $F[$tot_cnt_column] .= "; $depth"; # tot count | |
| 179 } | |
| 180 } | |
| 181 print OUT join("\t", @F), "\n"; | |
| 182 } | |
| 183 } | |
| 184 close(HGVS); | |
| 185 close(OUT); |
