0
+ − 1 #!/usr/bin/env perl
+ − 2
+ − 3 use strict;
+ − 4 use warnings;
+ − 5
+ − 6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
+ − 7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
+ − 8 my $hgvs_in_file = shift @ARGV;
+ − 9 my $hgvs_out_file = shift @ARGV;
+ − 10 my $sample_name = shift @ARGV;
+ − 11 my $bam_file = shift @ARGV;
+ − 12 my $homo_fn = shift @ARGV;
+ − 13 my $het_fn = shift @ARGV;
+ − 14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
+ − 15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)
+ − 16
+ − 17 open(HGVS, $hgvs_in_file)
+ − 18 or die "Cannot open $hgvs_in_file for reading: $!\n";
+ − 19 my $header_line = <HGVS>;
+ − 20 chomp $header_line;
+ − 21 my @headers = split /\t/, $header_line;
+ − 22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
+ − 23 for(my $i = 0; $i <= $#headers; $i++){
+ − 24 if($headers[$i] eq "Chr"){
+ − 25 $chr_column = $i;
+ − 26 }
+ − 27 elsif($headers[$i] eq "DNA From"){
+ − 28 $pos_column = $i;
+ − 29 }
+ − 30 elsif($headers[$i] eq "Ref base"){
+ − 31 $ref_column = $i;
+ − 32 }
+ − 33 elsif($headers[$i] eq "Obs base"){
+ − 34 $alt_column = $i;
+ − 35 }
+ − 36 elsif($headers[$i] eq "Zygosity"){
+ − 37 $zygosity_column = $i;
+ − 38 }
+ − 39 elsif($headers[$i] eq "Variant Reads"){
+ − 40 $alt_cnt_column = $i;
+ − 41 }
+ − 42 elsif($headers[$i] eq "Total Reads"){
+ − 43 $tot_cnt_column = $i;
+ − 44 }
+ − 45 elsif($headers[$i] eq "Sources"){
+ − 46 $srcs_column = $i;
+ − 47 }
+ − 48 }
+ − 49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
+ − 50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
+ − 51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
+ − 52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
+ − 53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
+ − 54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
+ − 55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
+ − 56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
+ − 57 open(OUT, ">$hgvs_out_file")
+ − 58 or die "Cannot open $hgvs_out_file for writing: $!\n";
+ − 59 print OUT $header_line, "\n"; # header line
+ − 60 my ($last_mpileup, $last_key);
+ − 61 while(<HGVS>){
+ − 62 chomp;
+ − 63 my @F = split /\t/, $_;
+ − 64 if(index($F[$#F], $sample_name) != -1){
+ − 65 print OUT "$_\n"; # keep line as-is
+ − 66 next;
+ − 67 }
+ − 68 else{
+ − 69 # Not in a call right now
+ − 70 # If there are reads supporting the call, add the sample name to the list (last column)
+ − 71 # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
+ − 72 my $chr = $F[$chr_column];
+ − 73 my $pos = $F[$pos_column];
+ − 74 my $ref = $F[$ref_column];
+ − 75 my $alt = $F[$alt_column];
+ − 76
+ − 77 # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary
+ − 78 if(length($ref) > 1 and length($ref) == length($alt)){
+ − 79 while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
+ − 80 substr($ref, 0, 1) = "";
+ − 81 substr($alt, 0, 1) = "";
+ − 82 $pos++;
+ − 83 }
+ − 84 while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
+ − 85 substr($ref, -1) = "";
+ − 86 substr($alt, -1) = "";
+ − 87 }
+ − 88 }
+ − 89
+ − 90 # (todo: what about compound het?)
+ − 91 my $zygosity = $F[$zygosity_column];
+ − 92 # see if the coverage is below a called zygosity threshold
+ − 93 my $end = $pos;
+ − 94 # comment out next bit if we assume that reads have all indels left aligned
+ − 95 #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
+ − 96 # $end += length($alt) - length($ref);
+ − 97 #}
+ − 98 my $mpileup;
+ − 99 if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
+ − 100 $mpileup = $last_mpileup;
+ − 101 }
+ − 102 else{
+ − 103 $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
+ − 104 $last_mpileup = $mpileup;
+ − 105 $last_key = "$chr:$pos:$ref:$alt";
+ − 106 }
+ − 107
+ − 108 $ref = "" if $ref eq "NA";
+ − 109 $alt = "" if $alt eq "NA";
+ − 110 # lines look something like chr10 74100774 N 164 G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
+ − 111 # deletions are -1N, while insertions are +1g
+ − 112 for my $line (split /\n/, $mpileup){
+ − 113 my @M = split /\t/, $line;
+ − 114 next if $M[0] ne $chr; # header or message stuff
+ − 115 my $depth = $M[3];
+ − 116 if($depth == 0){ # No reads at all
+ − 117 $F[$srcs_column] .= "; ~$sample_name"; # add this sample to the last column
+ − 118 $F[$zygosity_column] .= "; low"; # zygosity unknown
+ − 119 $F[$alt_cnt_column] .= "; 0"; # alt count
+ − 120 $F[$tot_cnt_column] .= "; 0"; # tot count
+ − 121 #print OUT join("\t", @F), "\n";
+ − 122 last;
+ − 123 }
+ − 124 my %base_count;
+ − 125 while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
+ − 126 my $call = uc($&); # upper case version of letter
+ − 127 if($1){ # call is -1n, or +3gcc, etc.
+ − 128 if(substr($1, 0, 1) eq "+"){ # insertion
+ − 129 my $offset = substr($1, 1);
+ − 130 $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
+ − 131 pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
+ − 132 }
+ − 133 }
+ − 134 elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
+ − 135 next; # ignore
+ − 136 }
+ − 137 $base_count{$call}++;
+ − 138 }
+ − 139 my $base_count = 0;
+ − 140 if(length($ref) == 1 and length($alt) == 1){ #SNP
+ − 141 if(exists $base_count{$alt}){
+ − 142 $base_count = $base_count{$alt};
+ − 143 }
+ − 144 }
+ − 145 elsif(length($ref) > length($alt)){ #del
+ − 146 my $del_length = length($alt)-length($ref); # e.g. -2
+ − 147 if(exists $base_count{$del_length}){
+ − 148 $base_count = $base_count{$del_length};
+ − 149 }
+ − 150 }
+ − 151 elsif(length($ref) < length($alt)){ #ins
+ − 152 my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref)); # e.g. +3AGT
+ − 153 if(exists $base_count{$ins_call}){
+ − 154 $base_count = $base_count{$ins_call};
+ − 155 }
+ − 156 }
+ − 157 else{
+ − 158 # MNP variant?
+ − 159 #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
+ − 160 next;
+ − 161 }
+ − 162 if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
+ − 163 $F[$srcs_column] .= "; +$sample_name"; # add this sample to the last column
+ − 164 $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
+ − 165 $F[$alt_cnt_column] .= "; ".$base_count; # alt count
+ − 166 $F[$tot_cnt_column] .= "; $depth"; # tot count
+ − 167 }
+ − 168 elsif($zygosity =~ /het/ and $depth < $het_fn){
+ − 169 $F[$srcs_column].= "; ~$sample_name";
+ − 170 $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
+ − 171 $F[$alt_cnt_column] .= "; $base_count";
+ − 172 $F[$tot_cnt_column] .= "; $depth"; # tot count
+ − 173 }
+ − 174 elsif($depth < $homo_fn){
+ − 175 $F[$srcs_column].= "; ~$sample_name";
+ − 176 $F[$zygosity_column] .= "; low"; # zygosity
+ − 177 $F[$alt_cnt_column] .= "; $base_count";
+ − 178 $F[$tot_cnt_column] .= "; $depth"; # tot count
+ − 179 }
+ − 180 }
+ − 181 print OUT join("\t", @F), "\n";
+ − 182 }
+ − 183 }
+ − 184 close(HGVS);
+ − 185 close(OUT);