comparison hgvs_add_missing_sample_coverage @ 0:a3129cb0af43 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:02:25 -0600
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:a3129cb0af43
1 #!/usr/bin/env perl
2
3 use strict;
4 use warnings;
5
6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
8 my $hgvs_in_file = shift @ARGV;
9 my $hgvs_out_file = shift @ARGV;
10 my $sample_name = shift @ARGV;
11 my $bam_file = shift @ARGV;
12 my $homo_fn = shift @ARGV;
13 my $het_fn = shift @ARGV;
14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)
16
17 open(HGVS, $hgvs_in_file)
18 or die "Cannot open $hgvs_in_file for reading: $!\n";
19 my $header_line = <HGVS>;
20 chomp $header_line;
21 my @headers = split /\t/, $header_line;
22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
23 for(my $i = 0; $i <= $#headers; $i++){
24 if($headers[$i] eq "Chr"){
25 $chr_column = $i;
26 }
27 elsif($headers[$i] eq "DNA From"){
28 $pos_column = $i;
29 }
30 elsif($headers[$i] eq "Ref base"){
31 $ref_column = $i;
32 }
33 elsif($headers[$i] eq "Obs base"){
34 $alt_column = $i;
35 }
36 elsif($headers[$i] eq "Zygosity"){
37 $zygosity_column = $i;
38 }
39 elsif($headers[$i] eq "Variant Reads"){
40 $alt_cnt_column = $i;
41 }
42 elsif($headers[$i] eq "Total Reads"){
43 $tot_cnt_column = $i;
44 }
45 elsif($headers[$i] eq "Sources"){
46 $srcs_column = $i;
47 }
48 }
49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
57 open(OUT, ">$hgvs_out_file")
58 or die "Cannot open $hgvs_out_file for writing: $!\n";
59 print OUT $header_line, "\n"; # header line
60 my ($last_mpileup, $last_key);
61 while(<HGVS>){
62 chomp;
63 my @F = split /\t/, $_;
64 if(index($F[$#F], $sample_name) != -1){
65 print OUT "$_\n"; # keep line as-is
66 next;
67 }
68 else{
69 # Not in a call right now
70 # If there are reads supporting the call, add the sample name to the list (last column)
71 # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
72 my $chr = $F[$chr_column];
73 my $pos = $F[$pos_column];
74 my $ref = $F[$ref_column];
75 my $alt = $F[$alt_column];
76
77 # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary
78 if(length($ref) > 1 and length($ref) == length($alt)){
79 while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
80 substr($ref, 0, 1) = "";
81 substr($alt, 0, 1) = "";
82 $pos++;
83 }
84 while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
85 substr($ref, -1) = "";
86 substr($alt, -1) = "";
87 }
88 }
89
90 # (todo: what about compound het?)
91 my $zygosity = $F[$zygosity_column];
92 # see if the coverage is below a called zygosity threshold
93 my $end = $pos;
94 # comment out next bit if we assume that reads have all indels left aligned
95 #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
96 # $end += length($alt) - length($ref);
97 #}
98 my $mpileup;
99 if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
100 $mpileup = $last_mpileup;
101 }
102 else{
103 $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
104 $last_mpileup = $mpileup;
105 $last_key = "$chr:$pos:$ref:$alt";
106 }
107
108 $ref = "" if $ref eq "NA";
109 $alt = "" if $alt eq "NA";
110 # lines look something like chr10 74100774 N 164 G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
111 # deletions are -1N, while insertions are +1g
112 for my $line (split /\n/, $mpileup){
113 my @M = split /\t/, $line;
114 next if $M[0] ne $chr; # header or message stuff
115 my $depth = $M[3];
116 if($depth == 0){ # No reads at all
117 $F[$srcs_column] .= "; ~$sample_name"; # add this sample to the last column
118 $F[$zygosity_column] .= "; low"; # zygosity unknown
119 $F[$alt_cnt_column] .= "; 0"; # alt count
120 $F[$tot_cnt_column] .= "; 0"; # tot count
121 #print OUT join("\t", @F), "\n";
122 last;
123 }
124 my %base_count;
125 while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
126 my $call = uc($&); # upper case version of letter
127 if($1){ # call is -1n, or +3gcc, etc.
128 if(substr($1, 0, 1) eq "+"){ # insertion
129 my $offset = substr($1, 1);
130 $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
131 pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
132 }
133 }
134 elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
135 next; # ignore
136 }
137 $base_count{$call}++;
138 }
139 my $base_count = 0;
140 if(length($ref) == 1 and length($alt) == 1){ #SNP
141 if(exists $base_count{$alt}){
142 $base_count = $base_count{$alt};
143 }
144 }
145 elsif(length($ref) > length($alt)){ #del
146 my $del_length = length($alt)-length($ref); # e.g. -2
147 if(exists $base_count{$del_length}){
148 $base_count = $base_count{$del_length};
149 }
150 }
151 elsif(length($ref) < length($alt)){ #ins
152 my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref)); # e.g. +3AGT
153 if(exists $base_count{$ins_call}){
154 $base_count = $base_count{$ins_call};
155 }
156 }
157 else{
158 # MNP variant?
159 #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
160 next;
161 }
162 if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
163 $F[$srcs_column] .= "; +$sample_name"; # add this sample to the last column
164 $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
165 $F[$alt_cnt_column] .= "; ".$base_count; # alt count
166 $F[$tot_cnt_column] .= "; $depth"; # tot count
167 }
168 elsif($zygosity =~ /het/ and $depth < $het_fn){
169 $F[$srcs_column].= "; ~$sample_name";
170 $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
171 $F[$alt_cnt_column] .= "; $base_count";
172 $F[$tot_cnt_column] .= "; $depth"; # tot count
173 }
174 elsif($depth < $homo_fn){
175 $F[$srcs_column].= "; ~$sample_name";
176 $F[$zygosity_column] .= "; low"; # zygosity
177 $F[$alt_cnt_column] .= "; $base_count";
178 $F[$tot_cnt_column] .= "; $depth"; # tot count
179 }
180 }
181 print OUT join("\t", @F), "\n";
182 }
183 }
184 close(HGVS);
185 close(OUT);