0
|
1 #!/usr/bin/env perl
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 # This script adds sample names to the last column of an HGVS file if the coverage for that position was insufficient to make the call the other samples had.
|
|
7 @ARGV == 8 or die "Usage: $0 <hgvs_in.txt> <hgvs_out.txt> <sample name1> <sample1.bam> <homo FN coverage cutoff> <het coverage FN cutoff> <min map quality> <min base quality>\n";
|
|
8 my $hgvs_in_file = shift @ARGV;
|
|
9 my $hgvs_out_file = shift @ARGV;
|
|
10 my $sample_name = shift @ARGV;
|
|
11 my $bam_file = shift @ARGV;
|
|
12 my $homo_fn = shift @ARGV;
|
|
13 my $het_fn = shift @ARGV;
|
|
14 my $min_maq = shift @ARGV; # 30 (a.k.a. p < 0.001)
|
|
15 my $min_baq = shift @ARGV; # 20 (a.k.a. p < 0.01)
|
|
16
|
|
17 open(HGVS, $hgvs_in_file)
|
|
18 or die "Cannot open $hgvs_in_file for reading: $!\n";
|
|
19 my $header_line = <HGVS>;
|
|
20 chomp $header_line;
|
|
21 my @headers = split /\t/, $header_line;
|
|
22 my ($chr_column, $pos_column, $ref_column, $alt_column, $zygosity_column, $alt_cnt_column, $tot_cnt_column, $srcs_column);
|
|
23 for(my $i = 0; $i <= $#headers; $i++){
|
|
24 if($headers[$i] eq "Chr"){
|
|
25 $chr_column = $i;
|
|
26 }
|
|
27 elsif($headers[$i] eq "DNA From"){
|
|
28 $pos_column = $i;
|
|
29 }
|
|
30 elsif($headers[$i] eq "Ref base"){
|
|
31 $ref_column = $i;
|
|
32 }
|
|
33 elsif($headers[$i] eq "Obs base"){
|
|
34 $alt_column = $i;
|
|
35 }
|
|
36 elsif($headers[$i] eq "Zygosity"){
|
|
37 $zygosity_column = $i;
|
|
38 }
|
|
39 elsif($headers[$i] eq "Variant Reads"){
|
|
40 $alt_cnt_column = $i;
|
|
41 }
|
|
42 elsif($headers[$i] eq "Total Reads"){
|
|
43 $tot_cnt_column = $i;
|
|
44 }
|
|
45 elsif($headers[$i] eq "Sources"){
|
|
46 $srcs_column = $i;
|
|
47 }
|
|
48 }
|
|
49 die "Could not find header Chr in $hgvs_in_file, aborting." if not defined $chr_column;
|
|
50 die "Could not find header 'DNA From' in $hgvs_in_file, aborting." if not defined $pos_column;
|
|
51 die "Could not find header 'Ref base' in $hgvs_in_file, aborting." if not defined $ref_column;
|
|
52 die "Could not find header 'Obs base' in $hgvs_in_file, aborting." if not defined $alt_column;
|
|
53 die "Could not find header 'Variant Reads' in $hgvs_in_file, aborting." if not defined $alt_cnt_column;
|
|
54 die "Could not find header 'Total Reads' in $hgvs_in_file, aborting." if not defined $tot_cnt_column;
|
|
55 die "Could not find header Zygosity in $hgvs_in_file, aborting." if not defined $zygosity_column;
|
|
56 die "Could not find header Sources in $hgvs_in_file, aborting." if not defined $srcs_column;
|
|
57 open(OUT, ">$hgvs_out_file")
|
|
58 or die "Cannot open $hgvs_out_file for writing: $!\n";
|
|
59 print OUT $header_line, "\n"; # header line
|
|
60 my ($last_mpileup, $last_key);
|
|
61 while(<HGVS>){
|
|
62 chomp;
|
|
63 my @F = split /\t/, $_;
|
|
64 if(index($F[$#F], $sample_name) != -1){
|
|
65 print OUT "$_\n"; # keep line as-is
|
|
66 next;
|
|
67 }
|
|
68 else{
|
|
69 # Not in a call right now
|
|
70 # If there are reads supporting the call, add the sample name to the list (last column)
|
|
71 # If there are no reads supporting the call, but coverage is below threshold, put a ~ in front of the sample name
|
|
72 my $chr = $F[$chr_column];
|
|
73 my $pos = $F[$pos_column];
|
|
74 my $ref = $F[$ref_column];
|
|
75 my $alt = $F[$alt_column];
|
|
76
|
|
77 # some tools report SNPs with leading or trailing matched ref seqs, so trim as necessary
|
|
78 if(length($ref) > 1 and length($ref) == length($alt)){
|
|
79 while(length($ref) and substr($ref, 0, 1) eq substr($alt, 0, 1)){
|
|
80 substr($ref, 0, 1) = "";
|
|
81 substr($alt, 0, 1) = "";
|
|
82 $pos++;
|
|
83 }
|
|
84 while(length($ref) and substr($ref, -1) eq substr($alt, -1)){
|
|
85 substr($ref, -1) = "";
|
|
86 substr($alt, -1) = "";
|
|
87 }
|
|
88 }
|
|
89
|
|
90 # (todo: what about compound het?)
|
|
91 my $zygosity = $F[$zygosity_column];
|
|
92 # see if the coverage is below a called zygosity threshold
|
|
93 my $end = $pos;
|
|
94 # comment out next bit if we assume that reads have all indels left aligned
|
|
95 #if($alt ne "NA" and $ref ne "NA" and length($alt) > length($ref)){
|
|
96 # $end += length($alt) - length($ref);
|
|
97 #}
|
|
98 my $mpileup;
|
|
99 if(defined $last_key and $last_key eq "$chr:$pos:$ref:$alt"){ # used cached response if this is a repeated input line
|
|
100 $mpileup = $last_mpileup;
|
|
101 }
|
|
102 else{
|
|
103 $mpileup = `samtools mpileup -q $min_maq -Q $min_baq -r "$chr:$pos-$end" $bam_file 2>&1`;
|
|
104 $last_mpileup = $mpileup;
|
|
105 $last_key = "$chr:$pos:$ref:$alt";
|
|
106 }
|
|
107
|
|
108 $ref = "" if $ref eq "NA";
|
|
109 $alt = "" if $alt eq "NA";
|
|
110 # lines look something like chr10 74100774 N 164 G$ggGgg$GgGgGgggggG$ggggggggGggGGGgg...
|
|
111 # deletions are -1N, while insertions are +1g
|
|
112 for my $line (split /\n/, $mpileup){
|
|
113 my @M = split /\t/, $line;
|
|
114 next if $M[0] ne $chr; # header or message stuff
|
|
115 my $depth = $M[3];
|
|
116 if($depth == 0){ # No reads at all
|
|
117 $F[$srcs_column] .= "; ~$sample_name"; # add this sample to the last column
|
|
118 $F[$zygosity_column] .= "; low"; # zygosity unknown
|
|
119 $F[$alt_cnt_column] .= "; 0"; # alt count
|
|
120 $F[$tot_cnt_column] .= "; 0"; # tot count
|
|
121 #print OUT join("\t", @F), "\n";
|
|
122 last;
|
|
123 }
|
|
124 my %base_count;
|
|
125 while($M[4] =~ /[ACGT]|\^.|(-\d+|\+\d+)/gi){
|
|
126 my $call = uc($&); # upper case version of letter
|
|
127 if($1){ # call is -1n, or +3gcc, etc.
|
|
128 if(substr($1, 0, 1) eq "+"){ # insertion
|
|
129 my $offset = substr($1, 1);
|
|
130 $call .= uc(substr($M[4], pos($M[4]), $offset)); # append the letters (not captures in the regex above) to the number
|
|
131 pos($M[4]) += $offset; # skip the global search ahead of the letters just consumed
|
|
132 }
|
|
133 }
|
|
134 elsif($call eq '$' or length($call) > 1){ # the weird ^Y kind of call
|
|
135 next; # ignore
|
|
136 }
|
|
137 $base_count{$call}++;
|
|
138 }
|
|
139 my $base_count = 0;
|
|
140 if(length($ref) == 1 and length($alt) == 1){ #SNP
|
|
141 if(exists $base_count{$alt}){
|
|
142 $base_count = $base_count{$alt};
|
|
143 }
|
|
144 }
|
|
145 elsif(length($ref) > length($alt)){ #del
|
|
146 my $del_length = length($alt)-length($ref); # e.g. -2
|
|
147 if(exists $base_count{$del_length}){
|
|
148 $base_count = $base_count{$del_length};
|
|
149 }
|
|
150 }
|
|
151 elsif(length($ref) < length($alt)){ #ins
|
|
152 my $ins_call = "+".(length($alt)-length($ref)).substr($alt, length($ref)); # e.g. +3AGT
|
|
153 if(exists $base_count{$ins_call}){
|
|
154 $base_count = $base_count{$ins_call};
|
|
155 }
|
|
156 }
|
|
157 else{
|
|
158 # MNP variant?
|
|
159 #warn "Cannot check $ref -> $alt: not a SNP insertion or deletion\n";
|
|
160 next;
|
|
161 }
|
|
162 if(defined $base_count and $base_count and $depth/($base_count+1) < $het_fn){ # at least the min het prop (rounded up) of bases agreeing with the (missed in this sample) alt call
|
|
163 $F[$srcs_column] .= "; +$sample_name"; # add this sample to the last column
|
|
164 $F[$zygosity_column] .= $base_count/$depth < 0.78 ? "; heterozygote" : ($depth < $het_fn ? "; low" : "; homozygote"); # zygosity
|
|
165 $F[$alt_cnt_column] .= "; ".$base_count; # alt count
|
|
166 $F[$tot_cnt_column] .= "; $depth"; # tot count
|
|
167 }
|
|
168 elsif($zygosity =~ /het/ and $depth < $het_fn){
|
|
169 $F[$srcs_column].= "; ~$sample_name";
|
|
170 $F[$zygosity_column] .= $depth < $homo_fn ? "; low" : "; heterozygote"; # zygosity
|
|
171 $F[$alt_cnt_column] .= "; $base_count";
|
|
172 $F[$tot_cnt_column] .= "; $depth"; # tot count
|
|
173 }
|
|
174 elsif($depth < $homo_fn){
|
|
175 $F[$srcs_column].= "; ~$sample_name";
|
|
176 $F[$zygosity_column] .= "; low"; # zygosity
|
|
177 $F[$alt_cnt_column] .= "; $base_count";
|
|
178 $F[$tot_cnt_column] .= "; $depth"; # tot count
|
|
179 }
|
|
180 }
|
|
181 print OUT join("\t", @F), "\n";
|
|
182 }
|
|
183 }
|
|
184 close(HGVS);
|
|
185 close(OUT);
|