Mercurial > repos > yusuf > concordance_report
comparison kappa_report @ 0:2d601bd04c93 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:28:12 -0600 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:2d601bd04c93 |
---|---|
1 #!/usr/bin/env perl | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 | |
7 my $quiet = 0; | |
8 if(@ARGV and $ARGV[0] =~ /^-q/){ | |
9 $quiet = 1; | |
10 shift @ARGV; | |
11 } | |
12 # Calculates Cohen's kappa measure for inter-annotator agreement, here for two genotype callers | |
13 # SPECIAL ATTENTION MUST BE PAID to include or exclude calls using depth of coverage information, and only look at targeted regions (e.g. exome analysis) | |
14 # Also, the probability of a non-call must be incorporated, as there is going to be 99.9% agreement by default due to the low rate of | |
15 # SNPs in the genome in human, for example. | |
16 @ARGV == 9 or die "Usage: $0 [-q(uiet)] <exome targets.bed> <coding regions.gtf> <genotypes1.hgvs.txt> <bam1> <coverage cutoff1> <genotypes2.hgvs.txt> <bam2> <coverage cutoff2> <output report.txt>\n"; | |
17 | |
18 my $targets_file = shift @ARGV; | |
19 my $coding_file = shift @ARGV; | |
20 my $calls1_file = shift @ARGV; | |
21 my $bam_file1 = shift @ARGV; | |
22 my $calls1_min_depth = shift @ARGV; | |
23 my $calls2_file = shift @ARGV; | |
24 my $bam_file2 = shift @ARGV; | |
25 my $calls2_min_depth = shift @ARGV; | |
26 my $output_file = shift @ARGV; | |
27 | |
28 my %reporting; | |
29 print STDERR "Reading in target regions list...\n" unless $quiet; | |
30 open(BED, $targets_file) | |
31 or die "Cannot open target BED file $targets_file for reading: $!\n"; | |
32 while(<BED>){ | |
33 next if /^(\s*#|track=)/; | |
34 chomp; | |
35 my @fields = split /\t/, $_; | |
36 | |
37 if(not exists $reporting{$fields[0]}){ | |
38 $reporting{$fields[0]} = [[$fields[1], $fields[2]]]; | |
39 next; | |
40 } | |
41 push @{$reporting{$fields[0]}}, [$fields[1], $fields[2]]; | |
42 } | |
43 close(BED); | |
44 for my $c (keys %reporting){ | |
45 $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; | |
46 } | |
47 | |
48 my %coding; | |
49 print STDERR "Reading in coding regions list...\n" unless $quiet; | |
50 open(GTF, $coding_file) | |
51 or die "Cannot open $coding_file for reading: $!\n"; | |
52 while(<GTF>){ | |
53 next if /^\s*#/; | |
54 my @fields = split /\t/, $_; | |
55 | |
56 if($fields[2] eq "CDS"){ | |
57 if(not exists $coding{$fields[0]}){ | |
58 $coding{$fields[0]} = [[$fields[3], $fields[4]]]; | |
59 next; | |
60 } | |
61 push @{$coding{$fields[0]}}, [$fields[3], $fields[4]]; | |
62 } | |
63 } | |
64 close(GTF); | |
65 for my $c (keys %coding){ | |
66 $coding{$c} = [sort {$a->[0] <=> $b->[0]} @{$coding{$c}}]; | |
67 } | |
68 | |
69 print STDERR "Reading in genotype calls...\n" unless $quiet; | |
70 open(OUT, ">$output_file") | |
71 or die "Cannot open $output_file for writing: $!\n"; | |
72 my ($calls1, $subpar_calls1) = retrieve_calls($calls1_file, \%reporting, \%coding, $calls1_min_depth); | |
73 my ($calls2, $subpar_calls2) = retrieve_calls($calls2_file, \%reporting, \%coding, $calls2_min_depth); | |
74 | |
75 my $total_calls1 = 0; | |
76 for my $chr (keys %$calls1){ | |
77 $total_calls1 += scalar(keys %{$calls1->{$chr}}); | |
78 } | |
79 my $total_calls2 = 0; | |
80 for my $chr (keys %$calls2){ | |
81 $total_calls2 += scalar(keys %{$calls2->{$chr}}); | |
82 } | |
83 | |
84 print OUT "Total Method 1 calls\t$total_calls1\n"; | |
85 print OUT "Total Method 2 calls\t$total_calls2\n"; | |
86 print STDERR "Comparing genotype calls...\n" unless $quiet; | |
87 # Go through the BAM file in all targeted locations to see if there is a reference or variant call (HGVS file only shows variants) | |
88 my $bases_diff = 0; | |
89 my $bases_same = 0; | |
90 my $zygosities_diff = 0; | |
91 my $zygosities_same = 0; | |
92 for my $chr (keys %$calls1){ | |
93 for my $pos (keys %{$calls1->{$chr}}){ | |
94 if(exists $calls2->{$chr}->{$pos}){ | |
95 # same variant base | |
96 if($calls2->{$chr}->{$pos}->[0] eq $calls1->{$chr}->{$pos}->[0]){ | |
97 $bases_same++; | |
98 if($calls1->{$chr}->{$pos}->[1] eq $calls2->{$chr}->{$pos}->[1]){ | |
99 $zygosities_same++; | |
100 } | |
101 else{ | |
102 $zygosities_diff++; | |
103 } | |
104 } | |
105 # diff variant base | |
106 else{ | |
107 $bases_diff++; | |
108 } | |
109 delete $calls1->{$chr}->{$pos}; # so in the end all that's left is method-1-only calls in the hash | |
110 delete $calls2->{$chr}->{$pos}; # so in the end all that's left is method-2-only calls in the hash | |
111 delete $subpar_calls1->{$chr}->{$pos} if exists $subpar_calls1->{$chr}; # so in the end all that's left is method-1-only calls in the hash | |
112 delete $subpar_calls2->{$chr}->{$pos} if exists $subpar_calls2->{$chr}; # so in the end all that's left is method-2-only calls in the hash | |
113 } | |
114 } | |
115 } | |
116 | |
117 print OUT "Same calls\t$zygosities_same\n"; | |
118 print OUT "Diff zygosity\t$zygosities_diff\n"; | |
119 print OUT "Diff bases\t$bases_diff\n"; | |
120 | |
121 my $covbed = "$$.bed"; | |
122 my $num_to_check = 0; | |
123 open(TMPCOV, ">$covbed") | |
124 or die "Cannot open temporary BED file $covbed for samtools: $!\n"; | |
125 for my $chr (keys %$calls1){ | |
126 for my $pos (keys %{$calls1->{$chr}}){ | |
127 if(exists $subpar_calls2->{$chr} and exists $subpar_calls2->{$chr}->{$pos}){ | |
128 next; #exclude counting positions with sufficent coverage in one analysis, but not the other | |
129 } | |
130 else{ | |
131 $num_to_check++; | |
132 print TMPCOV "$chr\t$pos\n"; | |
133 } | |
134 } | |
135 } | |
136 close(TMPCOV); | |
137 | |
138 print STDERR "Checking $num_to_check read depths for missing calls in $bam_file2...\n" unless $quiet; | |
139 my $remaining_calls1_homo = 0; | |
140 my $remaining_calls1_het = 0; | |
141 my $remaining_calls1_novel = 0; | |
142 my $remaining_calls1_novel_homo = 0; | |
143 open(SAM, "samtools mpileup -l $covbed -I $bam_file2 2>/dev/null |") | |
144 or die "Cannot run samtools: $!\n"; | |
145 while(<SAM>){ | |
146 my @B = split /\t/, $_; | |
147 my $depth = @B > 1 ? $B[3] : 0; | |
148 | |
149 # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. | |
150 my $called_base = @B > 1 ? $B[2] : "N"; | |
151 if($depth and $called_base eq $calls1->{$B[0]}->{$B[1]}->[0]){ | |
152 next; | |
153 } | |
154 | |
155 if($depth >= $calls2_min_depth){ # have good coverage in sample 2, but didn't make a SNP call like in sample 1 | |
156 if($calls1->{$B[0]}->{$B[1]}->[1] =~ /het/){ | |
157 $remaining_calls1_het++; | |
158 } | |
159 else{ | |
160 $remaining_calls1_homo++; | |
161 } | |
162 if($calls1->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls1->{$B[0]}->{$B[1]}->[3] < 0.001){ | |
163 $remaining_calls1_novel++; | |
164 $remaining_calls1_novel_homo++ if $calls1->{$B[0]}->{$B[1]}->[1] !~ /het/; | |
165 } | |
166 else{ | |
167 #print STDERR "Got method 1 only non-novel call: ", join("/", @{$calls1->{$B[0]}->{$B[1]}}), "\n"; | |
168 } | |
169 } | |
170 } | |
171 print OUT "Method 1 only\t",($remaining_calls1_homo+$remaining_calls1_het),"\n"; | |
172 printf OUT "Method 1 only, het %%\t%.1f\n",$remaining_calls1_het/($remaining_calls1_homo+$remaining_calls1_het)*100; | |
173 printf OUT "Method 1 only, novel %%\t%.1f\n",$remaining_calls1_novel/($remaining_calls1_homo+$remaining_calls1_het)*100; | |
174 printf OUT "Method 1 only, novel, homo %%\t%.1f\n",$remaining_calls1_novel_homo/$remaining_calls1_novel*100; | |
175 | |
176 $num_to_check = 0; | |
177 open(TMPCOV, ">$covbed") | |
178 or die "Cannot open temporary BED file $covbed for samtools: $!\n"; | |
179 for my $chr (keys %$calls2){ | |
180 for my $pos (keys %{$calls2->{$chr}}){ | |
181 if(exists $subpar_calls1->{$chr} and exists $subpar_calls1->{$chr}->{$pos}){ | |
182 next; #exclude counting positions with sufficent coverage in one analysis, but not the other | |
183 } | |
184 else{ | |
185 $num_to_check++; | |
186 print TMPCOV "$chr\t$pos\n"; | |
187 } | |
188 } | |
189 } | |
190 close(TMPCOV); | |
191 | |
192 print STDERR "Checking $num_to_check read depths for missing calls in $bam_file1...\n" unless $quiet; | |
193 my $remaining_calls2_het = 0; | |
194 my $remaining_calls2_homo = 0; | |
195 my $remaining_calls2_novel = 0; | |
196 my $remaining_calls2_novel_homo = 0; | |
197 open(SAM, "samtools mpileup -l $covbed -I $bam_file1 2>/dev/null |") | |
198 or die "Cannot run samtools: $!\n"; | |
199 while(<SAM>){ | |
200 my @B = split /\t/, $_; | |
201 my $depth = @B > 1 ? $B[3] : 0; | |
202 | |
203 # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. | |
204 my $called_base = @B > 1 ? $B[2] : "N"; | |
205 if($depth and $called_base eq $calls2->{$B[0]}->{$B[1]}->[0]){ | |
206 next; | |
207 } | |
208 | |
209 if($depth >= $calls1_min_depth){ # have good coverage in sample 1, but didn't make a SNP call like in sample 2 | |
210 if($calls2->{$B[0]}->{$B[1]}->[1] =~ /het/){ | |
211 $remaining_calls2_het++; | |
212 } | |
213 else{ | |
214 $remaining_calls2_homo++; | |
215 } | |
216 if($calls2->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls2->{$B[0]}->{$B[1]}->[3] < 0.001){ | |
217 $remaining_calls2_novel++; | |
218 $remaining_calls2_novel_homo++ if $calls2->{$B[0]}->{$B[1]}->[1] !~ /het/; | |
219 } | |
220 } | |
221 } | |
222 | |
223 print OUT "Method 2 only\t",($remaining_calls2_homo+$remaining_calls2_het),"\n"; | |
224 printf OUT "Method 2 only, het %%\t%.1f\n",$remaining_calls2_het/($remaining_calls2_homo+$remaining_calls2_het)*100; | |
225 printf OUT "Method 2 only, novel %%\t%.1f\n",$remaining_calls2_novel/($remaining_calls2_homo+$remaining_calls2_het)*100; | |
226 printf OUT "Method 2 only, novel, homo %%\t%.1f\n",$remaining_calls2_novel_homo/$remaining_calls2_novel*100; | |
227 my $total_calls = $zygosities_same+$zygosities_diff+$bases_diff+$remaining_calls1_homo+$remaining_calls1_het+$remaining_calls2_homo+$remaining_calls2_het; | |
228 printf OUT "Total coding sequence genotype concordance%%\t%.1f\n", $zygosities_same/$total_calls*100; | |
229 printf OUT "Total coding sequence zygosity concordance%%\t%.1f\n", 100-$zygosities_diff/$total_calls*100; | |
230 | |
231 unlink $covbed; | |
232 | |
233 sub retrieve_calls{ | |
234 my ($hgvs_file, $reporting, $coding, $coverage_cutoff) = @_; | |
235 my %calls; | |
236 my %bad_coverage; | |
237 open(HGVS, $hgvs_file) | |
238 or die "Cannot open the first genotypes file $hgvs_file for reading: $!\n"; | |
239 <HGVS>; # header | |
240 while(<HGVS>){ | |
241 my @F = split /\t/, $_; | |
242 my $HGVS = $F[2]; | |
243 next unless $HGVS =~ />([ACGT])/; # only look at SNPs | |
244 my $new_base = $1; | |
245 $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS | |
246 my $chr = $F[4]; | |
247 my $pos = $F[5]; | |
248 my $zygosity = $F[6]; | |
249 my $coverage = $F[9]; | |
250 my $maf = $F[14]; | |
251 if($coverage_cutoff > $coverage or $F[16] ne ""){ | |
252 $bad_coverage{$chr} = {} unless exists $bad_coverage{$chr}; | |
253 $bad_coverage{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf] unless exists $bad_coverage{$chr}->{$pos}; | |
254 next; | |
255 } | |
256 | |
257 if(exists $reporting->{$chr}){ # in a region reported in the annotation, and it's protein coding? | |
258 my $in_region = 0; | |
259 my $arrayref = $reporting->{$chr}; | |
260 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | |
261 my $interval = $arrayref->[$search_index]; | |
262 if($pos >= $interval->[0] and $pos <= $interval->[1]){ | |
263 $in_region = 1; last; | |
264 } | |
265 last if $pos < $interval->[0]; | |
266 } | |
267 next unless $in_region; | |
268 $in_region = 0; | |
269 $arrayref = $coding->{$chr}; | |
270 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | |
271 my $interval = $arrayref->[$search_index]; | |
272 if($pos >= $interval->[0] and $pos <= $interval->[1]){ | |
273 $in_region = 1; last; | |
274 } | |
275 last if $pos < $interval->[0]; | |
276 } | |
277 next unless $in_region; | |
278 } | |
279 | |
280 if(not exists $calls{$chr}){ | |
281 $calls{$chr} = {}; | |
282 } | |
283 next if exists $calls{$chr}->{$pos}; | |
284 $calls{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf]; | |
285 } | |
286 close(HGVS); | |
287 return (\%calls, \%bad_coverage); | |
288 } | |
289 | |
290 | |
291 sub find_earliest_index{ | |
292 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start | |
293 my ($query, $array) = @_; | |
294 | |
295 return 0 if $query < $array->[0]->[0]; | |
296 | |
297 my ($left, $right, $prevCenter) = (0, $#$array, -1); | |
298 | |
299 while(1) { | |
300 my $center = int (($left + $right)/2); | |
301 | |
302 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); | |
303 | |
304 return $center if $cmp == 0; | |
305 if ($center == $prevCenter) { | |
306 return $left; | |
307 } | |
308 $right = $center if $cmp < 0; | |
309 $left = $center if $cmp > 0; | |
310 $prevCenter = $center; | |
311 } | |
312 } | |
313 |