Mercurial > repos > yusuf > concordance_report
view kappa_report @ 0:2d601bd04c93 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:28:12 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl use strict; use warnings; my $quiet = 0; if(@ARGV and $ARGV[0] =~ /^-q/){ $quiet = 1; shift @ARGV; } # Calculates Cohen's kappa measure for inter-annotator agreement, here for two genotype callers # SPECIAL ATTENTION MUST BE PAID to include or exclude calls using depth of coverage information, and only look at targeted regions (e.g. exome analysis) # Also, the probability of a non-call must be incorporated, as there is going to be 99.9% agreement by default due to the low rate of # SNPs in the genome in human, for example. @ARGV == 9 or die "Usage: $0 [-q(uiet)] <exome targets.bed> <coding regions.gtf> <genotypes1.hgvs.txt> <bam1> <coverage cutoff1> <genotypes2.hgvs.txt> <bam2> <coverage cutoff2> <output report.txt>\n"; my $targets_file = shift @ARGV; my $coding_file = shift @ARGV; my $calls1_file = shift @ARGV; my $bam_file1 = shift @ARGV; my $calls1_min_depth = shift @ARGV; my $calls2_file = shift @ARGV; my $bam_file2 = shift @ARGV; my $calls2_min_depth = shift @ARGV; my $output_file = shift @ARGV; my %reporting; print STDERR "Reading in target regions list...\n" unless $quiet; open(BED, $targets_file) or die "Cannot open target BED file $targets_file for reading: $!\n"; while(<BED>){ next if /^(\s*#|track=)/; chomp; my @fields = split /\t/, $_; if(not exists $reporting{$fields[0]}){ $reporting{$fields[0]} = [[$fields[1], $fields[2]]]; next; } push @{$reporting{$fields[0]}}, [$fields[1], $fields[2]]; } close(BED); for my $c (keys %reporting){ $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; } my %coding; print STDERR "Reading in coding regions list...\n" unless $quiet; open(GTF, $coding_file) or die "Cannot open $coding_file for reading: $!\n"; while(<GTF>){ next if /^\s*#/; my @fields = split /\t/, $_; if($fields[2] eq "CDS"){ if(not exists $coding{$fields[0]}){ $coding{$fields[0]} = [[$fields[3], $fields[4]]]; next; } push @{$coding{$fields[0]}}, [$fields[3], $fields[4]]; } } close(GTF); for my $c (keys %coding){ $coding{$c} = [sort {$a->[0] <=> $b->[0]} @{$coding{$c}}]; } print STDERR "Reading in genotype calls...\n" unless $quiet; open(OUT, ">$output_file") or die "Cannot open $output_file for writing: $!\n"; my ($calls1, $subpar_calls1) = retrieve_calls($calls1_file, \%reporting, \%coding, $calls1_min_depth); my ($calls2, $subpar_calls2) = retrieve_calls($calls2_file, \%reporting, \%coding, $calls2_min_depth); my $total_calls1 = 0; for my $chr (keys %$calls1){ $total_calls1 += scalar(keys %{$calls1->{$chr}}); } my $total_calls2 = 0; for my $chr (keys %$calls2){ $total_calls2 += scalar(keys %{$calls2->{$chr}}); } print OUT "Total Method 1 calls\t$total_calls1\n"; print OUT "Total Method 2 calls\t$total_calls2\n"; print STDERR "Comparing genotype calls...\n" unless $quiet; # Go through the BAM file in all targeted locations to see if there is a reference or variant call (HGVS file only shows variants) my $bases_diff = 0; my $bases_same = 0; my $zygosities_diff = 0; my $zygosities_same = 0; for my $chr (keys %$calls1){ for my $pos (keys %{$calls1->{$chr}}){ if(exists $calls2->{$chr}->{$pos}){ # same variant base if($calls2->{$chr}->{$pos}->[0] eq $calls1->{$chr}->{$pos}->[0]){ $bases_same++; if($calls1->{$chr}->{$pos}->[1] eq $calls2->{$chr}->{$pos}->[1]){ $zygosities_same++; } else{ $zygosities_diff++; } } # diff variant base else{ $bases_diff++; } delete $calls1->{$chr}->{$pos}; # so in the end all that's left is method-1-only calls in the hash delete $calls2->{$chr}->{$pos}; # so in the end all that's left is method-2-only calls in the hash delete $subpar_calls1->{$chr}->{$pos} if exists $subpar_calls1->{$chr}; # so in the end all that's left is method-1-only calls in the hash delete $subpar_calls2->{$chr}->{$pos} if exists $subpar_calls2->{$chr}; # so in the end all that's left is method-2-only calls in the hash } } } print OUT "Same calls\t$zygosities_same\n"; print OUT "Diff zygosity\t$zygosities_diff\n"; print OUT "Diff bases\t$bases_diff\n"; my $covbed = "$$.bed"; my $num_to_check = 0; open(TMPCOV, ">$covbed") or die "Cannot open temporary BED file $covbed for samtools: $!\n"; for my $chr (keys %$calls1){ for my $pos (keys %{$calls1->{$chr}}){ if(exists $subpar_calls2->{$chr} and exists $subpar_calls2->{$chr}->{$pos}){ next; #exclude counting positions with sufficent coverage in one analysis, but not the other } else{ $num_to_check++; print TMPCOV "$chr\t$pos\n"; } } } close(TMPCOV); print STDERR "Checking $num_to_check read depths for missing calls in $bam_file2...\n" unless $quiet; my $remaining_calls1_homo = 0; my $remaining_calls1_het = 0; my $remaining_calls1_novel = 0; my $remaining_calls1_novel_homo = 0; open(SAM, "samtools mpileup -l $covbed -I $bam_file2 2>/dev/null |") or die "Cannot run samtools: $!\n"; while(<SAM>){ my @B = split /\t/, $_; my $depth = @B > 1 ? $B[3] : 0; # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. my $called_base = @B > 1 ? $B[2] : "N"; if($depth and $called_base eq $calls1->{$B[0]}->{$B[1]}->[0]){ next; } if($depth >= $calls2_min_depth){ # have good coverage in sample 2, but didn't make a SNP call like in sample 1 if($calls1->{$B[0]}->{$B[1]}->[1] =~ /het/){ $remaining_calls1_het++; } else{ $remaining_calls1_homo++; } if($calls1->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls1->{$B[0]}->{$B[1]}->[3] < 0.001){ $remaining_calls1_novel++; $remaining_calls1_novel_homo++ if $calls1->{$B[0]}->{$B[1]}->[1] !~ /het/; } else{ #print STDERR "Got method 1 only non-novel call: ", join("/", @{$calls1->{$B[0]}->{$B[1]}}), "\n"; } } } print OUT "Method 1 only\t",($remaining_calls1_homo+$remaining_calls1_het),"\n"; printf OUT "Method 1 only, het %%\t%.1f\n",$remaining_calls1_het/($remaining_calls1_homo+$remaining_calls1_het)*100; printf OUT "Method 1 only, novel %%\t%.1f\n",$remaining_calls1_novel/($remaining_calls1_homo+$remaining_calls1_het)*100; printf OUT "Method 1 only, novel, homo %%\t%.1f\n",$remaining_calls1_novel_homo/$remaining_calls1_novel*100; $num_to_check = 0; open(TMPCOV, ">$covbed") or die "Cannot open temporary BED file $covbed for samtools: $!\n"; for my $chr (keys %$calls2){ for my $pos (keys %{$calls2->{$chr}}){ if(exists $subpar_calls1->{$chr} and exists $subpar_calls1->{$chr}->{$pos}){ next; #exclude counting positions with sufficent coverage in one analysis, but not the other } else{ $num_to_check++; print TMPCOV "$chr\t$pos\n"; } } } close(TMPCOV); print STDERR "Checking $num_to_check read depths for missing calls in $bam_file1...\n" unless $quiet; my $remaining_calls2_het = 0; my $remaining_calls2_homo = 0; my $remaining_calls2_novel = 0; my $remaining_calls2_novel_homo = 0; open(SAM, "samtools mpileup -l $covbed -I $bam_file1 2>/dev/null |") or die "Cannot run samtools: $!\n"; while(<SAM>){ my @B = split /\t/, $_; my $depth = @B > 1 ? $B[3] : 0; # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. my $called_base = @B > 1 ? $B[2] : "N"; if($depth and $called_base eq $calls2->{$B[0]}->{$B[1]}->[0]){ next; } if($depth >= $calls1_min_depth){ # have good coverage in sample 1, but didn't make a SNP call like in sample 2 if($calls2->{$B[0]}->{$B[1]}->[1] =~ /het/){ $remaining_calls2_het++; } else{ $remaining_calls2_homo++; } if($calls2->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls2->{$B[0]}->{$B[1]}->[3] < 0.001){ $remaining_calls2_novel++; $remaining_calls2_novel_homo++ if $calls2->{$B[0]}->{$B[1]}->[1] !~ /het/; } } } print OUT "Method 2 only\t",($remaining_calls2_homo+$remaining_calls2_het),"\n"; printf OUT "Method 2 only, het %%\t%.1f\n",$remaining_calls2_het/($remaining_calls2_homo+$remaining_calls2_het)*100; printf OUT "Method 2 only, novel %%\t%.1f\n",$remaining_calls2_novel/($remaining_calls2_homo+$remaining_calls2_het)*100; printf OUT "Method 2 only, novel, homo %%\t%.1f\n",$remaining_calls2_novel_homo/$remaining_calls2_novel*100; my $total_calls = $zygosities_same+$zygosities_diff+$bases_diff+$remaining_calls1_homo+$remaining_calls1_het+$remaining_calls2_homo+$remaining_calls2_het; printf OUT "Total coding sequence genotype concordance%%\t%.1f\n", $zygosities_same/$total_calls*100; printf OUT "Total coding sequence zygosity concordance%%\t%.1f\n", 100-$zygosities_diff/$total_calls*100; unlink $covbed; sub retrieve_calls{ my ($hgvs_file, $reporting, $coding, $coverage_cutoff) = @_; my %calls; my %bad_coverage; open(HGVS, $hgvs_file) or die "Cannot open the first genotypes file $hgvs_file for reading: $!\n"; <HGVS>; # header while(<HGVS>){ my @F = split /\t/, $_; my $HGVS = $F[2]; next unless $HGVS =~ />([ACGT])/; # only look at SNPs my $new_base = $1; $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS my $chr = $F[4]; my $pos = $F[5]; my $zygosity = $F[6]; my $coverage = $F[9]; my $maf = $F[14]; if($coverage_cutoff > $coverage or $F[16] ne ""){ $bad_coverage{$chr} = {} unless exists $bad_coverage{$chr}; $bad_coverage{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf] unless exists $bad_coverage{$chr}->{$pos}; next; } if(exists $reporting->{$chr}){ # in a region reported in the annotation, and it's protein coding? my $in_region = 0; my $arrayref = $reporting->{$chr}; for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ my $interval = $arrayref->[$search_index]; if($pos >= $interval->[0] and $pos <= $interval->[1]){ $in_region = 1; last; } last if $pos < $interval->[0]; } next unless $in_region; $in_region = 0; $arrayref = $coding->{$chr}; for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ my $interval = $arrayref->[$search_index]; if($pos >= $interval->[0] and $pos <= $interval->[1]){ $in_region = 1; last; } last if $pos < $interval->[0]; } next unless $in_region; } if(not exists $calls{$chr}){ $calls{$chr} = {}; } next if exists $calls{$chr}->{$pos}; $calls{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf]; } close(HGVS); return (\%calls, \%bad_coverage); } sub find_earliest_index{ # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start my ($query, $array) = @_; return 0 if $query < $array->[0]->[0]; my ($left, $right, $prevCenter) = (0, $#$array, -1); while(1) { my $center = int (($left + $right)/2); my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); return $center if $cmp == 0; if ($center == $prevCenter) { return $left; } $right = $center if $cmp < 0; $left = $center if $cmp > 0; $prevCenter = $center; } }