# HG changeset patch # User Yusuf Ali # Date 1427311692 21600 # Node ID 2d601bd04c931487c2fe3bb7ce223427164d664c initial commit diff -r 000000000000 -r 2d601bd04c93 ConcordanceReport.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/ConcordanceReport.xml Wed Mar 25 13:28:12 2015 -0600 @@ -0,0 +1,32 @@ + + + + over targeted regions in a genome + kappa_report -v + kappa_report -q $target_bed $coding_gtf $input_ngs_calls1 $input_bam1 $coverage_cutoff1 $input_ngs_calls2 $input_bam2 $coverage_cutoff2 $out_summary_table + + + + + + + + + + + + + + + + + + +**What it does** + +This tool reports several statistics comparing the genotypes derived from two next-gen sequencing runs. This is intended to measure methodological consistency, +assuming the samples for both methods are from the same individual. When coverage is good (typically 10x for colorspace, 20x for base space), there should be a large +(>= 95%) overlap between the calls if they are reasonably accurate. + + + diff -r 000000000000 -r 2d601bd04c93 kappa_report --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/kappa_report Wed Mar 25 13:28:12 2015 -0600 @@ -0,0 +1,313 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} +# Calculates Cohen's kappa measure for inter-annotator agreement, here for two genotype callers +# SPECIAL ATTENTION MUST BE PAID to include or exclude calls using depth of coverage information, and only look at targeted regions (e.g. exome analysis) +# Also, the probability of a non-call must be incorporated, as there is going to be 99.9% agreement by default due to the low rate of +# SNPs in the genome in human, for example. +@ARGV == 9 or die "Usage: $0 [-q(uiet)] \n"; + +my $targets_file = shift @ARGV; +my $coding_file = shift @ARGV; +my $calls1_file = shift @ARGV; +my $bam_file1 = shift @ARGV; +my $calls1_min_depth = shift @ARGV; +my $calls2_file = shift @ARGV; +my $bam_file2 = shift @ARGV; +my $calls2_min_depth = shift @ARGV; +my $output_file = shift @ARGV; + +my %reporting; +print STDERR "Reading in target regions list...\n" unless $quiet; +open(BED, $targets_file) + or die "Cannot open target BED file $targets_file for reading: $!\n"; +while(){ + next if /^(\s*#|track=)/; + chomp; + my @fields = split /\t/, $_; + + if(not exists $reporting{$fields[0]}){ + $reporting{$fields[0]} = [[$fields[1], $fields[2]]]; + next; + } + push @{$reporting{$fields[0]}}, [$fields[1], $fields[2]]; +} +close(BED); +for my $c (keys %reporting){ + $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; +} + +my %coding; +print STDERR "Reading in coding regions list...\n" unless $quiet; +open(GTF, $coding_file) + or die "Cannot open $coding_file for reading: $!\n"; +while(){ + next if /^\s*#/; + my @fields = split /\t/, $_; + + if($fields[2] eq "CDS"){ + if(not exists $coding{$fields[0]}){ + $coding{$fields[0]} = [[$fields[3], $fields[4]]]; + next; + } + push @{$coding{$fields[0]}}, [$fields[3], $fields[4]]; + } +} +close(GTF); +for my $c (keys %coding){ + $coding{$c} = [sort {$a->[0] <=> $b->[0]} @{$coding{$c}}]; +} + +print STDERR "Reading in genotype calls...\n" unless $quiet; +open(OUT, ">$output_file") + or die "Cannot open $output_file for writing: $!\n"; +my ($calls1, $subpar_calls1) = retrieve_calls($calls1_file, \%reporting, \%coding, $calls1_min_depth); +my ($calls2, $subpar_calls2) = retrieve_calls($calls2_file, \%reporting, \%coding, $calls2_min_depth); + +my $total_calls1 = 0; +for my $chr (keys %$calls1){ + $total_calls1 += scalar(keys %{$calls1->{$chr}}); +} +my $total_calls2 = 0; +for my $chr (keys %$calls2){ + $total_calls2 += scalar(keys %{$calls2->{$chr}}); +} + +print OUT "Total Method 1 calls\t$total_calls1\n"; +print OUT "Total Method 2 calls\t$total_calls2\n"; +print STDERR "Comparing genotype calls...\n" unless $quiet; +# Go through the BAM file in all targeted locations to see if there is a reference or variant call (HGVS file only shows variants) +my $bases_diff = 0; +my $bases_same = 0; +my $zygosities_diff = 0; +my $zygosities_same = 0; +for my $chr (keys %$calls1){ + for my $pos (keys %{$calls1->{$chr}}){ + if(exists $calls2->{$chr}->{$pos}){ + # same variant base + if($calls2->{$chr}->{$pos}->[0] eq $calls1->{$chr}->{$pos}->[0]){ + $bases_same++; + if($calls1->{$chr}->{$pos}->[1] eq $calls2->{$chr}->{$pos}->[1]){ + $zygosities_same++; + } + else{ + $zygosities_diff++; + } + } + # diff variant base + else{ + $bases_diff++; + } + delete $calls1->{$chr}->{$pos}; # so in the end all that's left is method-1-only calls in the hash + delete $calls2->{$chr}->{$pos}; # so in the end all that's left is method-2-only calls in the hash + delete $subpar_calls1->{$chr}->{$pos} if exists $subpar_calls1->{$chr}; # so in the end all that's left is method-1-only calls in the hash + delete $subpar_calls2->{$chr}->{$pos} if exists $subpar_calls2->{$chr}; # so in the end all that's left is method-2-only calls in the hash + } + } +} + +print OUT "Same calls\t$zygosities_same\n"; +print OUT "Diff zygosity\t$zygosities_diff\n"; +print OUT "Diff bases\t$bases_diff\n"; + +my $covbed = "$$.bed"; +my $num_to_check = 0; +open(TMPCOV, ">$covbed") + or die "Cannot open temporary BED file $covbed for samtools: $!\n"; +for my $chr (keys %$calls1){ + for my $pos (keys %{$calls1->{$chr}}){ + if(exists $subpar_calls2->{$chr} and exists $subpar_calls2->{$chr}->{$pos}){ + next; #exclude counting positions with sufficent coverage in one analysis, but not the other + } + else{ + $num_to_check++; + print TMPCOV "$chr\t$pos\n"; + } + } +} +close(TMPCOV); + +print STDERR "Checking $num_to_check read depths for missing calls in $bam_file2...\n" unless $quiet; +my $remaining_calls1_homo = 0; +my $remaining_calls1_het = 0; +my $remaining_calls1_novel = 0; +my $remaining_calls1_novel_homo = 0; +open(SAM, "samtools mpileup -l $covbed -I $bam_file2 2>/dev/null |") + or die "Cannot run samtools: $!\n"; +while(){ + my @B = split /\t/, $_; + my $depth = @B > 1 ? $B[3] : 0; + + # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. + my $called_base = @B > 1 ? $B[2] : "N"; + if($depth and $called_base eq $calls1->{$B[0]}->{$B[1]}->[0]){ + next; + } + + if($depth >= $calls2_min_depth){ # have good coverage in sample 2, but didn't make a SNP call like in sample 1 + if($calls1->{$B[0]}->{$B[1]}->[1] =~ /het/){ + $remaining_calls1_het++; + } + else{ + $remaining_calls1_homo++; + } + if($calls1->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls1->{$B[0]}->{$B[1]}->[3] < 0.001){ + $remaining_calls1_novel++; + $remaining_calls1_novel_homo++ if $calls1->{$B[0]}->{$B[1]}->[1] !~ /het/; + } + else{ + #print STDERR "Got method 1 only non-novel call: ", join("/", @{$calls1->{$B[0]}->{$B[1]}}), "\n"; + } + } +} +print OUT "Method 1 only\t",($remaining_calls1_homo+$remaining_calls1_het),"\n"; +printf OUT "Method 1 only, het %%\t%.1f\n",$remaining_calls1_het/($remaining_calls1_homo+$remaining_calls1_het)*100; +printf OUT "Method 1 only, novel %%\t%.1f\n",$remaining_calls1_novel/($remaining_calls1_homo+$remaining_calls1_het)*100; +printf OUT "Method 1 only, novel, homo %%\t%.1f\n",$remaining_calls1_novel_homo/$remaining_calls1_novel*100; + +$num_to_check = 0; +open(TMPCOV, ">$covbed") + or die "Cannot open temporary BED file $covbed for samtools: $!\n"; +for my $chr (keys %$calls2){ + for my $pos (keys %{$calls2->{$chr}}){ + if(exists $subpar_calls1->{$chr} and exists $subpar_calls1->{$chr}->{$pos}){ + next; #exclude counting positions with sufficent coverage in one analysis, but not the other + } + else{ + $num_to_check++; + print TMPCOV "$chr\t$pos\n"; + } + } +} +close(TMPCOV); + +print STDERR "Checking $num_to_check read depths for missing calls in $bam_file1...\n" unless $quiet; +my $remaining_calls2_het = 0; +my $remaining_calls2_homo = 0; +my $remaining_calls2_novel = 0; +my $remaining_calls2_novel_homo = 0; +open(SAM, "samtools mpileup -l $covbed -I $bam_file1 2>/dev/null |") + or die "Cannot run samtools: $!\n"; +while(){ + my @B = split /\t/, $_; + my $depth = @B > 1 ? $B[3] : 0; + + # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases. + my $called_base = @B > 1 ? $B[2] : "N"; + if($depth and $called_base eq $calls2->{$B[0]}->{$B[1]}->[0]){ + next; + } + + if($depth >= $calls1_min_depth){ # have good coverage in sample 1, but didn't make a SNP call like in sample 2 + if($calls2->{$B[0]}->{$B[1]}->[1] =~ /het/){ + $remaining_calls2_het++; + } + else{ + $remaining_calls2_homo++; + } + if($calls2->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls2->{$B[0]}->{$B[1]}->[3] < 0.001){ + $remaining_calls2_novel++; + $remaining_calls2_novel_homo++ if $calls2->{$B[0]}->{$B[1]}->[1] !~ /het/; + } + } +} + +print OUT "Method 2 only\t",($remaining_calls2_homo+$remaining_calls2_het),"\n"; +printf OUT "Method 2 only, het %%\t%.1f\n",$remaining_calls2_het/($remaining_calls2_homo+$remaining_calls2_het)*100; +printf OUT "Method 2 only, novel %%\t%.1f\n",$remaining_calls2_novel/($remaining_calls2_homo+$remaining_calls2_het)*100; +printf OUT "Method 2 only, novel, homo %%\t%.1f\n",$remaining_calls2_novel_homo/$remaining_calls2_novel*100; +my $total_calls = $zygosities_same+$zygosities_diff+$bases_diff+$remaining_calls1_homo+$remaining_calls1_het+$remaining_calls2_homo+$remaining_calls2_het; +printf OUT "Total coding sequence genotype concordance%%\t%.1f\n", $zygosities_same/$total_calls*100; +printf OUT "Total coding sequence zygosity concordance%%\t%.1f\n", 100-$zygosities_diff/$total_calls*100; + +unlink $covbed; + +sub retrieve_calls{ + my ($hgvs_file, $reporting, $coding, $coverage_cutoff) = @_; + my %calls; + my %bad_coverage; + open(HGVS, $hgvs_file) + or die "Cannot open the first genotypes file $hgvs_file for reading: $!\n"; + ; # header + while(){ + my @F = split /\t/, $_; + my $HGVS = $F[2]; + next unless $HGVS =~ />([ACGT])/; # only look at SNPs + my $new_base = $1; + $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS + my $chr = $F[4]; + my $pos = $F[5]; + my $zygosity = $F[6]; + my $coverage = $F[9]; + my $maf = $F[14]; + if($coverage_cutoff > $coverage or $F[16] ne ""){ + $bad_coverage{$chr} = {} unless exists $bad_coverage{$chr}; + $bad_coverage{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf] unless exists $bad_coverage{$chr}->{$pos}; + next; + } + + if(exists $reporting->{$chr}){ # in a region reported in the annotation, and it's protein coding? + my $in_region = 0; + my $arrayref = $reporting->{$chr}; + for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ + my $interval = $arrayref->[$search_index]; + if($pos >= $interval->[0] and $pos <= $interval->[1]){ + $in_region = 1; last; + } + last if $pos < $interval->[0]; + } + next unless $in_region; + $in_region = 0; + $arrayref = $coding->{$chr}; + for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ + my $interval = $arrayref->[$search_index]; + if($pos >= $interval->[0] and $pos <= $interval->[1]){ + $in_region = 1; last; + } + last if $pos < $interval->[0]; + } + next unless $in_region; + } + + if(not exists $calls{$chr}){ + $calls{$chr} = {}; + } + next if exists $calls{$chr}->{$pos}; + $calls{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf]; + } + close(HGVS); + return (\%calls, \%bad_coverage); +} + + +sub find_earliest_index{ + # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start + my ($query, $array) = @_; + + return 0 if $query < $array->[0]->[0]; + + my ($left, $right, $prevCenter) = (0, $#$array, -1); + + while(1) { + my $center = int (($left + $right)/2); + + my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); + + return $center if $cmp == 0; + if ($center == $prevCenter) { + return $left; + } + $right = $center if $cmp < 0; + $left = $center if $cmp > 0; + $prevCenter = $center; + } +} +