changeset 0:2d601bd04c93 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:28:12 -0600
parents
children
files ConcordanceReport.xml kappa_report
diffstat 2 files changed, 345 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/ConcordanceReport.xml	Wed Mar 25 13:28:12 2015 -0600
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+
+<tool id="concordance_report_1" name="Compute intra-NGS concordance">
+  <description>over targeted regions in a genome</description>
+  <version_string>kappa_report -v</version_string>
+  <command interpreter="perl">kappa_report -q $target_bed $coding_gtf $input_ngs_calls1 $input_bam1 $coverage_cutoff1 $input_ngs_calls2 $input_bam2 $coverage_cutoff2 $out_summary_table</command>
+  <inputs>
+    <param format="gtf" name="coding_gtf" type="data" label="Coding regions" help="This must be the same (GTF) file used to define the coding regions for the HGVS annotation of the NGS variant calls"/>
+    <param format="bed" name="target_bed" type="data" label="Target sequencing regions" help="e.g. a BED file from the Shared Data 'Exome target regions' folder, corresponding to the capture kit used for sequencing the sample"/>
+    <param format="achri_snp_table" name="input_ngs_calls1" type="data" label="HGVS annotated NGS variant calls for genotyping method #1"/>
+    <param format="bam" name="input_bam1" type="data" label="Mapped reads file (BAM) that was used to generate the NGS variant calls for genotyping method #1"/>
+    <param type="integer" name="coverage_cutoff1" value="20" min="1" max="40" label="Report method #1 calls only for positions with read depth equal to or greater than..."/>
+    <param format="achri_snp_table" name="input_ngs_calls2" type="data" label="HGVS annotated NGS variant calls for genotyping method #2"/>
+    <param format="bam" name="input_bam2" type="data" label="Mapped reads file (BAM) that was used to generate the NGS variant calls for genotyping method #2"/>
+    <param type="integer" name="coverage_cutoff2" value="20" min="1" max="40" label="Report method #2 calls only for positions with read depth equal to or greater than..."/>
+  </inputs>
+  <outputs>
+    <data name="out_summary_table" format="tabular" label="Concordance summary (shared, unique calls, etc.)"/>
+  </outputs>
+
+  <tests>
+  </tests>
+
+  <help>
+**What it does**
+
+This tool reports several statistics comparing the genotypes derived from two next-gen sequencing runs. This is intended to measure methodological consistency,
+assuming the samples for both methods are from the same individual. When coverage is good (typically 10x for colorspace, 20x for base space), there should be a large
+(>= 95%) overlap between the calls if they are reasonably accurate.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kappa_report	Wed Mar 25 13:28:12 2015 -0600
@@ -0,0 +1,313 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+} 
+# Calculates Cohen's kappa measure for inter-annotator agreement, here for two genotype callers
+# SPECIAL ATTENTION MUST BE PAID to include or exclude calls using depth of coverage information, and only look at targeted regions (e.g. exome analysis)
+# Also, the probability of a non-call must be incorporated, as there is going to be 99.9% agreement by default due to the low rate of 
+# SNPs in the genome in human, for example.
+@ARGV == 9 or die "Usage: $0 [-q(uiet)] <exome targets.bed> <coding regions.gtf> <genotypes1.hgvs.txt> <bam1> <coverage cutoff1> <genotypes2.hgvs.txt> <bam2> <coverage cutoff2> <output report.txt>\n";
+
+my $targets_file = shift @ARGV;
+my $coding_file = shift @ARGV;
+my $calls1_file = shift @ARGV;
+my $bam_file1 = shift @ARGV;
+my $calls1_min_depth = shift @ARGV;
+my $calls2_file = shift @ARGV;
+my $bam_file2 = shift @ARGV;
+my $calls2_min_depth = shift @ARGV;
+my $output_file = shift @ARGV;
+
+my %reporting;
+print STDERR "Reading in target regions list...\n" unless $quiet;
+open(BED, $targets_file)
+  or die "Cannot open target BED file $targets_file for reading: $!\n";
+while(<BED>){
+    next if /^(\s*#|track=)/;
+    chomp;
+    my @fields = split /\t/, $_;
+
+    if(not exists $reporting{$fields[0]}){
+      $reporting{$fields[0]} = [[$fields[1], $fields[2]]];
+      next;
+    }
+    push @{$reporting{$fields[0]}}, [$fields[1], $fields[2]];
+}
+close(BED);
+for my $c (keys %reporting){
+  $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
+}
+
+my %coding;
+print STDERR "Reading in coding regions list...\n" unless $quiet;
+open(GTF, $coding_file)
+  or die "Cannot open $coding_file for reading: $!\n";
+while(<GTF>){
+    next if /^\s*#/;
+    my @fields = split /\t/, $_;
+
+    if($fields[2] eq "CDS"){
+      if(not exists $coding{$fields[0]}){
+        $coding{$fields[0]} = [[$fields[3], $fields[4]]];
+        next;
+      }
+      push @{$coding{$fields[0]}}, [$fields[3], $fields[4]];
+    }
+}
+close(GTF);
+for my $c (keys %coding){
+  $coding{$c} = [sort {$a->[0] <=> $b->[0]} @{$coding{$c}}];
+}
+
+print STDERR "Reading in genotype calls...\n" unless $quiet;
+open(OUT, ">$output_file")
+  or die "Cannot open $output_file for writing: $!\n";
+my ($calls1, $subpar_calls1) = retrieve_calls($calls1_file, \%reporting, \%coding, $calls1_min_depth);
+my ($calls2, $subpar_calls2) = retrieve_calls($calls2_file, \%reporting, \%coding, $calls2_min_depth);
+
+my $total_calls1 = 0;
+for my $chr (keys %$calls1){
+  $total_calls1 += scalar(keys %{$calls1->{$chr}});
+}
+my $total_calls2 = 0;
+for my $chr (keys %$calls2){
+  $total_calls2 += scalar(keys %{$calls2->{$chr}});
+}
+
+print OUT "Total Method 1 calls\t$total_calls1\n";
+print OUT "Total Method 2 calls\t$total_calls2\n";
+print STDERR "Comparing genotype calls...\n" unless $quiet;
+# Go through the BAM file in all targeted locations to see if there is a reference or variant call (HGVS file only shows variants)
+my $bases_diff = 0;
+my $bases_same = 0;
+my $zygosities_diff = 0;
+my $zygosities_same = 0;
+for my $chr (keys %$calls1){
+  for my $pos (keys %{$calls1->{$chr}}){
+    if(exists $calls2->{$chr}->{$pos}){
+      # same variant base
+      if($calls2->{$chr}->{$pos}->[0] eq $calls1->{$chr}->{$pos}->[0]){
+        $bases_same++;
+        if($calls1->{$chr}->{$pos}->[1] eq $calls2->{$chr}->{$pos}->[1]){
+          $zygosities_same++;
+        }
+        else{
+          $zygosities_diff++;
+        }
+      }
+      # diff variant base
+      else{
+        $bases_diff++;
+      }
+      delete $calls1->{$chr}->{$pos}; # so in the end all that's left is method-1-only calls in the hash 
+      delete $calls2->{$chr}->{$pos}; # so in the end all that's left is method-2-only calls in the hash 
+      delete $subpar_calls1->{$chr}->{$pos} if exists $subpar_calls1->{$chr}; # so in the end all that's left is method-1-only calls in the hash 
+      delete $subpar_calls2->{$chr}->{$pos} if exists $subpar_calls2->{$chr}; # so in the end all that's left is method-2-only calls in the hash 
+    }
+  }
+}
+
+print OUT "Same calls\t$zygosities_same\n";
+print OUT "Diff zygosity\t$zygosities_diff\n";
+print OUT "Diff bases\t$bases_diff\n";
+
+my $covbed = "$$.bed";
+my $num_to_check = 0;
+open(TMPCOV, ">$covbed")
+  or die "Cannot open temporary BED file $covbed for samtools: $!\n";
+for my $chr (keys %$calls1){
+  for my $pos (keys %{$calls1->{$chr}}){
+    if(exists $subpar_calls2->{$chr} and exists $subpar_calls2->{$chr}->{$pos}){
+      next; #exclude counting positions with sufficent coverage in one analysis, but not the other
+    }
+    else{
+      $num_to_check++;
+      print TMPCOV "$chr\t$pos\n";
+    }
+  }
+}
+close(TMPCOV);
+
+print STDERR "Checking $num_to_check read depths for missing calls in $bam_file2...\n" unless $quiet;
+my $remaining_calls1_homo = 0;
+my $remaining_calls1_het = 0;
+my $remaining_calls1_novel = 0;
+my $remaining_calls1_novel_homo = 0;
+open(SAM, "samtools mpileup -l $covbed -I $bam_file2 2>/dev/null |")
+  or die "Cannot run samtools: $!\n";
+while(<SAM>){
+   my @B = split /\t/, $_;
+   my $depth = @B > 1 ? $B[3] : 0;
+
+   # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases.
+   my $called_base = @B > 1 ? $B[2] : "N";
+   if($depth and $called_base eq $calls1->{$B[0]}->{$B[1]}->[0]){
+     next;
+   }
+
+   if($depth >= $calls2_min_depth){ # have good coverage in sample 2, but didn't make a SNP call like in sample 1
+     if($calls1->{$B[0]}->{$B[1]}->[1] =~ /het/){
+       $remaining_calls1_het++;
+     }
+     else{
+       $remaining_calls1_homo++;
+     }
+     if($calls1->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls1->{$B[0]}->{$B[1]}->[3] < 0.001){
+       $remaining_calls1_novel++;
+       $remaining_calls1_novel_homo++ if $calls1->{$B[0]}->{$B[1]}->[1] !~ /het/;
+     }
+     else{
+       #print STDERR "Got method 1 only non-novel call: ", join("/", @{$calls1->{$B[0]}->{$B[1]}}), "\n";
+     } 
+   }
+}
+print OUT "Method 1 only\t",($remaining_calls1_homo+$remaining_calls1_het),"\n";
+printf OUT "Method 1 only, het %%\t%.1f\n",$remaining_calls1_het/($remaining_calls1_homo+$remaining_calls1_het)*100;
+printf OUT "Method 1 only, novel %%\t%.1f\n",$remaining_calls1_novel/($remaining_calls1_homo+$remaining_calls1_het)*100;
+printf OUT "Method 1 only, novel, homo %%\t%.1f\n",$remaining_calls1_novel_homo/$remaining_calls1_novel*100;
+
+$num_to_check = 0;
+open(TMPCOV, ">$covbed")
+  or die "Cannot open temporary BED file $covbed for samtools: $!\n";
+for my $chr (keys %$calls2){
+  for my $pos (keys %{$calls2->{$chr}}){
+    if(exists $subpar_calls1->{$chr} and exists $subpar_calls1->{$chr}->{$pos}){
+      next; #exclude counting positions with sufficent coverage in one analysis, but not the other
+    }
+    else{
+      $num_to_check++;
+      print TMPCOV "$chr\t$pos\n";
+    }
+  }
+}
+close(TMPCOV);
+
+print STDERR "Checking $num_to_check read depths for missing calls in $bam_file1...\n" unless $quiet;
+my $remaining_calls2_het = 0;
+my $remaining_calls2_homo = 0;
+my $remaining_calls2_novel = 0;
+my $remaining_calls2_novel_homo = 0;
+open(SAM, "samtools mpileup -l $covbed -I $bam_file1 2>/dev/null |")
+  or die "Cannot run samtools: $!\n";
+while(<SAM>){
+   my @B = split /\t/, $_;
+   my $depth = @B > 1 ? $B[3] : 0;
+
+   # The two methods may have used different patch versions of the same genome. Check if there was a no-call due to different reference bases.
+   my $called_base = @B > 1 ? $B[2] : "N";
+   if($depth and $called_base eq $calls2->{$B[0]}->{$B[1]}->[0]){
+     next;
+   }
+
+   if($depth >= $calls1_min_depth){ # have good coverage in sample 1, but didn't make a SNP call like in sample 2
+     if($calls2->{$B[0]}->{$B[1]}->[1] =~ /het/){
+       $remaining_calls2_het++;
+     }
+     else{
+       $remaining_calls2_homo++;
+     }
+     if($calls2->{$B[0]}->{$B[1]}->[3] eq "NA" or $calls2->{$B[0]}->{$B[1]}->[3] < 0.001){
+       $remaining_calls2_novel++;
+       $remaining_calls2_novel_homo++ if $calls2->{$B[0]}->{$B[1]}->[1] !~ /het/;
+     }
+   }
+}
+
+print OUT "Method 2 only\t",($remaining_calls2_homo+$remaining_calls2_het),"\n";
+printf OUT "Method 2 only, het %%\t%.1f\n",$remaining_calls2_het/($remaining_calls2_homo+$remaining_calls2_het)*100;
+printf OUT "Method 2 only, novel %%\t%.1f\n",$remaining_calls2_novel/($remaining_calls2_homo+$remaining_calls2_het)*100;
+printf OUT "Method 2 only, novel, homo %%\t%.1f\n",$remaining_calls2_novel_homo/$remaining_calls2_novel*100;
+my $total_calls = $zygosities_same+$zygosities_diff+$bases_diff+$remaining_calls1_homo+$remaining_calls1_het+$remaining_calls2_homo+$remaining_calls2_het;
+printf OUT "Total coding sequence genotype concordance%%\t%.1f\n", $zygosities_same/$total_calls*100;
+printf OUT "Total coding sequence zygosity concordance%%\t%.1f\n", 100-$zygosities_diff/$total_calls*100;
+
+unlink $covbed;
+
+sub retrieve_calls{
+  my ($hgvs_file, $reporting, $coding, $coverage_cutoff) = @_;
+  my %calls;
+  my %bad_coverage;
+  open(HGVS, $hgvs_file)
+    or die "Cannot open the first genotypes file $hgvs_file for reading: $!\n";
+  <HGVS>; # header
+  while(<HGVS>){
+    my @F = split /\t/, $_;
+    my $HGVS = $F[2];
+    next unless $HGVS =~ />([ACGT])/; # only look at SNPs
+    my $new_base = $1;
+    $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
+    my $chr = $F[4];
+    my $pos = $F[5];
+    my $zygosity = $F[6];
+    my $coverage = $F[9];
+    my $maf = $F[14];
+    if($coverage_cutoff > $coverage or $F[16] ne ""){
+      $bad_coverage{$chr} = {} unless exists $bad_coverage{$chr};
+      $bad_coverage{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf] unless exists $bad_coverage{$chr}->{$pos};
+      next;
+    }
+
+    if(exists $reporting->{$chr}){ # in a region reported in the annotation, and it's protein coding?
+      my $in_region = 0;
+      my $arrayref = $reporting->{$chr};
+      for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
+        my $interval = $arrayref->[$search_index];
+        if($pos >= $interval->[0] and $pos <= $interval->[1]){
+          $in_region = 1; last;
+        }
+        last if $pos < $interval->[0];
+      }
+      next unless $in_region;
+      $in_region = 0;
+      $arrayref = $coding->{$chr};
+      for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
+        my $interval = $arrayref->[$search_index];
+        if($pos >= $interval->[0] and $pos <= $interval->[1]){
+          $in_region = 1; last;
+        }
+        last if $pos < $interval->[0];
+      }
+      next unless $in_region;
+    }
+  
+    if(not exists $calls{$chr}){
+      $calls{$chr} = {};
+    }
+    next if exists $calls{$chr}->{$pos};
+    $calls{$chr}->{$pos} = [$new_base, $zygosity, $coverage, $maf];
+  }
+  close(HGVS);
+  return (\%calls, \%bad_coverage);
+}
+
+
+sub find_earliest_index{
+  # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
+  my ($query, $array) = @_;
+
+  return 0 if $query < $array->[0]->[0];
+
+  my ($left, $right, $prevCenter) = (0, $#$array, -1);
+
+  while(1) {
+    my $center = int (($left + $right)/2);
+
+    my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
+
+    return $center if $cmp == 0;
+    if ($center == $prevCenter) {
+      return $left;
+    }
+    $right = $center if $cmp < 0;
+    $left  = $center if $cmp > 0;
+    $prevCenter = $center;
+  }
+}
+