diff microarray_reports @ 0:882d119ede1f default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:40:00 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/microarray_reports	Wed Mar 25 13:40:00 2015 -0600
@@ -0,0 +1,483 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Basename;
+# Assume the microarray file looks something like this...
+## comment lines...
+#Probe Set ID	11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls	dbSNP RS ID	Chromosome	Chromosomal Position
+#SNP_A-1780520	GG	rs16994928	20	48440771
+#SNP_A-1780985	AG	rs3859360	18	34178190
+#...end example input file
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+# configuration file stuff
+my $dirname = dirname(__FILE__);
+my %config;
+my $tool_data =  shift @ARGV;
+if(not -e "$tool_data/microarray_report.loc"){
+  system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc");
+}
+open CONFIG, '<', "$tool_data/microarray_report.loc";
+while(<CONFIG>){
+  (my $key, my $value) = split(/\s+/, $_);
+  $config{$key} = $value;
+}
+my $dbs_dir = $config{"dbs_directory"};
+close CONFIG;
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n";
+
+print STDERR "Reading in reference sequence...\n" unless $quiet;
+my %seq;
+open(FASTA, "$dbs_dir/$ARGV[6]" )
+  or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n";
+$/ = "\n>";
+while(<FASTA>){
+  chomp;
+  my ($name) = /^>?(\S+)/;
+  s/^[^\n]+//;
+  tr/\r\n//d;
+  $seq{$name} = $_;
+}
+close(FASTA);
+$/ = "\n";
+
+my $cov_cutoff = $ARGV[7];
+my %reporting;
+print STDERR "Reading in reporting regions list...\n" unless $quiet;
+open(GTF, $ARGV[5])
+  or die "Cannot open $ARGV[5] for reading: $!\n";
+while(<GTF>){
+    next if /^\s*#/;
+    my @fields = split /\t/, $_;
+
+    if($fields[2] eq "exon"){
+      if(not exists $reporting{$fields[0]}){
+        $reporting{$fields[0]} = [[$fields[3], $fields[4]]];
+        next;
+      }
+      push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]];
+    }
+}
+close(GTF);
+for my $c (keys %reporting){
+  $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
+}
+
+my %regions;
+if(@ARGV == 9){
+  print STDERR "Reading in target regions list...\n" unless $quiet;
+  open(BED, $ARGV[8])
+    or die "Cannot open $ARGV[8] for reading: $!\n";
+  while(<BED>){
+    chomp;
+    my @F = split /\t/, $_;
+    next unless @F > 4;
+    if(not exists $regions{$F[0]}){
+      $regions{$F[0]} = [];
+    }
+    push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order
+  }
+}
+
+open(BED, ">$ARGV[0]")
+  or die "Cannot open $ARGV[0] for writing: $!\n";
+print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n";
+
+open(MICRO, $ARGV[3])
+  or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
+my $micro_count = 1;
+do {$_ = <MICRO>} while /^#/; #header lines
+$micro_count++ while <MICRO>; # get a line count for progress meter later
+close(MICRO);
+$micro_count = int($micro_count/100);
+
+print STDERR "Reading in NGS calls...\n" unless $quiet;
+open(MICRO, $ARGV[3])
+  or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
+open(HGVS, $ARGV[2])
+  or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n";
+open(SUMMARY, ">$ARGV[1]")
+  or die "Cannot open $ARGV[1] for writing: $!\n";
+my $bam_file = $ARGV[4];
+die "Input BAM file does not exist\n" if not -e $bam_file;
+die "Input BAM file is not readable\n" if not -r $bam_file;
+die "Input BAM file is empty\n" if -z $bam_file;
+<HGVS>; # header
+my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore);
+while(<HGVS>){
+  chomp;
+  my @F = split /\t/, $_;
+  my $pos_key = "$F[4]:$F[5]";
+  # ignore non-SNPs
+  if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway
+    my $modlength = $2-$1;
+    for my $offset (0..$modlength){
+      if($F[3] eq "-"){
+        $ignore{"$F[4]:".($F[5]-$offset)} = 1;
+      }
+      else{
+        $ignore{"$F[4]:".($F[5]+$offset)} = 1;
+      }
+    }
+  }
+  elsif($F[2] =~ /(?:del|ins|inv)/){
+    next;
+  }
+  next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs
+  my $new_base = $1;
+  $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
+  my $rs_key = $F[11];   # use both position and rsID as patches to hg19 have shifted some positions
+  my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base;
+  $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call;
+  $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/;
+  $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18];
+  $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8];
+  $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9];
+  $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]];
+}
+close(HGVS);
+
+print STDERR "Comparing to microarray calls...\n" unless $quiet;
+print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
+my $num_ignored_snps = 0;
+my $num_usable_snps = 0;
+my $num_targeted_snps = 0;
+my %discordant; # key => [zygosity, method, caveats, variant depth, total depth];
+my %cov_missing; # List all sites with unknown coverage
+do {$_ = <MICRO>} while /^#/; #header lines
+while(<MICRO>){
+ if(not $quiet){
+   if($.%($micro_count*10) == 1){
+     print STDERR "|";
+   }
+   elsif($.%($micro_count*5) == 1){
+     print STDERR ":";
+   }
+   elsif($.%$micro_count == 1){
+     print STDERR ".";
+   }
+ }
+ chomp;
+ my @F = split /\t/, $_;
+ my $micro_call = $F[1];
+ next if $micro_call eq "---";
+ my $rsid = $F[2];
+ my $chr = $F[3];
+ next if $chr eq "---";
+ my $pos = $F[4];
+ if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray
+   $num_ignored_snps++;
+   next;
+ }
+ $num_usable_snps++;
+
+ if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation?
+   my $in_region = 0;
+   my $arrayref = $reporting{"chr".$chr};
+   for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
+     my $interval = $arrayref->[$search_index];
+     if($pos >= $interval->[0] and $pos <= $interval->[1]){
+       $in_region = 1; last;
+     }
+     last if $pos < $interval->[0];
+   }
+   next unless $in_region;
+ }
+
+ if(exists $regions{"chr".$chr}){ # in the exome target region?
+   my $in_region = 0;
+   my $arrayref = $regions{"chr".$chr};
+   for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
+     my $interval = $arrayref->[$search_index];
+     if($pos >= $interval->[0] and $pos <= $interval->[1]){
+       $in_region = 1; last;
+     }
+     last if $pos < $interval->[0];
+   }
+
+   next unless $in_region;
+ }
+
+ $num_targeted_snps++;
+ my $m = substr($micro_call, 0, 1);
+ my $m2 = substr($micro_call, 1, 1);
+ my $key;
+ if(exists $ngs_call{$rsid}){
+   $key = $rsid;
+ }
+ elsif(exists $ngs_call{"chr$chr:$pos"}){
+   $key = "chr$chr:$pos";
+ }
+ else{
+   if(not exists $seq{"chr$chr"}){
+     warn "Skipping microarray call, no reference sequence was provided for chr$chr\n";
+     $num_targeted_snps--;
+     next;
+   }
+   # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC
+   my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1));
+   if($m eq $m2){
+     if($micro_call eq $ref_base.$ref_base){
+       # concordant homo calls as reference
+       # undefs will be filled in later en masse in a single call to samtools for efficiency
+       $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"];
+     }
+     else{
+       # called homo var by microarray, but homo ref by NGS
+       #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base";
+       $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
+     }
+   }
+   else { #$m ne $m2 
+     if($m eq $ref_base or $m2 eq $ref_base){
+       # called het var by microarray, but homo ref by NGS
+       $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
+     }
+     else{
+       $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
+     }
+   }
+   $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2];
+   next;
+ }
+
+ next if $ngs_depth{$key} <= $cov_cutoff;
+ my $ngs_call = $ngs_call{$key};
+ # if we get to here, there are both NGS and micro calls
+ if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){
+   #print STDERR "Concordant NGS call for chr$chr:$pos\n";
+   if($m eq $m2){
+     $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
+   }
+   else{
+     $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
+   }
+   next;
+ }
+ else{
+   # discordant
+   #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n";
+     # is it just a zygosity difference?
+     my $n = substr($ngs_call, 0, 1);
+     my $n2 = substr($ngs_call, 1, 1);
+     my $discordance;
+     if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het
+       $discordance = "micro homo, ngs het";
+     }
+     elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo
+       $discordance = "micro het, ngs homo";
+     }
+     elsif($m eq $m2){
+       if($n eq $n2){
+         $discordance = "diff micro homo, ngs homo";
+       }
+       else{
+         $discordance = "diff micro homo, ngs het";
+       }
+     }
+     else{
+       if($n eq $n2){
+         $discordance = "diff micro het, ngs homo";
+       }
+       else{
+         $discordance = "diff micro het, ngs het";
+       }
+     }
+     $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
+ }
+}
+close(MICRO);
+print STDERR "\n" unless $quiet;
+
+print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet;
+my $covbed = "$$.bed";
+open(TMPCOV, ">$covbed")
+  or die "Cannot open temporary BED file for samtools: $!\n";
+my $num_covmissing = 0;
+for(sort keys %cov_missing){
+  $num_covmissing++;
+  my ($chr, $pos) = split /:/, $_;
+  print TMPCOV "$chr\t$pos\n";
+}
+close(TMPCOV);
+
+my $cov_count = int($num_covmissing/100);
+print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
+#print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n";
+#<STDIN>;
+open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |")
+  or die "Cannot run samtools: $!\n";;
+while(<SAM>){
+#while($depth_data =~ /([^\n]+)/sg){
+   if(not $quiet){
+     if($.%($cov_count*10) == 0){
+       print STDERR "|";
+     }
+     elsif($.%($cov_count*5) == 0){
+       print STDERR ":";
+     }
+     elsif($.%$cov_count == 0){
+       print STDERR ".";
+     }
+   }
+
+   chomp;
+   my @B = split /\t/, $_;
+   #my @B = split /\t/, $1;
+   my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}};
+   my $depth = @B > 1 ? $B[3] : 0;
+   #print STDERR "Depth is $depth for $B[0]:$B[1]\n";
+   my $read_calls = uc($B[4]);
+   my %base_tot;
+   for my $read_call (split //, $read_calls){
+     $base_tot{$read_call}++;
+   }
+   $base_tot{$m} = 0 if not exists $base_tot{$m};
+   $base_tot{$m2} = 0 if not exists $base_tot{$m2};
+   if($depth <= $cov_cutoff){
+     $discordant{$rsid}->[0] = "no ngs call";
+     $discordant{$rsid}->[1] = "insufficient coverage";
+     $discordant{$rsid}->[3] = $base_tot{$m};
+     $discordant{$rsid}->[4] = $depth;
+   }
+   elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){
+     # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS
+     $discordant{$rsid}->[3] = $base_tot{$m};
+     $discordant{$rsid}->[4] = $depth;
+   }
+   elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){
+     # called het var by microarray, but homo ref by NGS
+     $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2};
+     $discordant{$rsid}->[4] = $depth;
+   }
+   else{
+     print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet;
+   }
+}
+unlink $covbed;
+
+#Count false negatives
+my $false_neg_homo_count = 0;
+my $false_neg_het_count = 0;
+my $missing_count = 0;
+my $wrong_base_count = 0;
+my $wrong_base_caveat_count = 0;
+my $discordant_caveat_count = 0;
+my $tot_caveat_count = 0;
+my $false_homo_count = 0;
+my $false_het_count = 0;
+my $micro_fdr_estimate = 0;
+my %calls_by_method;
+my %concordance_by_method;
+for my $key (keys %discordant){
+  my $rec = $discordant{$key};
+  my $name = $rec->[7];
+  $tot_caveat_count++ if $rec->[2];
+  if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){
+    if($rec->[0] eq "micro homo, ngs ref"){
+      $false_neg_homo_count++;
+    }
+    else{
+      $false_neg_het_count++;
+    }
+    # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
+    next unless defined $rec and defined $rec->[3] and defined $rec->[4];
+    if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
+      $micro_fdr_estimate++;
+      $name .= "*";
+    }
+  }
+  elsif($rec->[0] eq "micro compound het, ngs ref"){
+    $false_neg_het_count+=2;
+    # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
+    next unless defined $rec and defined $rec->[3] and defined $rec->[4];
+    if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
+      $micro_fdr_estimate+=2;
+      $name .= "*";
+    }
+  }
+  elsif($rec->[0] eq "no ngs call"){
+    $missing_count++;
+  }
+  elsif($rec->[0] =~ /^diff/){
+    $wrong_base_count++;
+    if($rec->[3] =~ /\S/){
+      $wrong_base_caveat_count++;
+    }
+  }
+  elsif($rec->[0] eq "micro het, ngs homo"){
+    $false_homo_count++;
+  }
+  elsif($rec->[0] eq "micro homo, ngs het"){
+    $false_het_count++;
+  }
+  else{
+    # calls concordant
+    $concordance_by_method{$rec->[1]}++;
+    next;
+  }
+  if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){
+    $discordant_caveat_count++;
+  }
+  my $chr = $rec->[5];
+  my $pos = $rec->[6];
+  my $score = $rec->[4];
+  $score = 1000 if $score > 1000;
+  print BED "$chr\t$pos\t$pos\t$name\t$score\n";
+}
+
+print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n";
+print SUMMARY "#Columns:  Measure\tCount\tPercentage\n";
+print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n";
+print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n";
+printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions;
+printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100;
+$num_targeted_snps -= $missing_count;
+my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count;
+printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
+$tot_bad -= $discordant_caveat_count+$micro_fdr_estimate;
+printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
+printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100;
+printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100;
+printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100;
+printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100;
+printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100;
+printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100;
+printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100;
+printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100;
+for(sort keys %concordance_by_method){
+  printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100;
+}
+sub find_earliest_index{
+  # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
+  my ($query, $array) = @_;
+
+  return 0 if $query < $array->[0]->[0];
+
+  my ($left, $right, $prevCenter) = (0, $#$array, -1);
+
+  while(1) {
+    my $center = int (($left + $right)/2);
+
+    my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
+
+    return $center if $cmp == 0;
+    if ($center == $prevCenter) {
+      return $left;
+    }
+    $right = $center if $cmp < 0;
+    $left  = $center if $cmp > 0;
+    $prevCenter = $center;
+  }
+}