Mercurial > repos > yusuf > microarray_report

#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
# Assume the microarray file looks something like this...
## comment lines...
#Probe Set ID	11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls	dbSNP RS ID	Chromosome	Chromosomal Position
#SNP_A-1780520	GG	rs16994928	20	48440771
#SNP_A-1780985	AG	rs3859360	18	34178190
#...end example input file
if(@ARGV == 1 and $ARGV[0] eq "-v"){
  print "Version 1.0\n";
  exit;
}

# configuration file stuff
my $dirname = dirname(__FILE__);
my %config;
my $tool_data =  shift @ARGV;
if(not -e "$tool_data/microarray_report.loc"){
  system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc");
}
open CONFIG, '<', "$tool_data/microarray_report.loc";
while(<CONFIG>){
  (my $key, my $value) = split(/\s+/, $_);
  $config{$key} = $value;
}
my $dbs_dir = $config{"dbs_directory"};
close CONFIG;

my $quiet = 0;
if(@ARGV and $ARGV[0] =~ /^-q/){
  $quiet = 1;
  shift @ARGV;
}

@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n";

print STDERR "Reading in reference sequence...\n" unless $quiet;
my %seq;
open(FASTA, "$dbs_dir/$ARGV[6]" )
  or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n";
$/ = "\n>";
while(<FASTA>){
  chomp;
  my ($name) = /^>?(\S+)/;
  s/^[^\n]+//;
  tr/\r\n//d;
  $seq{$name} = $_;
}
close(FASTA);
$/ = "\n";

my $cov_cutoff = $ARGV[7];
my %reporting;
print STDERR "Reading in reporting regions list...\n" unless $quiet;
open(GTF, $ARGV[5])
  or die "Cannot open $ARGV[5] for reading: $!\n";
while(<GTF>){
    next if /^\s*#/;
    my @fields = split /\t/, $_;

    if($fields[2] eq "exon"){
      if(not exists $reporting{$fields[0]}){
        $reporting{$fields[0]} = [[$fields[3], $fields[4]]];
        next;
      }
      push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]];
    }
}
close(GTF);
for my $c (keys %reporting){
  $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
}

my %regions;
if(@ARGV == 9){
  print STDERR "Reading in target regions list...\n" unless $quiet;
  open(BED, $ARGV[8])
    or die "Cannot open $ARGV[8] for reading: $!\n";
  while(<BED>){
    chomp;
    my @F = split /\t/, $_;
    next unless @F > 4;
    if(not exists $regions{$F[0]}){
      $regions{$F[0]} = [];
    }
    push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order
  }
}

open(BED, ">$ARGV[0]")
  or die "Cannot open $ARGV[0] for writing: $!\n";
print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n";

open(MICRO, $ARGV[3])
  or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
my $micro_count = 1;
do {$_ = <MICRO>} while /^#/; #header lines
$micro_count++ while <MICRO>; # get a line count for progress meter later
close(MICRO);
$micro_count = int($micro_count/100);

print STDERR "Reading in NGS calls...\n" unless $quiet;
open(MICRO, $ARGV[3])
  or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
open(HGVS, $ARGV[2])
  or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n";
open(SUMMARY, ">$ARGV[1]")
  or die "Cannot open $ARGV[1] for writing: $!\n";
my $bam_file = $ARGV[4];
die "Input BAM file does not exist\n" if not -e $bam_file;
die "Input BAM file is not readable\n" if not -r $bam_file;
die "Input BAM file is empty\n" if -z $bam_file;
<HGVS>; # header
my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore);
while(<HGVS>){
  chomp;
  my @F = split /\t/, $_;
  my $pos_key = "$F[4]:$F[5]";
  # ignore non-SNPs
  if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway
    my $modlength = $2-$1;
    for my $offset (0..$modlength){
      if($F[3] eq "-"){
        $ignore{"$F[4]:".($F[5]-$offset)} = 1;
      }
      else{
        $ignore{"$F[4]:".($F[5]+$offset)} = 1;
      }
    }
  }
  elsif($F[2] =~ /(?:del|ins|inv)/){
    next;
  }
  next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs
  my $new_base = $1;
  $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
  my $rs_key = $F[11];   # use both position and rsID as patches to hg19 have shifted some positions
  my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base;
  $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call;
  $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/;
  $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18];
  $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8];
  $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9];
  $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]];
}
close(HGVS);

print STDERR "Comparing to microarray calls...\n" unless $quiet;
print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
my $num_ignored_snps = 0;
my $num_usable_snps = 0;
my $num_targeted_snps = 0;
my %discordant; # key => [zygosity, method, caveats, variant depth, total depth];
my %cov_missing; # List all sites with unknown coverage
do {$_ = <MICRO>} while /^#/; #header lines
while(<MICRO>){
 if(not $quiet){
   if($.%($micro_count*10) == 1){
     print STDERR "|";
   }
   elsif($.%($micro_count*5) == 1){
     print STDERR ":";
   }
   elsif($.%$micro_count == 1){
     print STDERR ".";
   }
 }
 chomp;
 my @F = split /\t/, $_;
 my $micro_call = $F[1];
 next if $micro_call eq "---";
 my $rsid = $F[2];
 my $chr = $F[3];
 next if $chr eq "---";
 my $pos = $F[4];
 if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray
   $num_ignored_snps++;
   next;
 }
 $num_usable_snps++;

 if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation?
   my $in_region = 0;
   my $arrayref = $reporting{"chr".$chr};
   for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
     my $interval = $arrayref->[$search_index];
     if($pos >= $interval->[0] and $pos <= $interval->[1]){
       $in_region = 1; last;
     }
     last if $pos < $interval->[0];
   }
   next unless $in_region;
 }

 if(exists $regions{"chr".$chr}){ # in the exome target region?
   my $in_region = 0;
   my $arrayref = $regions{"chr".$chr};
   for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
     my $interval = $arrayref->[$search_index];
     if($pos >= $interval->[0] and $pos <= $interval->[1]){
       $in_region = 1; last;
     }
     last if $pos < $interval->[0];
   }

   next unless $in_region;
 }

 $num_targeted_snps++;
 my $m = substr($micro_call, 0, 1);
 my $m2 = substr($micro_call, 1, 1);
 my $key;
 if(exists $ngs_call{$rsid}){
   $key = $rsid;
 }
 elsif(exists $ngs_call{"chr$chr:$pos"}){
   $key = "chr$chr:$pos";
 }
 else{
   if(not exists $seq{"chr$chr"}){
     warn "Skipping microarray call, no reference sequence was provided for chr$chr\n";
     $num_targeted_snps--;
     next;
   }
   # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC
   my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1));
   if($m eq $m2){
     if($micro_call eq $ref_base.$ref_base){
       # concordant homo calls as reference
       # undefs will be filled in later en masse in a single call to samtools for efficiency
       $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"];
     }
     else{
       # called homo var by microarray, but homo ref by NGS
       #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base";
       $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
     }
   }
   else { #$m ne $m2
     if($m eq $ref_base or $m2 eq $ref_base){
       # called het var by microarray, but homo ref by NGS
       $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
     }
     else{
       $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
     }
   }
   $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2];
   next;
 }

 next if $ngs_depth{$key} <= $cov_cutoff;
 my $ngs_call = $ngs_call{$key};
 # if we get to here, there are both NGS and micro calls
 if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){
   #print STDERR "Concordant NGS call for chr$chr:$pos\n";
   if($m eq $m2){
     $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
   }
   else{
     $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
   }
   next;
 }
 else{
   # discordant
   #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n";
     # is it just a zygosity difference?
     my $n = substr($ngs_call, 0, 1);
     my $n2 = substr($ngs_call, 1, 1);
     my $discordance;
     if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het
       $discordance = "micro homo, ngs het";
     }
     elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo
       $discordance = "micro het, ngs homo";
     }
     elsif($m eq $m2){
       if($n eq $n2){
         $discordance = "diff micro homo, ngs homo";
       }
       else{
         $discordance = "diff micro homo, ngs het";
       }
     }
     else{
       if($n eq $n2){
         $discordance = "diff micro het, ngs homo";
       }
       else{
         $discordance = "diff micro het, ngs het";
       }
     }
     $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
 }
}
close(MICRO);
print STDERR "\n" unless $quiet;

print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet;
my $covbed = "$$.bed";
open(TMPCOV, ">$covbed")
  or die "Cannot open temporary BED file for samtools: $!\n";
my $num_covmissing = 0;
for(sort keys %cov_missing){
  $num_covmissing++;
  my ($chr, $pos) = split /:/, $_;
  print TMPCOV "$chr\t$pos\n";
}
close(TMPCOV);

my $cov_count = int($num_covmissing/100);
print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
#print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n";
#<STDIN>;
open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |")
  or die "Cannot run samtools: $!\n";;
while(<SAM>){
#while($depth_data =~ /([^\n]+)/sg){
   if(not $quiet){
     if($.%($cov_count*10) == 0){
       print STDERR "|";
     }
     elsif($.%($cov_count*5) == 0){
       print STDERR ":";
     }
     elsif($.%$cov_count == 0){
       print STDERR ".";
     }
   }

   chomp;
   my @B = split /\t/, $_;
   #my @B = split /\t/, $1;
   my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}};
   my $depth = @B > 1 ? $B[3] : 0;
   #print STDERR "Depth is $depth for $B[0]:$B[1]\n";
   my $read_calls = uc($B[4]);
   my %base_tot;
   for my $read_call (split //, $read_calls){
     $base_tot{$read_call}++;
   }
   $base_tot{$m} = 0 if not exists $base_tot{$m};
   $base_tot{$m2} = 0 if not exists $base_tot{$m2};
   if($depth <= $cov_cutoff){
     $discordant{$rsid}->[0] = "no ngs call";
     $discordant{$rsid}->[1] = "insufficient coverage";
     $discordant{$rsid}->[3] = $base_tot{$m};
     $discordant{$rsid}->[4] = $depth;
   }
   elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){
     # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS
     $discordant{$rsid}->[3] = $base_tot{$m};
     $discordant{$rsid}->[4] = $depth;
   }
   elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){
     # called het var by microarray, but homo ref by NGS
     $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2};
     $discordant{$rsid}->[4] = $depth;
   }
   else{
     print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet;
   }
}
unlink $covbed;

#Count false negatives
my $false_neg_homo_count = 0;
my $false_neg_het_count = 0;
my $missing_count = 0;
my $wrong_base_count = 0;
my $wrong_base_caveat_count = 0;
my $discordant_caveat_count = 0;
my $tot_caveat_count = 0;
my $false_homo_count = 0;
my $false_het_count = 0;
my $micro_fdr_estimate = 0;
my %calls_by_method;
my %concordance_by_method;
for my $key (keys %discordant){
  my $rec = $discordant{$key};
  my $name = $rec->[7];
  $tot_caveat_count++ if $rec->[2];
  if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){
    if($rec->[0] eq "micro homo, ngs ref"){
      $false_neg_homo_count++;
    }
    else{
      $false_neg_het_count++;
    }
    # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
    next unless defined $rec and defined $rec->[3] and defined $rec->[4];
    if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
      $micro_fdr_estimate++;
      $name .= "*";
    }
  }
  elsif($rec->[0] eq "micro compound het, ngs ref"){
    $false_neg_het_count+=2;
    # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
    next unless defined $rec and defined $rec->[3] and defined $rec->[4];
    if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
      $micro_fdr_estimate+=2;
      $name .= "*";
    }
  }
  elsif($rec->[0] eq "no ngs call"){
    $missing_count++;
  }
  elsif($rec->[0] =~ /^diff/){
    $wrong_base_count++;
    if($rec->[3] =~ /\S/){
      $wrong_base_caveat_count++;
    }
  }
  elsif($rec->[0] eq "micro het, ngs homo"){
    $false_homo_count++;
  }
  elsif($rec->[0] eq "micro homo, ngs het"){
    $false_het_count++;
  }
  else{
    # calls concordant
    $concordance_by_method{$rec->[1]}++;
    next;
  }
  if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){
    $discordant_caveat_count++;
  }
  my $chr = $rec->[5];
  my $pos = $rec->[6];
  my $score = $rec->[4];
  $score = 1000 if $score > 1000;
  print BED "$chr\t$pos\t$pos\t$name\t$score\n";
}

print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n";
print SUMMARY "#Columns:  Measure\tCount\tPercentage\n";
print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n";
print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n";
printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions;
printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100;
$num_targeted_snps -= $missing_count;
my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count;
printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
$tot_bad -= $discordant_caveat_count+$micro_fdr_estimate;
printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100;
printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100;
printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100;
printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100;
printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100;
printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100;
printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100;
printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100;
for(sort keys %concordance_by_method){
  printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100;
}
sub find_earliest_index{
  # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
  my ($query, $array) = @_;

  return 0 if $query < $array->[0]->[0];

  my ($left, $right, $prevCenter) = (0, $#$array, -1);

  while(1) {
    my $center = int (($left + $right)/2);

    my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);

    return $center if $cmp == 0;
    if ($center == $prevCenter) {
      return $left;
    }
    $right = $center if $cmp < 0;
    $left  = $center if $cmp > 0;
    $prevCenter = $center;
  }
}
author	Yusuf Ali <ali@yusuf.email>
date	Wed, 25 Mar 2015 13:40:00 -0600
parents
children