| 
0
 | 
     1 #!/usr/bin/env perl
 | 
| 
 | 
     2 
 | 
| 
 | 
     3 use strict;
 | 
| 
 | 
     4 use warnings;
 | 
| 
 | 
     5 use File::Basename;
 | 
| 
 | 
     6 # Assume the microarray file looks something like this...
 | 
| 
 | 
     7 ## comment lines...
 | 
| 
 | 
     8 #Probe Set ID	11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls	dbSNP RS ID	Chromosome	Chromosomal Position
 | 
| 
 | 
     9 #SNP_A-1780520	GG	rs16994928	20	48440771
 | 
| 
 | 
    10 #SNP_A-1780985	AG	rs3859360	18	34178190
 | 
| 
 | 
    11 #...end example input file
 | 
| 
 | 
    12 if(@ARGV == 1 and $ARGV[0] eq "-v"){
 | 
| 
 | 
    13   print "Version 1.0\n";
 | 
| 
 | 
    14   exit;
 | 
| 
 | 
    15 }
 | 
| 
 | 
    16 
 | 
| 
 | 
    17 # configuration file stuff
 | 
| 
 | 
    18 my $dirname = dirname(__FILE__);
 | 
| 
 | 
    19 my %config;
 | 
| 
 | 
    20 my $tool_data =  shift @ARGV;
 | 
| 
 | 
    21 if(not -e "$tool_data/microarray_report.loc"){
 | 
| 
 | 
    22   system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc");
 | 
| 
 | 
    23 }
 | 
| 
 | 
    24 open CONFIG, '<', "$tool_data/microarray_report.loc";
 | 
| 
 | 
    25 while(<CONFIG>){
 | 
| 
 | 
    26   (my $key, my $value) = split(/\s+/, $_);
 | 
| 
 | 
    27   $config{$key} = $value;
 | 
| 
 | 
    28 }
 | 
| 
 | 
    29 my $dbs_dir = $config{"dbs_directory"};
 | 
| 
 | 
    30 close CONFIG;
 | 
| 
 | 
    31 
 | 
| 
 | 
    32 my $quiet = 0;
 | 
| 
 | 
    33 if(@ARGV and $ARGV[0] =~ /^-q/){
 | 
| 
 | 
    34   $quiet = 1;
 | 
| 
 | 
    35   shift @ARGV;
 | 
| 
 | 
    36 }
 | 
| 
 | 
    37 
 | 
| 
 | 
    38 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n";
 | 
| 
 | 
    39 
 | 
| 
 | 
    40 print STDERR "Reading in reference sequence...\n" unless $quiet;
 | 
| 
 | 
    41 my %seq;
 | 
| 
 | 
    42 open(FASTA, "$dbs_dir/$ARGV[6]" )
 | 
| 
 | 
    43   or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n";
 | 
| 
 | 
    44 $/ = "\n>";
 | 
| 
 | 
    45 while(<FASTA>){
 | 
| 
 | 
    46   chomp;
 | 
| 
 | 
    47   my ($name) = /^>?(\S+)/;
 | 
| 
 | 
    48   s/^[^\n]+//;
 | 
| 
 | 
    49   tr/\r\n//d;
 | 
| 
 | 
    50   $seq{$name} = $_;
 | 
| 
 | 
    51 }
 | 
| 
 | 
    52 close(FASTA);
 | 
| 
 | 
    53 $/ = "\n";
 | 
| 
 | 
    54 
 | 
| 
 | 
    55 my $cov_cutoff = $ARGV[7];
 | 
| 
 | 
    56 my %reporting;
 | 
| 
 | 
    57 print STDERR "Reading in reporting regions list...\n" unless $quiet;
 | 
| 
 | 
    58 open(GTF, $ARGV[5])
 | 
| 
 | 
    59   or die "Cannot open $ARGV[5] for reading: $!\n";
 | 
| 
 | 
    60 while(<GTF>){
 | 
| 
 | 
    61     next if /^\s*#/;
 | 
| 
 | 
    62     my @fields = split /\t/, $_;
 | 
| 
 | 
    63 
 | 
| 
 | 
    64     if($fields[2] eq "exon"){
 | 
| 
 | 
    65       if(not exists $reporting{$fields[0]}){
 | 
| 
 | 
    66         $reporting{$fields[0]} = [[$fields[3], $fields[4]]];
 | 
| 
 | 
    67         next;
 | 
| 
 | 
    68       }
 | 
| 
 | 
    69       push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]];
 | 
| 
 | 
    70     }
 | 
| 
 | 
    71 }
 | 
| 
 | 
    72 close(GTF);
 | 
| 
 | 
    73 for my $c (keys %reporting){
 | 
| 
 | 
    74   $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
 | 
| 
 | 
    75 }
 | 
| 
 | 
    76 
 | 
| 
 | 
    77 my %regions;
 | 
| 
 | 
    78 if(@ARGV == 9){
 | 
| 
 | 
    79   print STDERR "Reading in target regions list...\n" unless $quiet;
 | 
| 
 | 
    80   open(BED, $ARGV[8])
 | 
| 
 | 
    81     or die "Cannot open $ARGV[8] for reading: $!\n";
 | 
| 
 | 
    82   while(<BED>){
 | 
| 
 | 
    83     chomp;
 | 
| 
 | 
    84     my @F = split /\t/, $_;
 | 
| 
 | 
    85     next unless @F > 4;
 | 
| 
 | 
    86     if(not exists $regions{$F[0]}){
 | 
| 
 | 
    87       $regions{$F[0]} = [];
 | 
| 
 | 
    88     }
 | 
| 
 | 
    89     push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order
 | 
| 
 | 
    90   }
 | 
| 
 | 
    91 }
 | 
| 
 | 
    92 
 | 
| 
 | 
    93 open(BED, ">$ARGV[0]")
 | 
| 
 | 
    94   or die "Cannot open $ARGV[0] for writing: $!\n";
 | 
| 
 | 
    95 print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n";
 | 
| 
 | 
    96 
 | 
| 
 | 
    97 open(MICRO, $ARGV[3])
 | 
| 
 | 
    98   or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
 | 
| 
 | 
    99 my $micro_count = 1;
 | 
| 
 | 
   100 do {$_ = <MICRO>} while /^#/; #header lines
 | 
| 
 | 
   101 $micro_count++ while <MICRO>; # get a line count for progress meter later
 | 
| 
 | 
   102 close(MICRO);
 | 
| 
 | 
   103 $micro_count = int($micro_count/100);
 | 
| 
 | 
   104 
 | 
| 
 | 
   105 print STDERR "Reading in NGS calls...\n" unless $quiet;
 | 
| 
 | 
   106 open(MICRO, $ARGV[3])
 | 
| 
 | 
   107   or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
 | 
| 
 | 
   108 open(HGVS, $ARGV[2])
 | 
| 
 | 
   109   or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n";
 | 
| 
 | 
   110 open(SUMMARY, ">$ARGV[1]")
 | 
| 
 | 
   111   or die "Cannot open $ARGV[1] for writing: $!\n";
 | 
| 
 | 
   112 my $bam_file = $ARGV[4];
 | 
| 
 | 
   113 die "Input BAM file does not exist\n" if not -e $bam_file;
 | 
| 
 | 
   114 die "Input BAM file is not readable\n" if not -r $bam_file;
 | 
| 
 | 
   115 die "Input BAM file is empty\n" if -z $bam_file;
 | 
| 
 | 
   116 <HGVS>; # header
 | 
| 
 | 
   117 my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore);
 | 
| 
 | 
   118 while(<HGVS>){
 | 
| 
 | 
   119   chomp;
 | 
| 
 | 
   120   my @F = split /\t/, $_;
 | 
| 
 | 
   121   my $pos_key = "$F[4]:$F[5]";
 | 
| 
 | 
   122   # ignore non-SNPs
 | 
| 
 | 
   123   if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway
 | 
| 
 | 
   124     my $modlength = $2-$1;
 | 
| 
 | 
   125     for my $offset (0..$modlength){
 | 
| 
 | 
   126       if($F[3] eq "-"){
 | 
| 
 | 
   127         $ignore{"$F[4]:".($F[5]-$offset)} = 1;
 | 
| 
 | 
   128       }
 | 
| 
 | 
   129       else{
 | 
| 
 | 
   130         $ignore{"$F[4]:".($F[5]+$offset)} = 1;
 | 
| 
 | 
   131       }
 | 
| 
 | 
   132     }
 | 
| 
 | 
   133   }
 | 
| 
 | 
   134   elsif($F[2] =~ /(?:del|ins|inv)/){
 | 
| 
 | 
   135     next;
 | 
| 
 | 
   136   }
 | 
| 
 | 
   137   next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs
 | 
| 
 | 
   138   my $new_base = $1;
 | 
| 
 | 
   139   $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
 | 
| 
 | 
   140   my $rs_key = $F[11];   # use both position and rsID as patches to hg19 have shifted some positions
 | 
| 
 | 
   141   my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base;
 | 
| 
 | 
   142   $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call;
 | 
| 
 | 
   143   $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/;
 | 
| 
 | 
   144   $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18];
 | 
| 
 | 
   145   $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8];
 | 
| 
 | 
   146   $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9];
 | 
| 
 | 
   147   $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]];
 | 
| 
 | 
   148 }
 | 
| 
 | 
   149 close(HGVS);
 | 
| 
 | 
   150 
 | 
| 
 | 
   151 print STDERR "Comparing to microarray calls...\n" unless $quiet;
 | 
| 
 | 
   152 print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
 | 
| 
 | 
   153 my $num_ignored_snps = 0;
 | 
| 
 | 
   154 my $num_usable_snps = 0;
 | 
| 
 | 
   155 my $num_targeted_snps = 0;
 | 
| 
 | 
   156 my %discordant; # key => [zygosity, method, caveats, variant depth, total depth];
 | 
| 
 | 
   157 my %cov_missing; # List all sites with unknown coverage
 | 
| 
 | 
   158 do {$_ = <MICRO>} while /^#/; #header lines
 | 
| 
 | 
   159 while(<MICRO>){
 | 
| 
 | 
   160  if(not $quiet){
 | 
| 
 | 
   161    if($.%($micro_count*10) == 1){
 | 
| 
 | 
   162      print STDERR "|";
 | 
| 
 | 
   163    }
 | 
| 
 | 
   164    elsif($.%($micro_count*5) == 1){
 | 
| 
 | 
   165      print STDERR ":";
 | 
| 
 | 
   166    }
 | 
| 
 | 
   167    elsif($.%$micro_count == 1){
 | 
| 
 | 
   168      print STDERR ".";
 | 
| 
 | 
   169    }
 | 
| 
 | 
   170  }
 | 
| 
 | 
   171  chomp;
 | 
| 
 | 
   172  my @F = split /\t/, $_;
 | 
| 
 | 
   173  my $micro_call = $F[1];
 | 
| 
 | 
   174  next if $micro_call eq "---";
 | 
| 
 | 
   175  my $rsid = $F[2];
 | 
| 
 | 
   176  my $chr = $F[3];
 | 
| 
 | 
   177  next if $chr eq "---";
 | 
| 
 | 
   178  my $pos = $F[4];
 | 
| 
 | 
   179  if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray
 | 
| 
 | 
   180    $num_ignored_snps++;
 | 
| 
 | 
   181    next;
 | 
| 
 | 
   182  }
 | 
| 
 | 
   183  $num_usable_snps++;
 | 
| 
 | 
   184 
 | 
| 
 | 
   185  if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation?
 | 
| 
 | 
   186    my $in_region = 0;
 | 
| 
 | 
   187    my $arrayref = $reporting{"chr".$chr};
 | 
| 
 | 
   188    for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
 | 
| 
 | 
   189      my $interval = $arrayref->[$search_index];
 | 
| 
 | 
   190      if($pos >= $interval->[0] and $pos <= $interval->[1]){
 | 
| 
 | 
   191        $in_region = 1; last;
 | 
| 
 | 
   192      }
 | 
| 
 | 
   193      last if $pos < $interval->[0];
 | 
| 
 | 
   194    }
 | 
| 
 | 
   195    next unless $in_region;
 | 
| 
 | 
   196  }
 | 
| 
 | 
   197 
 | 
| 
 | 
   198  if(exists $regions{"chr".$chr}){ # in the exome target region?
 | 
| 
 | 
   199    my $in_region = 0;
 | 
| 
 | 
   200    my $arrayref = $regions{"chr".$chr};
 | 
| 
 | 
   201    for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
 | 
| 
 | 
   202      my $interval = $arrayref->[$search_index];
 | 
| 
 | 
   203      if($pos >= $interval->[0] and $pos <= $interval->[1]){
 | 
| 
 | 
   204        $in_region = 1; last;
 | 
| 
 | 
   205      }
 | 
| 
 | 
   206      last if $pos < $interval->[0];
 | 
| 
 | 
   207    }
 | 
| 
 | 
   208 
 | 
| 
 | 
   209    next unless $in_region;
 | 
| 
 | 
   210  }
 | 
| 
 | 
   211 
 | 
| 
 | 
   212  $num_targeted_snps++;
 | 
| 
 | 
   213  my $m = substr($micro_call, 0, 1);
 | 
| 
 | 
   214  my $m2 = substr($micro_call, 1, 1);
 | 
| 
 | 
   215  my $key;
 | 
| 
 | 
   216  if(exists $ngs_call{$rsid}){
 | 
| 
 | 
   217    $key = $rsid;
 | 
| 
 | 
   218  }
 | 
| 
 | 
   219  elsif(exists $ngs_call{"chr$chr:$pos"}){
 | 
| 
 | 
   220    $key = "chr$chr:$pos";
 | 
| 
 | 
   221  }
 | 
| 
 | 
   222  else{
 | 
| 
 | 
   223    if(not exists $seq{"chr$chr"}){
 | 
| 
 | 
   224      warn "Skipping microarray call, no reference sequence was provided for chr$chr\n";
 | 
| 
 | 
   225      $num_targeted_snps--;
 | 
| 
 | 
   226      next;
 | 
| 
 | 
   227    }
 | 
| 
 | 
   228    # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC
 | 
| 
 | 
   229    my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1));
 | 
| 
 | 
   230    if($m eq $m2){
 | 
| 
 | 
   231      if($micro_call eq $ref_base.$ref_base){
 | 
| 
 | 
   232        # concordant homo calls as reference
 | 
| 
 | 
   233        # undefs will be filled in later en masse in a single call to samtools for efficiency
 | 
| 
 | 
   234        $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"];
 | 
| 
 | 
   235      }
 | 
| 
 | 
   236      else{
 | 
| 
 | 
   237        # called homo var by microarray, but homo ref by NGS
 | 
| 
 | 
   238        #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base";
 | 
| 
 | 
   239        $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
 | 
| 
 | 
   240      }
 | 
| 
 | 
   241    }
 | 
| 
 | 
   242    else { #$m ne $m2 
 | 
| 
 | 
   243      if($m eq $ref_base or $m2 eq $ref_base){
 | 
| 
 | 
   244        # called het var by microarray, but homo ref by NGS
 | 
| 
 | 
   245        $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
 | 
| 
 | 
   246      }
 | 
| 
 | 
   247      else{
 | 
| 
 | 
   248        $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
 | 
| 
 | 
   249      }
 | 
| 
 | 
   250    }
 | 
| 
 | 
   251    $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2];
 | 
| 
 | 
   252    next;
 | 
| 
 | 
   253  }
 | 
| 
 | 
   254 
 | 
| 
 | 
   255  next if $ngs_depth{$key} <= $cov_cutoff;
 | 
| 
 | 
   256  my $ngs_call = $ngs_call{$key};
 | 
| 
 | 
   257  # if we get to here, there are both NGS and micro calls
 | 
| 
 | 
   258  if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){
 | 
| 
 | 
   259    #print STDERR "Concordant NGS call for chr$chr:$pos\n";
 | 
| 
 | 
   260    if($m eq $m2){
 | 
| 
 | 
   261      $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
 | 
| 
 | 
   262    }
 | 
| 
 | 
   263    else{
 | 
| 
 | 
   264      $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
 | 
| 
 | 
   265    }
 | 
| 
 | 
   266    next;
 | 
| 
 | 
   267  }
 | 
| 
 | 
   268  else{
 | 
| 
 | 
   269    # discordant
 | 
| 
 | 
   270    #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n";
 | 
| 
 | 
   271      # is it just a zygosity difference?
 | 
| 
 | 
   272      my $n = substr($ngs_call, 0, 1);
 | 
| 
 | 
   273      my $n2 = substr($ngs_call, 1, 1);
 | 
| 
 | 
   274      my $discordance;
 | 
| 
 | 
   275      if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het
 | 
| 
 | 
   276        $discordance = "micro homo, ngs het";
 | 
| 
 | 
   277      }
 | 
| 
 | 
   278      elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo
 | 
| 
 | 
   279        $discordance = "micro het, ngs homo";
 | 
| 
 | 
   280      }
 | 
| 
 | 
   281      elsif($m eq $m2){
 | 
| 
 | 
   282        if($n eq $n2){
 | 
| 
 | 
   283          $discordance = "diff micro homo, ngs homo";
 | 
| 
 | 
   284        }
 | 
| 
 | 
   285        else{
 | 
| 
 | 
   286          $discordance = "diff micro homo, ngs het";
 | 
| 
 | 
   287        }
 | 
| 
 | 
   288      }
 | 
| 
 | 
   289      else{
 | 
| 
 | 
   290        if($n eq $n2){
 | 
| 
 | 
   291          $discordance = "diff micro het, ngs homo";
 | 
| 
 | 
   292        }
 | 
| 
 | 
   293        else{
 | 
| 
 | 
   294          $discordance = "diff micro het, ngs het";
 | 
| 
 | 
   295        }
 | 
| 
 | 
   296      }
 | 
| 
 | 
   297      $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
 | 
| 
 | 
   298  }
 | 
| 
 | 
   299 }
 | 
| 
 | 
   300 close(MICRO);
 | 
| 
 | 
   301 print STDERR "\n" unless $quiet;
 | 
| 
 | 
   302 
 | 
| 
 | 
   303 print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet;
 | 
| 
 | 
   304 my $covbed = "$$.bed";
 | 
| 
 | 
   305 open(TMPCOV, ">$covbed")
 | 
| 
 | 
   306   or die "Cannot open temporary BED file for samtools: $!\n";
 | 
| 
 | 
   307 my $num_covmissing = 0;
 | 
| 
 | 
   308 for(sort keys %cov_missing){
 | 
| 
 | 
   309   $num_covmissing++;
 | 
| 
 | 
   310   my ($chr, $pos) = split /:/, $_;
 | 
| 
 | 
   311   print TMPCOV "$chr\t$pos\n";
 | 
| 
 | 
   312 }
 | 
| 
 | 
   313 close(TMPCOV);
 | 
| 
 | 
   314 
 | 
| 
 | 
   315 my $cov_count = int($num_covmissing/100);
 | 
| 
 | 
   316 print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
 | 
| 
 | 
   317 #print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n";
 | 
| 
 | 
   318 #<STDIN>;
 | 
| 
 | 
   319 open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |")
 | 
| 
 | 
   320   or die "Cannot run samtools: $!\n";;
 | 
| 
 | 
   321 while(<SAM>){
 | 
| 
 | 
   322 #while($depth_data =~ /([^\n]+)/sg){
 | 
| 
 | 
   323    if(not $quiet){
 | 
| 
 | 
   324      if($.%($cov_count*10) == 0){
 | 
| 
 | 
   325        print STDERR "|";
 | 
| 
 | 
   326      }
 | 
| 
 | 
   327      elsif($.%($cov_count*5) == 0){
 | 
| 
 | 
   328        print STDERR ":";
 | 
| 
 | 
   329      }
 | 
| 
 | 
   330      elsif($.%$cov_count == 0){
 | 
| 
 | 
   331        print STDERR ".";
 | 
| 
 | 
   332      }
 | 
| 
 | 
   333    }
 | 
| 
 | 
   334 
 | 
| 
 | 
   335    chomp;
 | 
| 
 | 
   336    my @B = split /\t/, $_;
 | 
| 
 | 
   337    #my @B = split /\t/, $1;
 | 
| 
 | 
   338    my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}};
 | 
| 
 | 
   339    my $depth = @B > 1 ? $B[3] : 0;
 | 
| 
 | 
   340    #print STDERR "Depth is $depth for $B[0]:$B[1]\n";
 | 
| 
 | 
   341    my $read_calls = uc($B[4]);
 | 
| 
 | 
   342    my %base_tot;
 | 
| 
 | 
   343    for my $read_call (split //, $read_calls){
 | 
| 
 | 
   344      $base_tot{$read_call}++;
 | 
| 
 | 
   345    }
 | 
| 
 | 
   346    $base_tot{$m} = 0 if not exists $base_tot{$m};
 | 
| 
 | 
   347    $base_tot{$m2} = 0 if not exists $base_tot{$m2};
 | 
| 
 | 
   348    if($depth <= $cov_cutoff){
 | 
| 
 | 
   349      $discordant{$rsid}->[0] = "no ngs call";
 | 
| 
 | 
   350      $discordant{$rsid}->[1] = "insufficient coverage";
 | 
| 
 | 
   351      $discordant{$rsid}->[3] = $base_tot{$m};
 | 
| 
 | 
   352      $discordant{$rsid}->[4] = $depth;
 | 
| 
 | 
   353    }
 | 
| 
 | 
   354    elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){
 | 
| 
 | 
   355      # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS
 | 
| 
 | 
   356      $discordant{$rsid}->[3] = $base_tot{$m};
 | 
| 
 | 
   357      $discordant{$rsid}->[4] = $depth;
 | 
| 
 | 
   358    }
 | 
| 
 | 
   359    elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){
 | 
| 
 | 
   360      # called het var by microarray, but homo ref by NGS
 | 
| 
 | 
   361      $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2};
 | 
| 
 | 
   362      $discordant{$rsid}->[4] = $depth;
 | 
| 
 | 
   363    }
 | 
| 
 | 
   364    else{
 | 
| 
 | 
   365      print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet;
 | 
| 
 | 
   366    }
 | 
| 
 | 
   367 }
 | 
| 
 | 
   368 unlink $covbed;
 | 
| 
 | 
   369 
 | 
| 
 | 
   370 #Count false negatives
 | 
| 
 | 
   371 my $false_neg_homo_count = 0;
 | 
| 
 | 
   372 my $false_neg_het_count = 0;
 | 
| 
 | 
   373 my $missing_count = 0;
 | 
| 
 | 
   374 my $wrong_base_count = 0;
 | 
| 
 | 
   375 my $wrong_base_caveat_count = 0;
 | 
| 
 | 
   376 my $discordant_caveat_count = 0;
 | 
| 
 | 
   377 my $tot_caveat_count = 0;
 | 
| 
 | 
   378 my $false_homo_count = 0;
 | 
| 
 | 
   379 my $false_het_count = 0;
 | 
| 
 | 
   380 my $micro_fdr_estimate = 0;
 | 
| 
 | 
   381 my %calls_by_method;
 | 
| 
 | 
   382 my %concordance_by_method;
 | 
| 
 | 
   383 for my $key (keys %discordant){
 | 
| 
 | 
   384   my $rec = $discordant{$key};
 | 
| 
 | 
   385   my $name = $rec->[7];
 | 
| 
 | 
   386   $tot_caveat_count++ if $rec->[2];
 | 
| 
 | 
   387   if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){
 | 
| 
 | 
   388     if($rec->[0] eq "micro homo, ngs ref"){
 | 
| 
 | 
   389       $false_neg_homo_count++;
 | 
| 
 | 
   390     }
 | 
| 
 | 
   391     else{
 | 
| 
 | 
   392       $false_neg_het_count++;
 | 
| 
 | 
   393     }
 | 
| 
 | 
   394     # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
 | 
| 
 | 
   395     next unless defined $rec and defined $rec->[3] and defined $rec->[4];
 | 
| 
 | 
   396     if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
 | 
| 
 | 
   397       $micro_fdr_estimate++;
 | 
| 
 | 
   398       $name .= "*";
 | 
| 
 | 
   399     }
 | 
| 
 | 
   400   }
 | 
| 
 | 
   401   elsif($rec->[0] eq "micro compound het, ngs ref"){
 | 
| 
 | 
   402     $false_neg_het_count+=2;
 | 
| 
 | 
   403     # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
 | 
| 
 | 
   404     next unless defined $rec and defined $rec->[3] and defined $rec->[4];
 | 
| 
 | 
   405     if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
 | 
| 
 | 
   406       $micro_fdr_estimate+=2;
 | 
| 
 | 
   407       $name .= "*";
 | 
| 
 | 
   408     }
 | 
| 
 | 
   409   }
 | 
| 
 | 
   410   elsif($rec->[0] eq "no ngs call"){
 | 
| 
 | 
   411     $missing_count++;
 | 
| 
 | 
   412   }
 | 
| 
 | 
   413   elsif($rec->[0] =~ /^diff/){
 | 
| 
 | 
   414     $wrong_base_count++;
 | 
| 
 | 
   415     if($rec->[3] =~ /\S/){
 | 
| 
 | 
   416       $wrong_base_caveat_count++;
 | 
| 
 | 
   417     }
 | 
| 
 | 
   418   }
 | 
| 
 | 
   419   elsif($rec->[0] eq "micro het, ngs homo"){
 | 
| 
 | 
   420     $false_homo_count++;
 | 
| 
 | 
   421   }
 | 
| 
 | 
   422   elsif($rec->[0] eq "micro homo, ngs het"){
 | 
| 
 | 
   423     $false_het_count++;
 | 
| 
 | 
   424   }
 | 
| 
 | 
   425   else{
 | 
| 
 | 
   426     # calls concordant
 | 
| 
 | 
   427     $concordance_by_method{$rec->[1]}++;
 | 
| 
 | 
   428     next;
 | 
| 
 | 
   429   }
 | 
| 
 | 
   430   if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){
 | 
| 
 | 
   431     $discordant_caveat_count++;
 | 
| 
 | 
   432   }
 | 
| 
 | 
   433   my $chr = $rec->[5];
 | 
| 
 | 
   434   my $pos = $rec->[6];
 | 
| 
 | 
   435   my $score = $rec->[4];
 | 
| 
 | 
   436   $score = 1000 if $score > 1000;
 | 
| 
 | 
   437   print BED "$chr\t$pos\t$pos\t$name\t$score\n";
 | 
| 
 | 
   438 }
 | 
| 
 | 
   439 
 | 
| 
 | 
   440 print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n";
 | 
| 
 | 
   441 print SUMMARY "#Columns:  Measure\tCount\tPercentage\n";
 | 
| 
 | 
   442 print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n";
 | 
| 
 | 
   443 print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n";
 | 
| 
 | 
   444 printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions;
 | 
| 
 | 
   445 printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100;
 | 
| 
 | 
   446 $num_targeted_snps -= $missing_count;
 | 
| 
 | 
   447 my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count;
 | 
| 
 | 
   448 printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
 | 
| 
 | 
   449 $tot_bad -= $discordant_caveat_count+$micro_fdr_estimate;
 | 
| 
 | 
   450 printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
 | 
| 
 | 
   451 printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100;
 | 
| 
 | 
   452 printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100;
 | 
| 
 | 
   453 printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100;
 | 
| 
 | 
   454 printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100;
 | 
| 
 | 
   455 printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100;
 | 
| 
 | 
   456 printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100;
 | 
| 
 | 
   457 printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100;
 | 
| 
 | 
   458 printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100;
 | 
| 
 | 
   459 for(sort keys %concordance_by_method){
 | 
| 
 | 
   460   printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100;
 | 
| 
 | 
   461 }
 | 
| 
 | 
   462 sub find_earliest_index{
 | 
| 
 | 
   463   # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
 | 
| 
 | 
   464   my ($query, $array) = @_;
 | 
| 
 | 
   465 
 | 
| 
 | 
   466   return 0 if $query < $array->[0]->[0];
 | 
| 
 | 
   467 
 | 
| 
 | 
   468   my ($left, $right, $prevCenter) = (0, $#$array, -1);
 | 
| 
 | 
   469 
 | 
| 
 | 
   470   while(1) {
 | 
| 
 | 
   471     my $center = int (($left + $right)/2);
 | 
| 
 | 
   472 
 | 
| 
 | 
   473     my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
 | 
| 
 | 
   474 
 | 
| 
 | 
   475     return $center if $cmp == 0;
 | 
| 
 | 
   476     if ($center == $prevCenter) {
 | 
| 
 | 
   477       return $left;
 | 
| 
 | 
   478     }
 | 
| 
 | 
   479     $right = $center if $cmp < 0;
 | 
| 
 | 
   480     $left  = $center if $cmp > 0;
 | 
| 
 | 
   481     $prevCenter = $center;
 | 
| 
 | 
   482   }
 | 
| 
 | 
   483 }
 |