| 0 | 1 #!/usr/bin/env perl | 
|  | 2 | 
|  | 3 use strict; | 
|  | 4 use warnings; | 
|  | 5 use File::Basename; | 
|  | 6 # Assume the microarray file looks something like this... | 
|  | 7 ## comment lines... | 
|  | 8 #Probe Set ID	11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls	dbSNP RS ID	Chromosome	Chromosomal Position | 
|  | 9 #SNP_A-1780520	GG	rs16994928	20	48440771 | 
|  | 10 #SNP_A-1780985	AG	rs3859360	18	34178190 | 
|  | 11 #...end example input file | 
|  | 12 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | 
|  | 13   print "Version 1.0\n"; | 
|  | 14   exit; | 
|  | 15 } | 
|  | 16 | 
|  | 17 # configuration file stuff | 
|  | 18 my $dirname = dirname(__FILE__); | 
|  | 19 my %config; | 
|  | 20 my $tool_data =  shift @ARGV; | 
|  | 21 if(not -e "$tool_data/microarray_report.loc"){ | 
|  | 22   system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc"); | 
|  | 23 } | 
|  | 24 open CONFIG, '<', "$tool_data/microarray_report.loc"; | 
|  | 25 while(<CONFIG>){ | 
|  | 26   (my $key, my $value) = split(/\s+/, $_); | 
|  | 27   $config{$key} = $value; | 
|  | 28 } | 
|  | 29 my $dbs_dir = $config{"dbs_directory"}; | 
|  | 30 close CONFIG; | 
|  | 31 | 
|  | 32 my $quiet = 0; | 
|  | 33 if(@ARGV and $ARGV[0] =~ /^-q/){ | 
|  | 34   $quiet = 1; | 
|  | 35   shift @ARGV; | 
|  | 36 } | 
|  | 37 | 
|  | 38 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n"; | 
|  | 39 | 
|  | 40 print STDERR "Reading in reference sequence...\n" unless $quiet; | 
|  | 41 my %seq; | 
|  | 42 open(FASTA, "$dbs_dir/$ARGV[6]" ) | 
|  | 43   or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n"; | 
|  | 44 $/ = "\n>"; | 
|  | 45 while(<FASTA>){ | 
|  | 46   chomp; | 
|  | 47   my ($name) = /^>?(\S+)/; | 
|  | 48   s/^[^\n]+//; | 
|  | 49   tr/\r\n//d; | 
|  | 50   $seq{$name} = $_; | 
|  | 51 } | 
|  | 52 close(FASTA); | 
|  | 53 $/ = "\n"; | 
|  | 54 | 
|  | 55 my $cov_cutoff = $ARGV[7]; | 
|  | 56 my %reporting; | 
|  | 57 print STDERR "Reading in reporting regions list...\n" unless $quiet; | 
|  | 58 open(GTF, $ARGV[5]) | 
|  | 59   or die "Cannot open $ARGV[5] for reading: $!\n"; | 
|  | 60 while(<GTF>){ | 
|  | 61     next if /^\s*#/; | 
|  | 62     my @fields = split /\t/, $_; | 
|  | 63 | 
|  | 64     if($fields[2] eq "exon"){ | 
|  | 65       if(not exists $reporting{$fields[0]}){ | 
|  | 66         $reporting{$fields[0]} = [[$fields[3], $fields[4]]]; | 
|  | 67         next; | 
|  | 68       } | 
|  | 69       push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]]; | 
|  | 70     } | 
|  | 71 } | 
|  | 72 close(GTF); | 
|  | 73 for my $c (keys %reporting){ | 
|  | 74   $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; | 
|  | 75 } | 
|  | 76 | 
|  | 77 my %regions; | 
|  | 78 if(@ARGV == 9){ | 
|  | 79   print STDERR "Reading in target regions list...\n" unless $quiet; | 
|  | 80   open(BED, $ARGV[8]) | 
|  | 81     or die "Cannot open $ARGV[8] for reading: $!\n"; | 
|  | 82   while(<BED>){ | 
|  | 83     chomp; | 
|  | 84     my @F = split /\t/, $_; | 
|  | 85     next unless @F > 4; | 
|  | 86     if(not exists $regions{$F[0]}){ | 
|  | 87       $regions{$F[0]} = []; | 
|  | 88     } | 
|  | 89     push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order | 
|  | 90   } | 
|  | 91 } | 
|  | 92 | 
|  | 93 open(BED, ">$ARGV[0]") | 
|  | 94   or die "Cannot open $ARGV[0] for writing: $!\n"; | 
|  | 95 print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n"; | 
|  | 96 | 
|  | 97 open(MICRO, $ARGV[3]) | 
|  | 98   or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; | 
|  | 99 my $micro_count = 1; | 
|  | 100 do {$_ = <MICRO>} while /^#/; #header lines | 
|  | 101 $micro_count++ while <MICRO>; # get a line count for progress meter later | 
|  | 102 close(MICRO); | 
|  | 103 $micro_count = int($micro_count/100); | 
|  | 104 | 
|  | 105 print STDERR "Reading in NGS calls...\n" unless $quiet; | 
|  | 106 open(MICRO, $ARGV[3]) | 
|  | 107   or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; | 
|  | 108 open(HGVS, $ARGV[2]) | 
|  | 109   or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n"; | 
|  | 110 open(SUMMARY, ">$ARGV[1]") | 
|  | 111   or die "Cannot open $ARGV[1] for writing: $!\n"; | 
|  | 112 my $bam_file = $ARGV[4]; | 
|  | 113 die "Input BAM file does not exist\n" if not -e $bam_file; | 
|  | 114 die "Input BAM file is not readable\n" if not -r $bam_file; | 
|  | 115 die "Input BAM file is empty\n" if -z $bam_file; | 
|  | 116 <HGVS>; # header | 
|  | 117 my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore); | 
|  | 118 while(<HGVS>){ | 
|  | 119   chomp; | 
|  | 120   my @F = split /\t/, $_; | 
|  | 121   my $pos_key = "$F[4]:$F[5]"; | 
|  | 122   # ignore non-SNPs | 
|  | 123   if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway | 
|  | 124     my $modlength = $2-$1; | 
|  | 125     for my $offset (0..$modlength){ | 
|  | 126       if($F[3] eq "-"){ | 
|  | 127         $ignore{"$F[4]:".($F[5]-$offset)} = 1; | 
|  | 128       } | 
|  | 129       else{ | 
|  | 130         $ignore{"$F[4]:".($F[5]+$offset)} = 1; | 
|  | 131       } | 
|  | 132     } | 
|  | 133   } | 
|  | 134   elsif($F[2] =~ /(?:del|ins|inv)/){ | 
|  | 135     next; | 
|  | 136   } | 
|  | 137   next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs | 
|  | 138   my $new_base = $1; | 
|  | 139   $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS | 
|  | 140   my $rs_key = $F[11];   # use both position and rsID as patches to hg19 have shifted some positions | 
|  | 141   my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base; | 
|  | 142   $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call; | 
|  | 143   $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/; | 
|  | 144   $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18]; | 
|  | 145   $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8]; | 
|  | 146   $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9]; | 
|  | 147   $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]]; | 
|  | 148 } | 
|  | 149 close(HGVS); | 
|  | 150 | 
|  | 151 print STDERR "Comparing to microarray calls...\n" unless $quiet; | 
|  | 152 print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet; | 
|  | 153 my $num_ignored_snps = 0; | 
|  | 154 my $num_usable_snps = 0; | 
|  | 155 my $num_targeted_snps = 0; | 
|  | 156 my %discordant; # key => [zygosity, method, caveats, variant depth, total depth]; | 
|  | 157 my %cov_missing; # List all sites with unknown coverage | 
|  | 158 do {$_ = <MICRO>} while /^#/; #header lines | 
|  | 159 while(<MICRO>){ | 
|  | 160  if(not $quiet){ | 
|  | 161    if($.%($micro_count*10) == 1){ | 
|  | 162      print STDERR "|"; | 
|  | 163    } | 
|  | 164    elsif($.%($micro_count*5) == 1){ | 
|  | 165      print STDERR ":"; | 
|  | 166    } | 
|  | 167    elsif($.%$micro_count == 1){ | 
|  | 168      print STDERR "."; | 
|  | 169    } | 
|  | 170  } | 
|  | 171  chomp; | 
|  | 172  my @F = split /\t/, $_; | 
|  | 173  my $micro_call = $F[1]; | 
|  | 174  next if $micro_call eq "---"; | 
|  | 175  my $rsid = $F[2]; | 
|  | 176  my $chr = $F[3]; | 
|  | 177  next if $chr eq "---"; | 
|  | 178  my $pos = $F[4]; | 
|  | 179  if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray | 
|  | 180    $num_ignored_snps++; | 
|  | 181    next; | 
|  | 182  } | 
|  | 183  $num_usable_snps++; | 
|  | 184 | 
|  | 185  if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation? | 
|  | 186    my $in_region = 0; | 
|  | 187    my $arrayref = $reporting{"chr".$chr}; | 
|  | 188    for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | 
|  | 189      my $interval = $arrayref->[$search_index]; | 
|  | 190      if($pos >= $interval->[0] and $pos <= $interval->[1]){ | 
|  | 191        $in_region = 1; last; | 
|  | 192      } | 
|  | 193      last if $pos < $interval->[0]; | 
|  | 194    } | 
|  | 195    next unless $in_region; | 
|  | 196  } | 
|  | 197 | 
|  | 198  if(exists $regions{"chr".$chr}){ # in the exome target region? | 
|  | 199    my $in_region = 0; | 
|  | 200    my $arrayref = $regions{"chr".$chr}; | 
|  | 201    for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | 
|  | 202      my $interval = $arrayref->[$search_index]; | 
|  | 203      if($pos >= $interval->[0] and $pos <= $interval->[1]){ | 
|  | 204        $in_region = 1; last; | 
|  | 205      } | 
|  | 206      last if $pos < $interval->[0]; | 
|  | 207    } | 
|  | 208 | 
|  | 209    next unless $in_region; | 
|  | 210  } | 
|  | 211 | 
|  | 212  $num_targeted_snps++; | 
|  | 213  my $m = substr($micro_call, 0, 1); | 
|  | 214  my $m2 = substr($micro_call, 1, 1); | 
|  | 215  my $key; | 
|  | 216  if(exists $ngs_call{$rsid}){ | 
|  | 217    $key = $rsid; | 
|  | 218  } | 
|  | 219  elsif(exists $ngs_call{"chr$chr:$pos"}){ | 
|  | 220    $key = "chr$chr:$pos"; | 
|  | 221  } | 
|  | 222  else{ | 
|  | 223    if(not exists $seq{"chr$chr"}){ | 
|  | 224      warn "Skipping microarray call, no reference sequence was provided for chr$chr\n"; | 
|  | 225      $num_targeted_snps--; | 
|  | 226      next; | 
|  | 227    } | 
|  | 228    # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC | 
|  | 229    my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1)); | 
|  | 230    if($m eq $m2){ | 
|  | 231      if($micro_call eq $ref_base.$ref_base){ | 
|  | 232        # concordant homo calls as reference | 
|  | 233        # undefs will be filled in later en masse in a single call to samtools for efficiency | 
|  | 234        $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"]; | 
|  | 235      } | 
|  | 236      else{ | 
|  | 237        # called homo var by microarray, but homo ref by NGS | 
|  | 238        #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base"; | 
|  | 239        $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | 
|  | 240      } | 
|  | 241    } | 
|  | 242    else { #$m ne $m2 | 
|  | 243      if($m eq $ref_base or $m2 eq $ref_base){ | 
|  | 244        # called het var by microarray, but homo ref by NGS | 
|  | 245        $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | 
|  | 246      } | 
|  | 247      else{ | 
|  | 248        $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | 
|  | 249      } | 
|  | 250    } | 
|  | 251    $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2]; | 
|  | 252    next; | 
|  | 253  } | 
|  | 254 | 
|  | 255  next if $ngs_depth{$key} <= $cov_cutoff; | 
|  | 256  my $ngs_call = $ngs_call{$key}; | 
|  | 257  # if we get to here, there are both NGS and micro calls | 
|  | 258  if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){ | 
|  | 259    #print STDERR "Concordant NGS call for chr$chr:$pos\n"; | 
|  | 260    if($m eq $m2){ | 
|  | 261      $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | 
|  | 262    } | 
|  | 263    else{ | 
|  | 264      $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | 
|  | 265    } | 
|  | 266    next; | 
|  | 267  } | 
|  | 268  else{ | 
|  | 269    # discordant | 
|  | 270    #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n"; | 
|  | 271      # is it just a zygosity difference? | 
|  | 272      my $n = substr($ngs_call, 0, 1); | 
|  | 273      my $n2 = substr($ngs_call, 1, 1); | 
|  | 274      my $discordance; | 
|  | 275      if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het | 
|  | 276        $discordance = "micro homo, ngs het"; | 
|  | 277      } | 
|  | 278      elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo | 
|  | 279        $discordance = "micro het, ngs homo"; | 
|  | 280      } | 
|  | 281      elsif($m eq $m2){ | 
|  | 282        if($n eq $n2){ | 
|  | 283          $discordance = "diff micro homo, ngs homo"; | 
|  | 284        } | 
|  | 285        else{ | 
|  | 286          $discordance = "diff micro homo, ngs het"; | 
|  | 287        } | 
|  | 288      } | 
|  | 289      else{ | 
|  | 290        if($n eq $n2){ | 
|  | 291          $discordance = "diff micro het, ngs homo"; | 
|  | 292        } | 
|  | 293        else{ | 
|  | 294          $discordance = "diff micro het, ngs het"; | 
|  | 295        } | 
|  | 296      } | 
|  | 297      $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | 
|  | 298  } | 
|  | 299 } | 
|  | 300 close(MICRO); | 
|  | 301 print STDERR "\n" unless $quiet; | 
|  | 302 | 
|  | 303 print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet; | 
|  | 304 my $covbed = "$$.bed"; | 
|  | 305 open(TMPCOV, ">$covbed") | 
|  | 306   or die "Cannot open temporary BED file for samtools: $!\n"; | 
|  | 307 my $num_covmissing = 0; | 
|  | 308 for(sort keys %cov_missing){ | 
|  | 309   $num_covmissing++; | 
|  | 310   my ($chr, $pos) = split /:/, $_; | 
|  | 311   print TMPCOV "$chr\t$pos\n"; | 
|  | 312 } | 
|  | 313 close(TMPCOV); | 
|  | 314 | 
|  | 315 my $cov_count = int($num_covmissing/100); | 
|  | 316 print STDERR "         10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet; | 
|  | 317 #print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n"; | 
|  | 318 #<STDIN>; | 
|  | 319 open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |") | 
|  | 320   or die "Cannot run samtools: $!\n";; | 
|  | 321 while(<SAM>){ | 
|  | 322 #while($depth_data =~ /([^\n]+)/sg){ | 
|  | 323    if(not $quiet){ | 
|  | 324      if($.%($cov_count*10) == 0){ | 
|  | 325        print STDERR "|"; | 
|  | 326      } | 
|  | 327      elsif($.%($cov_count*5) == 0){ | 
|  | 328        print STDERR ":"; | 
|  | 329      } | 
|  | 330      elsif($.%$cov_count == 0){ | 
|  | 331        print STDERR "."; | 
|  | 332      } | 
|  | 333    } | 
|  | 334 | 
|  | 335    chomp; | 
|  | 336    my @B = split /\t/, $_; | 
|  | 337    #my @B = split /\t/, $1; | 
|  | 338    my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}}; | 
|  | 339    my $depth = @B > 1 ? $B[3] : 0; | 
|  | 340    #print STDERR "Depth is $depth for $B[0]:$B[1]\n"; | 
|  | 341    my $read_calls = uc($B[4]); | 
|  | 342    my %base_tot; | 
|  | 343    for my $read_call (split //, $read_calls){ | 
|  | 344      $base_tot{$read_call}++; | 
|  | 345    } | 
|  | 346    $base_tot{$m} = 0 if not exists $base_tot{$m}; | 
|  | 347    $base_tot{$m2} = 0 if not exists $base_tot{$m2}; | 
|  | 348    if($depth <= $cov_cutoff){ | 
|  | 349      $discordant{$rsid}->[0] = "no ngs call"; | 
|  | 350      $discordant{$rsid}->[1] = "insufficient coverage"; | 
|  | 351      $discordant{$rsid}->[3] = $base_tot{$m}; | 
|  | 352      $discordant{$rsid}->[4] = $depth; | 
|  | 353    } | 
|  | 354    elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){ | 
|  | 355      # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS | 
|  | 356      $discordant{$rsid}->[3] = $base_tot{$m}; | 
|  | 357      $discordant{$rsid}->[4] = $depth; | 
|  | 358    } | 
|  | 359    elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){ | 
|  | 360      # called het var by microarray, but homo ref by NGS | 
|  | 361      $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2}; | 
|  | 362      $discordant{$rsid}->[4] = $depth; | 
|  | 363    } | 
|  | 364    else{ | 
|  | 365      print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet; | 
|  | 366    } | 
|  | 367 } | 
|  | 368 unlink $covbed; | 
|  | 369 | 
|  | 370 #Count false negatives | 
|  | 371 my $false_neg_homo_count = 0; | 
|  | 372 my $false_neg_het_count = 0; | 
|  | 373 my $missing_count = 0; | 
|  | 374 my $wrong_base_count = 0; | 
|  | 375 my $wrong_base_caveat_count = 0; | 
|  | 376 my $discordant_caveat_count = 0; | 
|  | 377 my $tot_caveat_count = 0; | 
|  | 378 my $false_homo_count = 0; | 
|  | 379 my $false_het_count = 0; | 
|  | 380 my $micro_fdr_estimate = 0; | 
|  | 381 my %calls_by_method; | 
|  | 382 my %concordance_by_method; | 
|  | 383 for my $key (keys %discordant){ | 
|  | 384   my $rec = $discordant{$key}; | 
|  | 385   my $name = $rec->[7]; | 
|  | 386   $tot_caveat_count++ if $rec->[2]; | 
|  | 387   if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){ | 
|  | 388     if($rec->[0] eq "micro homo, ngs ref"){ | 
|  | 389       $false_neg_homo_count++; | 
|  | 390     } | 
|  | 391     else{ | 
|  | 392       $false_neg_het_count++; | 
|  | 393     } | 
|  | 394     # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS | 
|  | 395     next unless defined $rec and defined $rec->[3] and defined $rec->[4]; | 
|  | 396     if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ | 
|  | 397       $micro_fdr_estimate++; | 
|  | 398       $name .= "*"; | 
|  | 399     } | 
|  | 400   } | 
|  | 401   elsif($rec->[0] eq "micro compound het, ngs ref"){ | 
|  | 402     $false_neg_het_count+=2; | 
|  | 403     # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS | 
|  | 404     next unless defined $rec and defined $rec->[3] and defined $rec->[4]; | 
|  | 405     if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ | 
|  | 406       $micro_fdr_estimate+=2; | 
|  | 407       $name .= "*"; | 
|  | 408     } | 
|  | 409   } | 
|  | 410   elsif($rec->[0] eq "no ngs call"){ | 
|  | 411     $missing_count++; | 
|  | 412   } | 
|  | 413   elsif($rec->[0] =~ /^diff/){ | 
|  | 414     $wrong_base_count++; | 
|  | 415     if($rec->[3] =~ /\S/){ | 
|  | 416       $wrong_base_caveat_count++; | 
|  | 417     } | 
|  | 418   } | 
|  | 419   elsif($rec->[0] eq "micro het, ngs homo"){ | 
|  | 420     $false_homo_count++; | 
|  | 421   } | 
|  | 422   elsif($rec->[0] eq "micro homo, ngs het"){ | 
|  | 423     $false_het_count++; | 
|  | 424   } | 
|  | 425   else{ | 
|  | 426     # calls concordant | 
|  | 427     $concordance_by_method{$rec->[1]}++; | 
|  | 428     next; | 
|  | 429   } | 
|  | 430   if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){ | 
|  | 431     $discordant_caveat_count++; | 
|  | 432   } | 
|  | 433   my $chr = $rec->[5]; | 
|  | 434   my $pos = $rec->[6]; | 
|  | 435   my $score = $rec->[4]; | 
|  | 436   $score = 1000 if $score > 1000; | 
|  | 437   print BED "$chr\t$pos\t$pos\t$name\t$score\n"; | 
|  | 438 } | 
|  | 439 | 
|  | 440 print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n"; | 
|  | 441 print SUMMARY "#Columns:  Measure\tCount\tPercentage\n"; | 
|  | 442 print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n"; | 
|  | 443 print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n"; | 
|  | 444 printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions; | 
|  | 445 printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100; | 
|  | 446 $num_targeted_snps -= $missing_count; | 
|  | 447 my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count; | 
|  | 448 printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; | 
|  | 449 $tot_bad -= $discordant_caveat_count+$micro_fdr_estimate; | 
|  | 450 printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; | 
|  | 451 printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100; | 
|  | 452 printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100; | 
|  | 453 printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100; | 
|  | 454 printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100; | 
|  | 455 printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100; | 
|  | 456 printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100; | 
|  | 457 printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100; | 
|  | 458 printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100; | 
|  | 459 for(sort keys %concordance_by_method){ | 
|  | 460   printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100; | 
|  | 461 } | 
|  | 462 sub find_earliest_index{ | 
|  | 463   # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start | 
|  | 464   my ($query, $array) = @_; | 
|  | 465 | 
|  | 466   return 0 if $query < $array->[0]->[0]; | 
|  | 467 | 
|  | 468   my ($left, $right, $prevCenter) = (0, $#$array, -1); | 
|  | 469 | 
|  | 470   while(1) { | 
|  | 471     my $center = int (($left + $right)/2); | 
|  | 472 | 
|  | 473     my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); | 
|  | 474 | 
|  | 475     return $center if $cmp == 0; | 
|  | 476     if ($center == $prevCenter) { | 
|  | 477       return $left; | 
|  | 478     } | 
|  | 479     $right = $center if $cmp < 0; | 
|  | 480     $left  = $center if $cmp > 0; | 
|  | 481     $prevCenter = $center; | 
|  | 482   } | 
|  | 483 } |