Mercurial > repos > yusuf > microarray_report
comparison microarray_reports @ 0:882d119ede1f default tip
initial commit
| author | Yusuf Ali <ali@yusuf.email> |
|---|---|
| date | Wed, 25 Mar 2015 13:40:00 -0600 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:882d119ede1f |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 use strict; | |
| 4 use warnings; | |
| 5 use File::Basename; | |
| 6 # Assume the microarray file looks something like this... | |
| 7 ## comment lines... | |
| 8 #Probe Set ID 11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls dbSNP RS ID Chromosome Chromosomal Position | |
| 9 #SNP_A-1780520 GG rs16994928 20 48440771 | |
| 10 #SNP_A-1780985 AG rs3859360 18 34178190 | |
| 11 #...end example input file | |
| 12 if(@ARGV == 1 and $ARGV[0] eq "-v"){ | |
| 13 print "Version 1.0\n"; | |
| 14 exit; | |
| 15 } | |
| 16 | |
| 17 # configuration file stuff | |
| 18 my $dirname = dirname(__FILE__); | |
| 19 my %config; | |
| 20 my $tool_data = shift @ARGV; | |
| 21 if(not -e "$tool_data/microarray_report.loc"){ | |
| 22 system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc"); | |
| 23 } | |
| 24 open CONFIG, '<', "$tool_data/microarray_report.loc"; | |
| 25 while(<CONFIG>){ | |
| 26 (my $key, my $value) = split(/\s+/, $_); | |
| 27 $config{$key} = $value; | |
| 28 } | |
| 29 my $dbs_dir = $config{"dbs_directory"}; | |
| 30 close CONFIG; | |
| 31 | |
| 32 my $quiet = 0; | |
| 33 if(@ARGV and $ARGV[0] =~ /^-q/){ | |
| 34 $quiet = 1; | |
| 35 shift @ARGV; | |
| 36 } | |
| 37 | |
| 38 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n"; | |
| 39 | |
| 40 print STDERR "Reading in reference sequence...\n" unless $quiet; | |
| 41 my %seq; | |
| 42 open(FASTA, "$dbs_dir/$ARGV[6]" ) | |
| 43 or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n"; | |
| 44 $/ = "\n>"; | |
| 45 while(<FASTA>){ | |
| 46 chomp; | |
| 47 my ($name) = /^>?(\S+)/; | |
| 48 s/^[^\n]+//; | |
| 49 tr/\r\n//d; | |
| 50 $seq{$name} = $_; | |
| 51 } | |
| 52 close(FASTA); | |
| 53 $/ = "\n"; | |
| 54 | |
| 55 my $cov_cutoff = $ARGV[7]; | |
| 56 my %reporting; | |
| 57 print STDERR "Reading in reporting regions list...\n" unless $quiet; | |
| 58 open(GTF, $ARGV[5]) | |
| 59 or die "Cannot open $ARGV[5] for reading: $!\n"; | |
| 60 while(<GTF>){ | |
| 61 next if /^\s*#/; | |
| 62 my @fields = split /\t/, $_; | |
| 63 | |
| 64 if($fields[2] eq "exon"){ | |
| 65 if(not exists $reporting{$fields[0]}){ | |
| 66 $reporting{$fields[0]} = [[$fields[3], $fields[4]]]; | |
| 67 next; | |
| 68 } | |
| 69 push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]]; | |
| 70 } | |
| 71 } | |
| 72 close(GTF); | |
| 73 for my $c (keys %reporting){ | |
| 74 $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; | |
| 75 } | |
| 76 | |
| 77 my %regions; | |
| 78 if(@ARGV == 9){ | |
| 79 print STDERR "Reading in target regions list...\n" unless $quiet; | |
| 80 open(BED, $ARGV[8]) | |
| 81 or die "Cannot open $ARGV[8] for reading: $!\n"; | |
| 82 while(<BED>){ | |
| 83 chomp; | |
| 84 my @F = split /\t/, $_; | |
| 85 next unless @F > 4; | |
| 86 if(not exists $regions{$F[0]}){ | |
| 87 $regions{$F[0]} = []; | |
| 88 } | |
| 89 push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order | |
| 90 } | |
| 91 } | |
| 92 | |
| 93 open(BED, ">$ARGV[0]") | |
| 94 or die "Cannot open $ARGV[0] for writing: $!\n"; | |
| 95 print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n"; | |
| 96 | |
| 97 open(MICRO, $ARGV[3]) | |
| 98 or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; | |
| 99 my $micro_count = 1; | |
| 100 do {$_ = <MICRO>} while /^#/; #header lines | |
| 101 $micro_count++ while <MICRO>; # get a line count for progress meter later | |
| 102 close(MICRO); | |
| 103 $micro_count = int($micro_count/100); | |
| 104 | |
| 105 print STDERR "Reading in NGS calls...\n" unless $quiet; | |
| 106 open(MICRO, $ARGV[3]) | |
| 107 or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; | |
| 108 open(HGVS, $ARGV[2]) | |
| 109 or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n"; | |
| 110 open(SUMMARY, ">$ARGV[1]") | |
| 111 or die "Cannot open $ARGV[1] for writing: $!\n"; | |
| 112 my $bam_file = $ARGV[4]; | |
| 113 die "Input BAM file does not exist\n" if not -e $bam_file; | |
| 114 die "Input BAM file is not readable\n" if not -r $bam_file; | |
| 115 die "Input BAM file is empty\n" if -z $bam_file; | |
| 116 <HGVS>; # header | |
| 117 my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore); | |
| 118 while(<HGVS>){ | |
| 119 chomp; | |
| 120 my @F = split /\t/, $_; | |
| 121 my $pos_key = "$F[4]:$F[5]"; | |
| 122 # ignore non-SNPs | |
| 123 if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway | |
| 124 my $modlength = $2-$1; | |
| 125 for my $offset (0..$modlength){ | |
| 126 if($F[3] eq "-"){ | |
| 127 $ignore{"$F[4]:".($F[5]-$offset)} = 1; | |
| 128 } | |
| 129 else{ | |
| 130 $ignore{"$F[4]:".($F[5]+$offset)} = 1; | |
| 131 } | |
| 132 } | |
| 133 } | |
| 134 elsif($F[2] =~ /(?:del|ins|inv)/){ | |
| 135 next; | |
| 136 } | |
| 137 next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs | |
| 138 my $new_base = $1; | |
| 139 $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS | |
| 140 my $rs_key = $F[11]; # use both position and rsID as patches to hg19 have shifted some positions | |
| 141 my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base; | |
| 142 $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call; | |
| 143 $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/; | |
| 144 $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18]; | |
| 145 $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8]; | |
| 146 $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9]; | |
| 147 $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]]; | |
| 148 } | |
| 149 close(HGVS); | |
| 150 | |
| 151 print STDERR "Comparing to microarray calls...\n" unless $quiet; | |
| 152 print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; | |
| 153 my $num_ignored_snps = 0; | |
| 154 my $num_usable_snps = 0; | |
| 155 my $num_targeted_snps = 0; | |
| 156 my %discordant; # key => [zygosity, method, caveats, variant depth, total depth]; | |
| 157 my %cov_missing; # List all sites with unknown coverage | |
| 158 do {$_ = <MICRO>} while /^#/; #header lines | |
| 159 while(<MICRO>){ | |
| 160 if(not $quiet){ | |
| 161 if($.%($micro_count*10) == 1){ | |
| 162 print STDERR "|"; | |
| 163 } | |
| 164 elsif($.%($micro_count*5) == 1){ | |
| 165 print STDERR ":"; | |
| 166 } | |
| 167 elsif($.%$micro_count == 1){ | |
| 168 print STDERR "."; | |
| 169 } | |
| 170 } | |
| 171 chomp; | |
| 172 my @F = split /\t/, $_; | |
| 173 my $micro_call = $F[1]; | |
| 174 next if $micro_call eq "---"; | |
| 175 my $rsid = $F[2]; | |
| 176 my $chr = $F[3]; | |
| 177 next if $chr eq "---"; | |
| 178 my $pos = $F[4]; | |
| 179 if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray | |
| 180 $num_ignored_snps++; | |
| 181 next; | |
| 182 } | |
| 183 $num_usable_snps++; | |
| 184 | |
| 185 if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation? | |
| 186 my $in_region = 0; | |
| 187 my $arrayref = $reporting{"chr".$chr}; | |
| 188 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | |
| 189 my $interval = $arrayref->[$search_index]; | |
| 190 if($pos >= $interval->[0] and $pos <= $interval->[1]){ | |
| 191 $in_region = 1; last; | |
| 192 } | |
| 193 last if $pos < $interval->[0]; | |
| 194 } | |
| 195 next unless $in_region; | |
| 196 } | |
| 197 | |
| 198 if(exists $regions{"chr".$chr}){ # in the exome target region? | |
| 199 my $in_region = 0; | |
| 200 my $arrayref = $regions{"chr".$chr}; | |
| 201 for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ | |
| 202 my $interval = $arrayref->[$search_index]; | |
| 203 if($pos >= $interval->[0] and $pos <= $interval->[1]){ | |
| 204 $in_region = 1; last; | |
| 205 } | |
| 206 last if $pos < $interval->[0]; | |
| 207 } | |
| 208 | |
| 209 next unless $in_region; | |
| 210 } | |
| 211 | |
| 212 $num_targeted_snps++; | |
| 213 my $m = substr($micro_call, 0, 1); | |
| 214 my $m2 = substr($micro_call, 1, 1); | |
| 215 my $key; | |
| 216 if(exists $ngs_call{$rsid}){ | |
| 217 $key = $rsid; | |
| 218 } | |
| 219 elsif(exists $ngs_call{"chr$chr:$pos"}){ | |
| 220 $key = "chr$chr:$pos"; | |
| 221 } | |
| 222 else{ | |
| 223 if(not exists $seq{"chr$chr"}){ | |
| 224 warn "Skipping microarray call, no reference sequence was provided for chr$chr\n"; | |
| 225 $num_targeted_snps--; | |
| 226 next; | |
| 227 } | |
| 228 # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC | |
| 229 my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1)); | |
| 230 if($m eq $m2){ | |
| 231 if($micro_call eq $ref_base.$ref_base){ | |
| 232 # concordant homo calls as reference | |
| 233 # undefs will be filled in later en masse in a single call to samtools for efficiency | |
| 234 $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"]; | |
| 235 } | |
| 236 else{ | |
| 237 # called homo var by microarray, but homo ref by NGS | |
| 238 #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base"; | |
| 239 $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | |
| 240 } | |
| 241 } | |
| 242 else { #$m ne $m2 | |
| 243 if($m eq $ref_base or $m2 eq $ref_base){ | |
| 244 # called het var by microarray, but homo ref by NGS | |
| 245 $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | |
| 246 } | |
| 247 else{ | |
| 248 $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; | |
| 249 } | |
| 250 } | |
| 251 $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2]; | |
| 252 next; | |
| 253 } | |
| 254 | |
| 255 next if $ngs_depth{$key} <= $cov_cutoff; | |
| 256 my $ngs_call = $ngs_call{$key}; | |
| 257 # if we get to here, there are both NGS and micro calls | |
| 258 if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){ | |
| 259 #print STDERR "Concordant NGS call for chr$chr:$pos\n"; | |
| 260 if($m eq $m2){ | |
| 261 $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | |
| 262 } | |
| 263 else{ | |
| 264 $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | |
| 265 } | |
| 266 next; | |
| 267 } | |
| 268 else{ | |
| 269 # discordant | |
| 270 #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n"; | |
| 271 # is it just a zygosity difference? | |
| 272 my $n = substr($ngs_call, 0, 1); | |
| 273 my $n2 = substr($ngs_call, 1, 1); | |
| 274 my $discordance; | |
| 275 if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het | |
| 276 $discordance = "micro homo, ngs het"; | |
| 277 } | |
| 278 elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo | |
| 279 $discordance = "micro het, ngs homo"; | |
| 280 } | |
| 281 elsif($m eq $m2){ | |
| 282 if($n eq $n2){ | |
| 283 $discordance = "diff micro homo, ngs homo"; | |
| 284 } | |
| 285 else{ | |
| 286 $discordance = "diff micro homo, ngs het"; | |
| 287 } | |
| 288 } | |
| 289 else{ | |
| 290 if($n eq $n2){ | |
| 291 $discordance = "diff micro het, ngs homo"; | |
| 292 } | |
| 293 else{ | |
| 294 $discordance = "diff micro het, ngs het"; | |
| 295 } | |
| 296 } | |
| 297 $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; | |
| 298 } | |
| 299 } | |
| 300 close(MICRO); | |
| 301 print STDERR "\n" unless $quiet; | |
| 302 | |
| 303 print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet; | |
| 304 my $covbed = "$$.bed"; | |
| 305 open(TMPCOV, ">$covbed") | |
| 306 or die "Cannot open temporary BED file for samtools: $!\n"; | |
| 307 my $num_covmissing = 0; | |
| 308 for(sort keys %cov_missing){ | |
| 309 $num_covmissing++; | |
| 310 my ($chr, $pos) = split /:/, $_; | |
| 311 print TMPCOV "$chr\t$pos\n"; | |
| 312 } | |
| 313 close(TMPCOV); | |
| 314 | |
| 315 my $cov_count = int($num_covmissing/100); | |
| 316 print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; | |
| 317 #print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n"; | |
| 318 #<STDIN>; | |
| 319 open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |") | |
| 320 or die "Cannot run samtools: $!\n";; | |
| 321 while(<SAM>){ | |
| 322 #while($depth_data =~ /([^\n]+)/sg){ | |
| 323 if(not $quiet){ | |
| 324 if($.%($cov_count*10) == 0){ | |
| 325 print STDERR "|"; | |
| 326 } | |
| 327 elsif($.%($cov_count*5) == 0){ | |
| 328 print STDERR ":"; | |
| 329 } | |
| 330 elsif($.%$cov_count == 0){ | |
| 331 print STDERR "."; | |
| 332 } | |
| 333 } | |
| 334 | |
| 335 chomp; | |
| 336 my @B = split /\t/, $_; | |
| 337 #my @B = split /\t/, $1; | |
| 338 my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}}; | |
| 339 my $depth = @B > 1 ? $B[3] : 0; | |
| 340 #print STDERR "Depth is $depth for $B[0]:$B[1]\n"; | |
| 341 my $read_calls = uc($B[4]); | |
| 342 my %base_tot; | |
| 343 for my $read_call (split //, $read_calls){ | |
| 344 $base_tot{$read_call}++; | |
| 345 } | |
| 346 $base_tot{$m} = 0 if not exists $base_tot{$m}; | |
| 347 $base_tot{$m2} = 0 if not exists $base_tot{$m2}; | |
| 348 if($depth <= $cov_cutoff){ | |
| 349 $discordant{$rsid}->[0] = "no ngs call"; | |
| 350 $discordant{$rsid}->[1] = "insufficient coverage"; | |
| 351 $discordant{$rsid}->[3] = $base_tot{$m}; | |
| 352 $discordant{$rsid}->[4] = $depth; | |
| 353 } | |
| 354 elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){ | |
| 355 # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS | |
| 356 $discordant{$rsid}->[3] = $base_tot{$m}; | |
| 357 $discordant{$rsid}->[4] = $depth; | |
| 358 } | |
| 359 elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){ | |
| 360 # called het var by microarray, but homo ref by NGS | |
| 361 $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2}; | |
| 362 $discordant{$rsid}->[4] = $depth; | |
| 363 } | |
| 364 else{ | |
| 365 print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet; | |
| 366 } | |
| 367 } | |
| 368 unlink $covbed; | |
| 369 | |
| 370 #Count false negatives | |
| 371 my $false_neg_homo_count = 0; | |
| 372 my $false_neg_het_count = 0; | |
| 373 my $missing_count = 0; | |
| 374 my $wrong_base_count = 0; | |
| 375 my $wrong_base_caveat_count = 0; | |
| 376 my $discordant_caveat_count = 0; | |
| 377 my $tot_caveat_count = 0; | |
| 378 my $false_homo_count = 0; | |
| 379 my $false_het_count = 0; | |
| 380 my $micro_fdr_estimate = 0; | |
| 381 my %calls_by_method; | |
| 382 my %concordance_by_method; | |
| 383 for my $key (keys %discordant){ | |
| 384 my $rec = $discordant{$key}; | |
| 385 my $name = $rec->[7]; | |
| 386 $tot_caveat_count++ if $rec->[2]; | |
| 387 if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){ | |
| 388 if($rec->[0] eq "micro homo, ngs ref"){ | |
| 389 $false_neg_homo_count++; | |
| 390 } | |
| 391 else{ | |
| 392 $false_neg_het_count++; | |
| 393 } | |
| 394 # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS | |
| 395 next unless defined $rec and defined $rec->[3] and defined $rec->[4]; | |
| 396 if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ | |
| 397 $micro_fdr_estimate++; | |
| 398 $name .= "*"; | |
| 399 } | |
| 400 } | |
| 401 elsif($rec->[0] eq "micro compound het, ngs ref"){ | |
| 402 $false_neg_het_count+=2; | |
| 403 # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS | |
| 404 next unless defined $rec and defined $rec->[3] and defined $rec->[4]; | |
| 405 if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ | |
| 406 $micro_fdr_estimate+=2; | |
| 407 $name .= "*"; | |
| 408 } | |
| 409 } | |
| 410 elsif($rec->[0] eq "no ngs call"){ | |
| 411 $missing_count++; | |
| 412 } | |
| 413 elsif($rec->[0] =~ /^diff/){ | |
| 414 $wrong_base_count++; | |
| 415 if($rec->[3] =~ /\S/){ | |
| 416 $wrong_base_caveat_count++; | |
| 417 } | |
| 418 } | |
| 419 elsif($rec->[0] eq "micro het, ngs homo"){ | |
| 420 $false_homo_count++; | |
| 421 } | |
| 422 elsif($rec->[0] eq "micro homo, ngs het"){ | |
| 423 $false_het_count++; | |
| 424 } | |
| 425 else{ | |
| 426 # calls concordant | |
| 427 $concordance_by_method{$rec->[1]}++; | |
| 428 next; | |
| 429 } | |
| 430 if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){ | |
| 431 $discordant_caveat_count++; | |
| 432 } | |
| 433 my $chr = $rec->[5]; | |
| 434 my $pos = $rec->[6]; | |
| 435 my $score = $rec->[4]; | |
| 436 $score = 1000 if $score > 1000; | |
| 437 print BED "$chr\t$pos\t$pos\t$name\t$score\n"; | |
| 438 } | |
| 439 | |
| 440 print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n"; | |
| 441 print SUMMARY "#Columns: Measure\tCount\tPercentage\n"; | |
| 442 print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n"; | |
| 443 print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n"; | |
| 444 printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions; | |
| 445 printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100; | |
| 446 $num_targeted_snps -= $missing_count; | |
| 447 my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count; | |
| 448 printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; | |
| 449 $tot_bad -= $discordant_caveat_count+$micro_fdr_estimate; | |
| 450 printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; | |
| 451 printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100; | |
| 452 printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100; | |
| 453 printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100; | |
| 454 printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100; | |
| 455 printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100; | |
| 456 printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100; | |
| 457 printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100; | |
| 458 printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100; | |
| 459 for(sort keys %concordance_by_method){ | |
| 460 printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100; | |
| 461 } | |
| 462 sub find_earliest_index{ | |
| 463 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start | |
| 464 my ($query, $array) = @_; | |
| 465 | |
| 466 return 0 if $query < $array->[0]->[0]; | |
| 467 | |
| 468 my ($left, $right, $prevCenter) = (0, $#$array, -1); | |
| 469 | |
| 470 while(1) { | |
| 471 my $center = int (($left + $right)/2); | |
| 472 | |
| 473 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); | |
| 474 | |
| 475 return $center if $cmp == 0; | |
| 476 if ($center == $prevCenter) { | |
| 477 return $left; | |
| 478 } | |
| 479 $right = $center if $cmp < 0; | |
| 480 $left = $center if $cmp > 0; | |
| 481 $prevCenter = $center; | |
| 482 } | |
| 483 } |
