# HG changeset patch # User Yusuf Ali # Date 1427312400 21600 # Node ID 882d119ede1fcd71b696820adff1441b6f5f3441 initial commit diff -r 000000000000 -r 882d119ede1f MicroarrayReports.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/MicroarrayReports.xml Wed Mar 25 13:40:00 2015 -0600 @@ -0,0 +1,44 @@ + + + + over targeted regions in a genome + microarray_reports -v + microarray_reports $__tool_data_path__ -q $out_discordant_bed $out_summary_table $input_ngs_calls $input_micro_calls $input_bam $target_gtf $ref_genome.fa $coverage_cutoff $target_bed + + + + + + + + + + + + + + + + + + +**What it does** + +This tool reports several statistics comparing the genotypes derived from a next-gen sequencing run to a microarray genotyping result +for the same individual. When coverage is good (10x for colorspace, 20x for base space), false positive and false negative rates should +be below 2%, assuming 99% accuracy of both the microarray and the NGS genotyping. + +------ + +** Example** + +The microarray file should have five columns (ProbeID, forward strand genotypes, dbSNP ID, chr, pos), e.g.:: + + #Comment lines... + Probe set header line + SNP_A-1780520 GG rs16994928 20 48440771 + SNP_A-1780985 AG rs3859360 18 34178190 + + + diff -r 000000000000 -r 882d119ede1f microarray_reports --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/microarray_reports Wed Mar 25 13:40:00 2015 -0600 @@ -0,0 +1,483 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Basename; +# Assume the microarray file looks something like this... +## comment lines... +#Probe Set ID 11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls dbSNP RS ID Chromosome Chromosomal Position +#SNP_A-1780520 GG rs16994928 20 48440771 +#SNP_A-1780985 AG rs3859360 18 34178190 +#...end example input file +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +# configuration file stuff +my $dirname = dirname(__FILE__); +my %config; +my $tool_data = shift @ARGV; +if(not -e "$tool_data/microarray_report.loc"){ + system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc"); +} +open CONFIG, '<', "$tool_data/microarray_report.loc"; +while(){ + (my $key, my $value) = split(/\s+/, $_); + $config{$key} = $value; +} +my $dbs_dir = $config{"dbs_directory"}; +close CONFIG; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q/){ + $quiet = 1; + shift @ARGV; +} + +@ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] [exome target ngs regions.bed]\n"; + +print STDERR "Reading in reference sequence...\n" unless $quiet; +my %seq; +open(FASTA, "$dbs_dir/$ARGV[6]" ) + or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n"; +$/ = "\n>"; +while(){ + chomp; + my ($name) = /^>?(\S+)/; + s/^[^\n]+//; + tr/\r\n//d; + $seq{$name} = $_; +} +close(FASTA); +$/ = "\n"; + +my $cov_cutoff = $ARGV[7]; +my %reporting; +print STDERR "Reading in reporting regions list...\n" unless $quiet; +open(GTF, $ARGV[5]) + or die "Cannot open $ARGV[5] for reading: $!\n"; +while(){ + next if /^\s*#/; + my @fields = split /\t/, $_; + + if($fields[2] eq "exon"){ + if(not exists $reporting{$fields[0]}){ + $reporting{$fields[0]} = [[$fields[3], $fields[4]]]; + next; + } + push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]]; + } +} +close(GTF); +for my $c (keys %reporting){ + $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}]; +} + +my %regions; +if(@ARGV == 9){ + print STDERR "Reading in target regions list...\n" unless $quiet; + open(BED, $ARGV[8]) + or die "Cannot open $ARGV[8] for reading: $!\n"; + while(){ + chomp; + my @F = split /\t/, $_; + next unless @F > 4; + if(not exists $regions{$F[0]}){ + $regions{$F[0]} = []; + } + push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order + } +} + +open(BED, ">$ARGV[0]") + or die "Cannot open $ARGV[0] for writing: $!\n"; +print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n"; + +open(MICRO, $ARGV[3]) + or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; +my $micro_count = 1; +do {$_ = } while /^#/; #header lines +$micro_count++ while ; # get a line count for progress meter later +close(MICRO); +$micro_count = int($micro_count/100); + +print STDERR "Reading in NGS calls...\n" unless $quiet; +open(MICRO, $ARGV[3]) + or die "Cannot open microarray calls $ARGV[3] for reading: $!\n"; +open(HGVS, $ARGV[2]) + or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n"; +open(SUMMARY, ">$ARGV[1]") + or die "Cannot open $ARGV[1] for writing: $!\n"; +my $bam_file = $ARGV[4]; +die "Input BAM file does not exist\n" if not -e $bam_file; +die "Input BAM file is not readable\n" if not -r $bam_file; +die "Input BAM file is empty\n" if -z $bam_file; +; # header +my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore); +while(){ + chomp; + my @F = split /\t/, $_; + my $pos_key = "$F[4]:$F[5]"; + # ignore non-SNPs + if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway + my $modlength = $2-$1; + for my $offset (0..$modlength){ + if($F[3] eq "-"){ + $ignore{"$F[4]:".($F[5]-$offset)} = 1; + } + else{ + $ignore{"$F[4]:".($F[5]+$offset)} = 1; + } + } + } + elsif($F[2] =~ /(?:del|ins|inv)/){ + next; + } + next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs + my $new_base = $1; + $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS + my $rs_key = $F[11]; # use both position and rsID as patches to hg19 have shifted some positions + my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base; + $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call; + $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/; + $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18]; + $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8]; + $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9]; + $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]]; +} +close(HGVS); + +print STDERR "Comparing to microarray calls...\n" unless $quiet; +print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; +my $num_ignored_snps = 0; +my $num_usable_snps = 0; +my $num_targeted_snps = 0; +my %discordant; # key => [zygosity, method, caveats, variant depth, total depth]; +my %cov_missing; # List all sites with unknown coverage +do {$_ = } while /^#/; #header lines +while(){ + if(not $quiet){ + if($.%($micro_count*10) == 1){ + print STDERR "|"; + } + elsif($.%($micro_count*5) == 1){ + print STDERR ":"; + } + elsif($.%$micro_count == 1){ + print STDERR "."; + } + } + chomp; + my @F = split /\t/, $_; + my $micro_call = $F[1]; + next if $micro_call eq "---"; + my $rsid = $F[2]; + my $chr = $F[3]; + next if $chr eq "---"; + my $pos = $F[4]; + if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray + $num_ignored_snps++; + next; + } + $num_usable_snps++; + + if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation? + my $in_region = 0; + my $arrayref = $reporting{"chr".$chr}; + for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ + my $interval = $arrayref->[$search_index]; + if($pos >= $interval->[0] and $pos <= $interval->[1]){ + $in_region = 1; last; + } + last if $pos < $interval->[0]; + } + next unless $in_region; + } + + if(exists $regions{"chr".$chr}){ # in the exome target region? + my $in_region = 0; + my $arrayref = $regions{"chr".$chr}; + for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){ + my $interval = $arrayref->[$search_index]; + if($pos >= $interval->[0] and $pos <= $interval->[1]){ + $in_region = 1; last; + } + last if $pos < $interval->[0]; + } + + next unless $in_region; + } + + $num_targeted_snps++; + my $m = substr($micro_call, 0, 1); + my $m2 = substr($micro_call, 1, 1); + my $key; + if(exists $ngs_call{$rsid}){ + $key = $rsid; + } + elsif(exists $ngs_call{"chr$chr:$pos"}){ + $key = "chr$chr:$pos"; + } + else{ + if(not exists $seq{"chr$chr"}){ + warn "Skipping microarray call, no reference sequence was provided for chr$chr\n"; + $num_targeted_snps--; + next; + } + # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC + my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1)); + if($m eq $m2){ + if($micro_call eq $ref_base.$ref_base){ + # concordant homo calls as reference + # undefs will be filled in later en masse in a single call to samtools for efficiency + $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"]; + } + else{ + # called homo var by microarray, but homo ref by NGS + #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base"; + $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; + } + } + else { #$m ne $m2 + if($m eq $ref_base or $m2 eq $ref_base){ + # called het var by microarray, but homo ref by NGS + $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; + } + else{ + $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"]; + } + } + $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2]; + next; + } + + next if $ngs_depth{$key} <= $cov_cutoff; + my $ngs_call = $ngs_call{$key}; + # if we get to here, there are both NGS and micro calls + if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){ + #print STDERR "Concordant NGS call for chr$chr:$pos\n"; + if($m eq $m2){ + $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; + } + else{ + $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; + } + next; + } + else{ + # discordant + #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n"; + # is it just a zygosity difference? + my $n = substr($ngs_call, 0, 1); + my $n2 = substr($ngs_call, 1, 1); + my $discordance; + if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het + $discordance = "micro homo, ngs het"; + } + elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo + $discordance = "micro het, ngs homo"; + } + elsif($m eq $m2){ + if($n eq $n2){ + $discordance = "diff micro homo, ngs homo"; + } + else{ + $discordance = "diff micro homo, ngs het"; + } + } + else{ + if($n eq $n2){ + $discordance = "diff micro het, ngs homo"; + } + else{ + $discordance = "diff micro het, ngs het"; + } + } + $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"]; + } +} +close(MICRO); +print STDERR "\n" unless $quiet; + +print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet; +my $covbed = "$$.bed"; +open(TMPCOV, ">$covbed") + or die "Cannot open temporary BED file for samtools: $!\n"; +my $num_covmissing = 0; +for(sort keys %cov_missing){ + $num_covmissing++; + my ($chr, $pos) = split /:/, $_; + print TMPCOV "$chr\t$pos\n"; +} +close(TMPCOV); + +my $cov_count = int($num_covmissing/100); +print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; +#print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n"; +#; +open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |") + or die "Cannot run samtools: $!\n";; +while(){ +#while($depth_data =~ /([^\n]+)/sg){ + if(not $quiet){ + if($.%($cov_count*10) == 0){ + print STDERR "|"; + } + elsif($.%($cov_count*5) == 0){ + print STDERR ":"; + } + elsif($.%$cov_count == 0){ + print STDERR "."; + } + } + + chomp; + my @B = split /\t/, $_; + #my @B = split /\t/, $1; + my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}}; + my $depth = @B > 1 ? $B[3] : 0; + #print STDERR "Depth is $depth for $B[0]:$B[1]\n"; + my $read_calls = uc($B[4]); + my %base_tot; + for my $read_call (split //, $read_calls){ + $base_tot{$read_call}++; + } + $base_tot{$m} = 0 if not exists $base_tot{$m}; + $base_tot{$m2} = 0 if not exists $base_tot{$m2}; + if($depth <= $cov_cutoff){ + $discordant{$rsid}->[0] = "no ngs call"; + $discordant{$rsid}->[1] = "insufficient coverage"; + $discordant{$rsid}->[3] = $base_tot{$m}; + $discordant{$rsid}->[4] = $depth; + } + elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){ + # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS + $discordant{$rsid}->[3] = $base_tot{$m}; + $discordant{$rsid}->[4] = $depth; + } + elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){ + # called het var by microarray, but homo ref by NGS + $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2}; + $discordant{$rsid}->[4] = $depth; + } + else{ + print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet; + } +} +unlink $covbed; + +#Count false negatives +my $false_neg_homo_count = 0; +my $false_neg_het_count = 0; +my $missing_count = 0; +my $wrong_base_count = 0; +my $wrong_base_caveat_count = 0; +my $discordant_caveat_count = 0; +my $tot_caveat_count = 0; +my $false_homo_count = 0; +my $false_het_count = 0; +my $micro_fdr_estimate = 0; +my %calls_by_method; +my %concordance_by_method; +for my $key (keys %discordant){ + my $rec = $discordant{$key}; + my $name = $rec->[7]; + $tot_caveat_count++ if $rec->[2]; + if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){ + if($rec->[0] eq "micro homo, ngs ref"){ + $false_neg_homo_count++; + } + else{ + $false_neg_het_count++; + } + # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS + next unless defined $rec and defined $rec->[3] and defined $rec->[4]; + if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ + $micro_fdr_estimate++; + $name .= "*"; + } + } + elsif($rec->[0] eq "micro compound het, ngs ref"){ + $false_neg_het_count+=2; + # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS + next unless defined $rec and defined $rec->[3] and defined $rec->[4]; + if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){ + $micro_fdr_estimate+=2; + $name .= "*"; + } + } + elsif($rec->[0] eq "no ngs call"){ + $missing_count++; + } + elsif($rec->[0] =~ /^diff/){ + $wrong_base_count++; + if($rec->[3] =~ /\S/){ + $wrong_base_caveat_count++; + } + } + elsif($rec->[0] eq "micro het, ngs homo"){ + $false_homo_count++; + } + elsif($rec->[0] eq "micro homo, ngs het"){ + $false_het_count++; + } + else{ + # calls concordant + $concordance_by_method{$rec->[1]}++; + next; + } + if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){ + $discordant_caveat_count++; + } + my $chr = $rec->[5]; + my $pos = $rec->[6]; + my $score = $rec->[4]; + $score = 1000 if $score > 1000; + print BED "$chr\t$pos\t$pos\t$name\t$score\n"; +} + +print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n"; +print SUMMARY "#Columns: Measure\tCount\tPercentage\n"; +print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n"; +print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n"; +printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions; +printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100; +$num_targeted_snps -= $missing_count; +my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count; +printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; +$tot_bad -= $discordant_caveat_count+$micro_fdr_estimate; +printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100; +printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100; +printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100; +printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100; +printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100; +printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100; +printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100; +printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100; +printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100; +for(sort keys %concordance_by_method){ + printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100; +} +sub find_earliest_index{ + # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start + my ($query, $array) = @_; + + return 0 if $query < $array->[0]->[0]; + + my ($left, $right, $prevCenter) = (0, $#$array, -1); + + while(1) { + my $center = int (($left + $right)/2); + + my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1); + + return $center if $cmp == 0; + if ($center == $prevCenter) { + return $left; + } + $right = $center if $cmp < 0; + $left = $center if $cmp > 0; + $prevCenter = $center; + } +} diff -r 000000000000 -r 882d119ede1f tool-data/microarray_report.loc --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/microarray_report.loc Wed Mar 25 13:40:00 2015 -0600 @@ -0,0 +1,1 @@ +dbs_directory /export/achri_galaxy/dbs/