annotate microarray_reports @ 0:882d119ede1f default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:40:00 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use File::Basename;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 # Assume the microarray file looks something like this...
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 ## comment lines...
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 #Probe Set ID 11-01451_(GenomeWideSNP_5).brlmm-p.chp Forward Strand Base Calls dbSNP RS ID Chromosome Chromosomal Position
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 #SNP_A-1780520 GG rs16994928 20 48440771
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 #SNP_A-1780985 AG rs3859360 18 34178190
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 #...end example input file
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 if(@ARGV == 1 and $ARGV[0] eq "-v"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 print "Version 1.0\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 exit;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 # configuration file stuff
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my $dirname = dirname(__FILE__);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my %config;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $tool_data = shift @ARGV;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 if(not -e "$tool_data/microarray_report.loc"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 system("cp $dirname/tool-data/microarray_report.loc $tool_data/microarray_report.loc");
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 open CONFIG, '<', "$tool_data/microarray_report.loc";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 while(<CONFIG>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 (my $key, my $value) = split(/\s+/, $_);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 $config{$key} = $value;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 my $dbs_dir = $config{"dbs_directory"};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 close CONFIG;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 my $quiet = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 if(@ARGV and $ARGV[0] =~ /^-q/){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 $quiet = 1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 shift @ARGV;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 @ARGV == 8 or @ARGV == 9 or die "Usage: $0 [-q(uiet)] <output discordant.bed> <output summary.txt> <input ngs hgvs.txt> <input microarray calls.txt> <input.bam> <reporting cds regions.gtf> <reference genome.fasta> <coverage cutoff for stats> [exome target ngs regions.bed]\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 print STDERR "Reading in reference sequence...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 my %seq;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 open(FASTA, "$dbs_dir/$ARGV[6]" )
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 or die "Cannot open $dbs_dir/$ARGV[6] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 $/ = "\n>";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 while(<FASTA>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 chomp;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 my ($name) = /^>?(\S+)/;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 s/^[^\n]+//;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 tr/\r\n//d;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 $seq{$name} = $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 close(FASTA);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 $/ = "\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 my $cov_cutoff = $ARGV[7];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my %reporting;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 print STDERR "Reading in reporting regions list...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 open(GTF, $ARGV[5])
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 or die "Cannot open $ARGV[5] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 while(<GTF>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 next if /^\s*#/;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 my @fields = split /\t/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 if($fields[2] eq "exon"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 if(not exists $reporting{$fields[0]}){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 $reporting{$fields[0]} = [[$fields[3], $fields[4]]];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 push @{$reporting{$fields[0]}}, [$fields[3], $fields[4]];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 close(GTF);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 for my $c (keys %reporting){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 $reporting{$c} = [sort {$a->[0] <=> $b->[0]} @{$reporting{$c}}];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 my %regions;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 if(@ARGV == 9){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 print STDERR "Reading in target regions list...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 open(BED, $ARGV[8])
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 or die "Cannot open $ARGV[8] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 while(<BED>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 chomp;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 my @F = split /\t/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 next unless @F > 4;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 if(not exists $regions{$F[0]}){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 $regions{$F[0]} = [];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 push @{$regions{$F[0]}}, [$F[1],$F[2]]; # assume they are in order
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 open(BED, ">$ARGV[0]")
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 or die "Cannot open $ARGV[0] for writing: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 print BED "track name=\"NGSvsMicroarrayDiscordance\" columns=\"chr pos pos microGenotype/ngsGenotype NGSReadDepth\" comment=\"asterisk after genotype indicates likely microarray false positive based on NGS coverage, lack of caveats and no mismatched NGS data\" useScore=\"colour gradient\"\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 open(MICRO, $ARGV[3])
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 my $micro_count = 1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 do {$_ = <MICRO>} while /^#/; #header lines
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 $micro_count++ while <MICRO>; # get a line count for progress meter later
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 close(MICRO);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 $micro_count = int($micro_count/100);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 print STDERR "Reading in NGS calls...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 open(MICRO, $ARGV[3])
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 or die "Cannot open microarray calls $ARGV[3] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 open(HGVS, $ARGV[2])
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 or die "Cannot open HGVS genotype calls $ARGV[2] for reading: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 open(SUMMARY, ">$ARGV[1]")
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 or die "Cannot open $ARGV[1] for writing: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 my $bam_file = $ARGV[4];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 die "Input BAM file does not exist\n" if not -e $bam_file;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 die "Input BAM file is not readable\n" if not -r $bam_file;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 die "Input BAM file is empty\n" if -z $bam_file;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 <HGVS>; # header
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 my (%ngs_call, %ngs_caveats, %ngs_methods, %ngs_varreads, %ngs_depth, %key2pos, %ignore);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 while(<HGVS>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 chomp;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 my @F = split /\t/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 my $pos_key = "$F[4]:$F[5]";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 # ignore non-SNPs
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 if($F[2] =~ /c.[\-*]?(\d+)_(\d+)/){ #multi-base events ignored, as microarray probes are likely to be reporting wrong here anyway
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 my $modlength = $2-$1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 for my $offset (0..$modlength){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 if($F[3] eq "-"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 $ignore{"$F[4]:".($F[5]-$offset)} = 1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 $ignore{"$F[4]:".($F[5]+$offset)} = 1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 elsif($F[2] =~ /(?:del|ins|inv)/){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 next unless $F[2] =~ />([ACGT])$/; # only looking for SNPs, to do add MNPs
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 my $new_base = $1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 $new_base =~ tr/ACGT/TGCA/ if $F[3] eq "-"; # rev comp cDNA HGVS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 my $rs_key = $F[11]; # use both position and rsID as patches to hg19 have shifted some positions
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 my $ngs_call = ($F[6] =~ /homozygote/ ? $new_base : $F[10]) . $new_base;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 $ngs_call{$pos_key} = $ngs_call{$rs_key} = $ngs_call;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 $ngs_caveats{$pos_key} = $ngs_caveats{$rs_key} = $F[16] if $F[16] =~ /\S/;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 $ngs_methods{$pos_key} = $ngs_methods{$rs_key} = $F[18];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 $ngs_varreads{$pos_key} = $ngs_varreads{$rs_key} = $F[8];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 $ngs_depth{$pos_key} = $ngs_depth{$rs_key} = $F[9];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 $key2pos{$rs_key} = $key2pos{$pos_key} = [$F[4],$F[5]];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 close(HGVS);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 print STDERR "Comparing to microarray calls...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 my $num_ignored_snps = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 my $num_usable_snps = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 my $num_targeted_snps = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 my %discordant; # key => [zygosity, method, caveats, variant depth, total depth];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 my %cov_missing; # List all sites with unknown coverage
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 do {$_ = <MICRO>} while /^#/; #header lines
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 while(<MICRO>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 if(not $quiet){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 if($.%($micro_count*10) == 1){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 print STDERR "|";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 elsif($.%($micro_count*5) == 1){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 print STDERR ":";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 elsif($.%$micro_count == 1){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 print STDERR ".";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 chomp;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 my @F = split /\t/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 my $micro_call = $F[1];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 next if $micro_call eq "---";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 my $rsid = $F[2];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 my $chr = $F[3];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 next if $chr eq "---";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 my $pos = $F[4];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 if(exists $ignore{"chr$chr:$pos"}){ # multinucleotide events probably wrong on the microarray
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 $num_ignored_snps++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 $num_usable_snps++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 if(exists $reporting{"chr".$chr}){ # in a region reported in the annotation?
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 my $in_region = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 my $arrayref = $reporting{"chr".$chr};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 for(my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 my $interval = $arrayref->[$search_index];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 if($pos >= $interval->[0] and $pos <= $interval->[1]){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 $in_region = 1; last;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 last if $pos < $interval->[0];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 next unless $in_region;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 if(exists $regions{"chr".$chr}){ # in the exome target region?
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 my $in_region = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 my $arrayref = $regions{"chr".$chr};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 for (my $search_index = find_earliest_index($pos, $arrayref);$search_index <= $#{$arrayref}; $search_index++){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 my $interval = $arrayref->[$search_index];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 if($pos >= $interval->[0] and $pos <= $interval->[1]){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 $in_region = 1; last;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 last if $pos < $interval->[0];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 next unless $in_region;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 $num_targeted_snps++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 my $m = substr($micro_call, 0, 1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 my $m2 = substr($micro_call, 1, 1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 my $key;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 if(exists $ngs_call{$rsid}){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 $key = $rsid;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 elsif(exists $ngs_call{"chr$chr:$pos"}){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 $key = "chr$chr:$pos";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 if(not exists $seq{"chr$chr"}){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 warn "Skipping microarray call, no reference sequence was provided for chr$chr\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 $num_targeted_snps--;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 # 4 situations could be e.g. ref AA, but micro says AA or AG or GG or GC
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 my $ref_base = uc(substr($seq{"chr$chr"}, $pos-1, 1));
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 if($m eq $m2){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 if($micro_call eq $ref_base.$ref_base){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 # concordant homo calls as reference
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 # undefs will be filled in later en masse in a single call to samtools for efficiency
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 $discordant{$rsid} = ["micro ref, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$micro_call"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 # called homo var by microarray, but homo ref by NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 #print STDERR "chr$chr $pos micro $micro_call vs NGS homo ref $ref_base";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 $discordant{$rsid} = ["micro homo, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 else { #$m ne $m2
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 if($m eq $ref_base or $m2 eq $ref_base){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 # called het var by microarray, but homo ref by NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 $discordant{$rsid} = ["micro het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 $discordant{$rsid} = ["micro compound het, ngs ref", "", "", undef, undef, "chr$chr", $pos, "$micro_call/$ref_base$ref_base"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 $cov_missing{"chr$chr:$pos"} = [$rsid, $m, $m2];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 next if $ngs_depth{$key} <= $cov_cutoff;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 my $ngs_call = $ngs_call{$key};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 # if we get to here, there are both NGS and micro calls
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 if($micro_call eq $ngs_call or $micro_call eq reverse($ngs_call)){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 #print STDERR "Concordant NGS call for chr$chr:$pos\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 if($m eq $m2){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 $discordant{$key} = ["micro homo, ngs homo", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 $discordant{$key} = ["micro het, ngs het", $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 # discordant
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 #print STDERR "Discordant NGS call for chr$chr:$pos $micro_call vs. $ngs_call\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 # is it just a zygosity difference?
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 my $n = substr($ngs_call, 0, 1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 my $n2 = substr($ngs_call, 1, 1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 my $discordance;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 if($micro_call eq $n.$n or $micro_call eq $n2.$n2){ # micro is homo for variant, ngs is het
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276 $discordance = "micro homo, ngs het";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278 elsif($m.$m eq $ngs_call or $m2.$m2 eq $ngs_call){ # micro is het for variant, ngs is homo
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 $discordance = "micro het, ngs homo";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 elsif($m eq $m2){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 if($n eq $n2){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 $discordance = "diff micro homo, ngs homo";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 $discordance = "diff micro homo, ngs het";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 if($n eq $n2){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 $discordance = "diff micro het, ngs homo";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 $discordance = "diff micro het, ngs het";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 $discordant{$key} = [$discordance, $ngs_methods{$key}, $ngs_caveats{$key}, $ngs_varreads{$key}, $ngs_depth{$key}, "chr$chr", $pos, "$micro_call/$ngs_call"];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 close(MICRO);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 print STDERR "\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303 print STDERR "Retrieving reference call depth of coverage stats...\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 my $covbed = "$$.bed";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 open(TMPCOV, ">$covbed")
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 or die "Cannot open temporary BED file for samtools: $!\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 my $num_covmissing = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 for(sort keys %cov_missing){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 $num_covmissing++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 my ($chr, $pos) = split /:/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 print TMPCOV "$chr\t$pos\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313 close(TMPCOV);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315 my $cov_count = int($num_covmissing/100);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316 print STDERR " 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 #print STDERR "samtools mpileup -l $covbed -I $bam_file 2>/dev/null\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318 #<STDIN>;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 open(SAM, "samtools mpileup -l $covbed -I $bam_file 2>/dev/null |")
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320 or die "Cannot run samtools: $!\n";;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 while(<SAM>){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 #while($depth_data =~ /([^\n]+)/sg){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 if(not $quiet){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 if($.%($cov_count*10) == 0){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 print STDERR "|";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327 elsif($.%($cov_count*5) == 0){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 print STDERR ":";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 elsif($.%$cov_count == 0){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 print STDERR ".";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335 chomp;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 my @B = split /\t/, $_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 #my @B = split /\t/, $1;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 my ($rsid, $m, $m2) = @{$cov_missing{"$B[0]:$B[1]"}};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 my $depth = @B > 1 ? $B[3] : 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 #print STDERR "Depth is $depth for $B[0]:$B[1]\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341 my $read_calls = uc($B[4]);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 my %base_tot;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343 for my $read_call (split //, $read_calls){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344 $base_tot{$read_call}++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 $base_tot{$m} = 0 if not exists $base_tot{$m};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 $base_tot{$m2} = 0 if not exists $base_tot{$m2};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348 if($depth <= $cov_cutoff){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 $discordant{$rsid}->[0] = "no ngs call";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 $discordant{$rsid}->[1] = "insufficient coverage";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351 $discordant{$rsid}->[3] = $base_tot{$m};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 $discordant{$rsid}->[4] = $depth;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 elsif($discordant{$rsid}->[0] eq "micro ref, ngs ref" or $discordant{$rsid}->[0] eq "micro homo, ngs ref"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 # concordant homo calls as reference or called homo var by microarray, but homo ref by NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 $discordant{$rsid}->[3] = $base_tot{$m};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 $discordant{$rsid}->[4] = $depth;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 elsif($discordant{$rsid}->[0] eq "micro het, ngs ref" or $discordant{$rsid}->[0] eq "micro compound het, ngs ref"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 # called het var by microarray, but homo ref by NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361 $discordant{$rsid}->[3] = $base_tot{$m}.",".$base_tot{$m2};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 $discordant{$rsid}->[4] = $depth;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 print STDERR "Didn't know how to deal with $B[0]:$B[1] -> ", join(" ", @{$discordant{$rsid}}),"\n" unless $quiet;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 unlink $covbed;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 #Count false negatives
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 my $false_neg_homo_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 my $false_neg_het_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 my $missing_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 my $wrong_base_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 my $wrong_base_caveat_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376 my $discordant_caveat_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 my $tot_caveat_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378 my $false_homo_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379 my $false_het_count = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 my $micro_fdr_estimate = 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381 my %calls_by_method;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 my %concordance_by_method;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383 for my $key (keys %discordant){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 my $rec = $discordant{$key};
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 my $name = $rec->[7];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386 $tot_caveat_count++ if $rec->[2];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 if($rec->[0] eq "micro het, ngs ref" or $rec->[0] eq "micro homo, ngs ref"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 if($rec->[0] eq "micro homo, ngs ref"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 $false_neg_homo_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 $false_neg_het_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 next unless defined $rec and defined $rec->[3] and defined $rec->[4];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 $micro_fdr_estimate++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 $name .= "*";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 elsif($rec->[0] eq "micro compound het, ngs ref"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 $false_neg_het_count+=2;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 # probably false micro call if no caveats, high coverage, and no/one mismatches in NGS
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 next unless defined $rec and defined $rec->[3] and defined $rec->[4];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 if(not $rec->[2] and $rec->[4] >= 50 and $rec->[3] =~ /(\d{2,})/ and $1+1 >= $rec->[4]){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 $micro_fdr_estimate+=2;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 $name .= "*";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 elsif($rec->[0] eq "no ngs call"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 $missing_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 elsif($rec->[0] =~ /^diff/){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414 $wrong_base_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 if($rec->[3] =~ /\S/){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 $wrong_base_caveat_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
419 elsif($rec->[0] eq "micro het, ngs homo"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
420 $false_homo_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
421 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
422 elsif($rec->[0] eq "micro homo, ngs het"){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
423 $false_het_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
424 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
425 else{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
426 # calls concordant
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
427 $concordance_by_method{$rec->[1]}++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
428 next;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
429 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
430 if($rec->[4] > $cov_cutoff and $rec->[2] and $name !~ /\*$/){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
431 $discordant_caveat_count++;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
432 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
433 my $chr = $rec->[5];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
434 my $pos = $rec->[6];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
435 my $score = $rec->[4];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
436 $score = 1000 if $score > 1000;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
437 print BED "$chr\t$pos\t$pos\t$name\t$score\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
438 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
439
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
440 print SUMMARY "# NGS genotypes in $ARGV[2] vs. SNP microarray in $ARGV[3], minimum NGS coverage of ",$cov_cutoff+1, "\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
441 print SUMMARY "#Columns: Measure\tCount\tPercentage\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
442 print SUMMARY "Total ignored SNP microarray calls due to NGS putative indels or MNPs\t$num_ignored_snps\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
443 print SUMMARY "Total usable SNP microarray calls\t$num_usable_snps\n";
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
444 printf SUMMARY "Total targeted SNP microarray calls (based on target file %s)\t%d\t%.2f\n", $ARGV[5],$num_targeted_snps,$num_targeted_snps/$num_usable_snps*100 if keys %regions;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
445 printf SUMMARY "Targeted SNPs with insufficient NGS coverage (<=$cov_cutoff)\t%d\t%.2f\n", $missing_count, $missing_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
446 $num_targeted_snps -= $missing_count;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
447 my $tot_bad = $wrong_base_count+$false_neg_homo_count+$false_neg_het_count+$false_homo_count+$false_het_count;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
448 printf SUMMARY "Total discordance\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
449 $tot_bad -= $discordant_caveat_count+$micro_fdr_estimate;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
450 printf SUMMARY "Significant discordance (excludes NGS calls with caveats, microarray het FDR)\t%d\t%.2f\n", $tot_bad, $tot_bad/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
451 printf SUMMARY "Caveats discordant\t%d\t%.2f\n", $discordant_caveat_count, $tot_caveat_count == 0 ? 0 : $discordant_caveat_count/$tot_caveat_count*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
452 printf SUMMARY "Incorrect NGS base called\t%d\t%.2f\n", $wrong_base_count, $wrong_base_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
453 printf SUMMARY "Incorrect NGS base called, subset with caveats\t%d\t%.2f\n", $wrong_base_caveat_count, $wrong_base_count == 0 ? 0 : $wrong_base_caveat_count/$wrong_base_count*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
454 printf SUMMARY "False negative NGS homo\t%d\t%.2f\n", $false_neg_homo_count, $false_neg_homo_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
455 printf SUMMARY "False negative NGS het\t%d\t%.2f\n", $false_neg_het_count, $false_neg_het_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
456 printf SUMMARY "Microarray est. FDR het\t%d\t%.2f\n", $micro_fdr_estimate, $micro_fdr_estimate/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
457 printf SUMMARY "Het called NGS homo\t%d\t%.2f\n", $false_homo_count, $false_homo_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
458 printf SUMMARY "Homo called NGS het\t%d\t%.2f\n", $false_het_count, $false_het_count/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
459 for(sort keys %concordance_by_method){
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
460 printf SUMMARY "%s true positives\t%d\t%.2f\n", (length($_) ? $_ : "Reference"), $concordance_by_method{$_}, $concordance_by_method{$_}/$num_targeted_snps*100;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
461 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
462 sub find_earliest_index{
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
463 # employs a binary search to find the smallest index that must be the starting point of a search of [start,end] elements sorted in an array by start
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
464 my ($query, $array) = @_;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
465
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
466 return 0 if $query < $array->[0]->[0];
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
467
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
468 my ($left, $right, $prevCenter) = (0, $#$array, -1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
469
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
470 while(1) {
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
471 my $center = int (($left + $right)/2);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
472
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
473 my $cmp = $query <=> $array->[$center]->[0] || ($center == 0 || $query != $array->[$center-1]->[0] ? 0 : -1);
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
474
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
475 return $center if $cmp == 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
476 if ($center == $prevCenter) {
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
477 return $left;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
478 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
479 $right = $center if $cmp < 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
480 $left = $center if $cmp > 0;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
481 $prevCenter = $center;
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
482 }
882d119ede1f initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
483 }