11
|
1 #!/usr/bin/perl
|
|
2 #############################
|
|
3 ## DEEP COVERAGE GENOTYPER ##
|
|
4 #############################
|
|
5 # version : 0.0.1
|
|
6 # Principle:
|
|
7 # 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !!
|
|
8 # 2. remove known dbsnp positions (bcf file)
|
|
9 # 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions.
|
|
10 # 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction)
|
|
11 # 5. For abberant positions, check each alternate allele to see if it passes the background signal.
|
|
12 # 6. Generate VCF file.
|
|
13
|
|
14
|
|
15 ##################
|
|
16 ## LOAD MODULES ##
|
|
17 ##################
|
|
18 use threads;
|
|
19 use threads::shared;
|
|
20 use Thread::Queue;
|
|
21 use Getopt::Std;
|
|
22
|
|
23 ####################
|
|
24 ## get paramaters ##
|
|
25 ####################
|
|
26 # t: target file
|
|
27 # b: bam file
|
12
|
28 # R: reference genome files for twobit and IGV.
|
11
|
29 # p: number of threads.
|
|
30 # s: dbsnp file
|
|
31 # m: minimal coverage (defaults 400x)
|
|
32 # P: ploidy
|
|
33 # a: outfile for allele distributions
|
|
34 # v: vcf file output.
|
17
|
35 # d: distribution plots pdf file
|
|
36 getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ;
|
11
|
37
|
|
38 ## variables
|
|
39 my $twobit :shared;
|
|
40 my $igvgenome :shared;
|
|
41 if (!defined($opts{'R'})) {
|
|
42 die("Reference Genomes not specified\n");
|
|
43 }
|
12
|
44 my @refgenomes = split(",",$opts{'R'});
|
11
|
45 if (!-e $refgenomes[0]) {
|
|
46 die("'$refgenomes[0]' is not a valid file path.");
|
|
47 }
|
|
48 else {
|
|
49 $twobit = $refgenomes[0];
|
|
50 }
|
|
51 if (!-e $refgenomes[1]) {
|
|
52 die("'$refgenomes[1]' is not a valid file path.");
|
|
53 }
|
|
54 else {
|
|
55 $igvgenome = $refgenomes[1];
|
|
56 }
|
|
57
|
|
58
|
|
59 my $mincov :shared;
|
|
60 $mincov = 320;
|
|
61 if (defined($opts{'m'})) {
|
|
62 $mincov = $opts{'m'};
|
|
63 }
|
|
64
|
|
65 my $ploidy :shared;
|
|
66 if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) {
|
|
67 $ploidy = $opts{'P'};
|
|
68 }
|
|
69 else {
|
|
70 die("Ploidy (-P) was not specified or not an integer\n");
|
|
71 }
|
|
72
|
|
73
|
|
74 if (defined($opts{'v'})) {
|
|
75 $outfile = $opts{'v'};
|
|
76 }
|
|
77 else {
|
|
78 die("No output vcf-file specified.\n");
|
|
79 }
|
|
80 if (!defined($opts{'a'})) {
|
|
81 die("No output file specified for distribution details\n");
|
|
82 }
|
|
83 ## create working dir.
|
|
84 my $rand = int(rand(10000));
|
|
85 while (-d "/tmp/DC_Genotyper_$rand") {
|
|
86 $rand = int(rand(10000));
|
|
87 }
|
|
88 my $wd :shared;
|
|
89 $wd = "/tmp/DC_Genotyper_$rand";
|
|
90 system("mkdir '$wd'");
|
|
91
|
|
92
|
|
93 my $snpfile :shared;
|
|
94 my $hassnp :shared;
|
|
95 $hassnp = 'NoDbSNP';
|
|
96 $snpfile = '';
|
|
97 if (defined($opts{'s'})) {
|
|
98 $snpfile = $opts{'s'};
|
|
99 if (!-e $snpfile) {
|
|
100 die("'$snpfile' is not a valid file path.");
|
|
101 }
|
|
102
|
|
103 my $mime = `file $snpfile`;
|
|
104 if ($mime !~ m/compressed/) {
|
|
105 print "$snpfile is not in compressed format. compressing & indexing the file now.\n";
|
|
106 #print "... this takes a while\n";
|
|
107 system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz");
|
|
108 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
|
|
109 $snpfile = "$wd/dbSNP.vcf.bgz";
|
|
110 }
|
|
111 elsif (!-e "$snpfile.tbi") {
|
|
112 print "tabix index file is missing for '$snpfile'. creating now.\n";
|
|
113 ## check if I can write it out for future use
|
|
114 $snpfile =~ m/(.*)([^\/]+)$/;
|
|
115 my $d = $1;
|
|
116 if (-w $d) {
|
|
117 open OUT, ">$d/lock";
|
|
118 flock(OUT,2);
|
|
119 system("cd $d && tabix -p vcf $snpfile");
|
|
120 close OUT;
|
|
121 system("rm $d/lock");
|
|
122 }
|
|
123 else {
|
|
124 system("cp $snpfile /$wd/dbSNP.vcf.bgz");
|
|
125 system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz");
|
|
126 $snpfile = "$wd/dbSNP.vcf.bgz";
|
|
127 }
|
|
128 }
|
|
129 $hassnp = 'WithDbSNP';
|
|
130 }
|
|
131
|
|
132
|
|
133 ## 1. Get FASTA and prepare output hashes:
|
|
134 my $targets_one = Thread::Queue->new();
|
|
135 my $targets_two = Thread::Queue->new();
|
|
136 my $targets_three = Thread::Queue->new();
|
|
137 open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading");
|
|
138 if (-d "$wd/Fasta/") {
|
|
139 system("rm $wd/Fasta/*");
|
|
140 }
|
|
141 else {
|
|
142 system("mkdir $wd/Fasta");
|
|
143 }
|
|
144 ## create the threads.
|
|
145 for (my $i = 1; $i<= $opts{'p'}; $i++) {
|
|
146 ${'thr'.$i} = threads->create('FetchFasta');
|
|
147 }
|
|
148
|
|
149 ## enqueue the targets.
|
|
150 my %thash;
|
|
151 while (<IN>) {
|
|
152 chomp;
|
|
153 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_);
|
|
154 $targets_one->enqueue($_);
|
|
155 $targets_two->enqueue($_);
|
|
156 $targets_three->enqueue($_);
|
|
157 $thash{$chr}{$start} = $stop;
|
|
158 }
|
|
159 close IN;
|
|
160
|
|
161 ## end the threads.
|
|
162 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
163 $targets_one->enqueue(undef);
|
|
164 }
|
|
165
|
|
166 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
167 ${'thr'.$i}->join();
|
|
168 }
|
|
169
|
|
170 ## load dbSNP inside target regions into shared structure.
|
|
171 ##########################################################
|
|
172 my %dbsnp :shared;
|
|
173 if ($snpfile ne '') {
|
17
|
174 ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package.
|
|
175 # as a work-around, use tabix, but this is slower.
|
16
|
176 #my $bcf = `which bcftools`;
|
|
177 #chomp($bcf);
|
|
178 #if ($bcf ne '') {
|
|
179 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt";
|
|
180 # system("$command");
|
|
181 # open IN, "$wd/dbsnp.txt";
|
|
182 # while (<IN>) {
|
|
183 # chomp;
|
|
184 # my @p = split(/\t/,$_);
|
|
185 # $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4];
|
|
186 # }
|
|
187 # close IN;
|
|
188 #}
|
|
189 #else {
|
|
190 # print "WARNING: BCFtools is not in the path. Skipping snp handling.\n";
|
|
191 # $snpfile = '';
|
|
192 # system("touch $wd/dbsnp.txt");
|
|
193 #}
|
|
194 system("tabix $snpfile -B $opts{'t'} | cut -f 1-5 > $wd/dbsnp.txt");
|
|
195 my $lc = `cat $wd/dbsnp.txt | wc -l`;
|
|
196 chomp($lc);
|
|
197 if ($lc eq '0') {
|
|
198 open SNP, ">$wd/dbsnp.txt";
|
|
199 ## dummy line on chr zero
|
|
200 print SNP "chr0\t1\t.\tA\tT\n";
|
|
201 close SNP;
|
17
|
202 }
|
11
|
203 }
|
|
204 else {
|
16
|
205 open SNP, ">$wd/dbsnp.txt";
|
17
|
206 ## dummy line on chr zero to prevent R issues on empty file.
|
16
|
207 print SNP "chr0\t1\t.\tA\tT\n";
|
|
208 close SNP;
|
11
|
209 }
|
|
210
|
|
211 ## now process the bam file.
|
|
212 mkdir "$wd/WIGS/";
|
|
213 my $bam :shared;
|
|
214 $bam = $opts{'b'};
|
17
|
215 # igvtools cannot handle the .dat extension, so make symlink
|
|
216 system("ln -s '$bam' '$wd/input.bam'");
|
|
217 system("cd $wd && samtools index input.bam");
|
11
|
218
|
|
219 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
220 ${'thr'.$i} = threads->create('CountAlleles');
|
|
221 }
|
|
222 ## end the threads.
|
|
223 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
224 $targets_two->enqueue(undef);
|
|
225 }
|
|
226
|
|
227 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
228 ${'thr'.$i}->join();
|
|
229 }
|
|
230
|
|
231 ## generate the distributions.
|
|
232 ##############################
|
|
233 my $alleles = Thread::Queue->new();
|
|
234 my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
|
|
235 foreach(keys(%all)) {
|
|
236 $alleles->enqueue($_);
|
|
237 my $a = $_;
|
|
238 foreach(keys(%all)) {
|
|
239 if ($_ eq $a) {
|
|
240 next;
|
|
241 }
|
|
242 $alleles->enqueue($a.'-'.$_);
|
|
243 }
|
|
244 }
|
|
245 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
246 ${'thr'.$i} = threads->create('GetDistribution');
|
|
247 }
|
|
248 ## end the threads.
|
|
249 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
250 $alleles->enqueue(undef);
|
|
251 }
|
|
252
|
|
253 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
254 ${'thr'.$i}->join();
|
|
255 }
|
|
256
|
|
257 ## group distributions into one file
|
|
258 ####################################
|
|
259 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5);
|
|
260 open OUT, ">".$opts{'a'};
|
17
|
261 print OUT "allele\tavg\tsd\tN\n";
|
11
|
262 foreach(keys(%map)) {
|
|
263 my $r = $_;
|
|
264 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt";
|
|
265 open IN, "$f";
|
|
266 my $a = <IN>;
|
|
267 chomp($a);
|
|
268 my $s = <IN>;
|
|
269 chomp($s);
|
17
|
270 my $n = <IN>;
|
|
271 chomp($n);
|
11
|
272 close IN;
|
17
|
273 print OUT "$r\t$a\t$s\t$n\n";
|
11
|
274 foreach(keys(%map)) {
|
|
275 if ($_ eq $r) {
|
|
276 next;
|
|
277 }
|
|
278 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt";
|
|
279 open IN, "$f";
|
|
280 my $a = <IN>;
|
|
281 chomp($a);
|
|
282 my $s = <IN>;
|
|
283 chomp($s);
|
17
|
284 my $n = <IN>;
|
|
285 chomp($n);
|
11
|
286 close IN;
|
17
|
287 print OUT "$r-$_\t$a\t$s\t$n\n";
|
11
|
288 }
|
|
289 }
|
|
290 close OUT;
|
|
291
|
17
|
292 ## make pdf with distribution plots
|
|
293 ###################################
|
|
294 open OUT, ">$wd/MakePlots.R";
|
|
295 print OUT "\n";
|
|
296 print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n";
|
|
297 print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n";
|
|
298 print OUT "par(mfrow=c(2, 2))\n";
|
|
299 print OUT "for (i in 1:nrow(dists)) {\n";
|
|
300 print OUT " if (dists[i,'avg'] > 0.5) {\n";
|
|
301 print OUT " x <- seq(0.85,1,length=1000))\n";
|
|
302 print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd']))\n";
|
|
303 print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
|
|
304 print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red'))\n";
|
|
305 print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
|
|
306 print OUT " } else {)\n";
|
|
307 print OUT " x <- seq(0,0.15,length=1000))\n";
|
|
308 print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd']))\n";
|
|
309 print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
|
|
310 print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red'))\n";
|
|
311 print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
|
|
312 print OUT " })\n";
|
|
313 print OUT "})\n";
|
|
314 print OUT "dev.off())\n";
|
|
315 close OUT;
|
|
316 system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1");
|
|
317
|
11
|
318 ## CALL SNPs
|
|
319 ############
|
|
320 # create the R script.
|
|
321 open R, ">$wd/CallSNPs.R";
|
16
|
322 print R "\n";
|
11
|
323 print R "args <- commandArgs(trailingOnly = TRUE)\n";
|
16
|
324 print R "counts <- read.table(file = args[1],header = FALSE, as.is = TRUE)\n";
|
11
|
325 print R "ploidy <- as.integer(args[3])\n";
|
|
326 print R "chr <- args[2]\n";
|
|
327 print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n";
|
16
|
328 print R "colnames(snps) <- c('chr','pos','id','ref','alt')\n";
|
11
|
329 print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n";
|
16
|
330 print R "dists <- read.table(file='".$opts{'a'}."',header=TRUE,as.is=TRUE)\n";
|
11
|
331 print R 'rownames(dists) = dists$allele'."\n";
|
|
332 print R 'dists <- dists[,-1]'."\n";
|
|
333 print R "vcf <- c()\n";
|
|
334 print R "lower <- c()\n";
|
|
335 print R "higher <- c()\n";
|
|
336 print R "for (i in 1:(ploidy)) {\n";
|
|
337 print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n";
|
|
338 print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n";
|
|
339 print R "}\n";
|
|
340 print R "for (i in 1:nrow(counts)) {\n";
|
|
341 print R " if (counts[i,'TotalDepth'] == 0) next\n";
|
|
342 print R " # significantly different from reference?\n";
|
|
343 print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n";
|
|
344 print R " if (abs(z) > 3) {\n";
|
|
345 print R " # test all alterate alleles to see which one is significant.\n";
|
|
346 print R " for (j in c('A','C','G','T')) {\n";
|
|
347 print R " if (j == counts[i,'ref']) next\n";
|
|
348 print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n";
|
|
349 print R " if (abs(z) > 3){\n";
|
|
350 print R " filter <- 'PASS'\n";
|
|
351 print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n";
|
|
352 print R " if (phred > 9999) phred <- 9999\n";
|
|
353 print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n";
|
|
354 print R " for (k in 1:ploidy) {\n";
|
|
355 print R " if (frac >= lower[k] && frac < higher[k]) {\n";
|
|
356 print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
|
|
357 print R " af <- k/ploidy\n";
|
|
358 print R " break\n";
|
|
359 print R " }\n";
|
|
360 print R " }\n";
|
|
361 print R " if (frac < lower[1]) {\n";
|
|
362 print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n";
|
|
363 print R " af <- 1/ploidy\n";
|
|
364 print R " filter <- 'LowFraction'\n";
|
|
365 print R " }\n";
|
|
366 print R " if (counts[i,'TotalDepth'] < $mincov) {\n";
|
|
367 print R " filter <- 'LowCoverage'\n";
|
|
368 print R " }\n";
|
|
369 print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n";
|
|
370 print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n";
|
|
371 print R " id <- '.'\n";
|
|
372 print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n";
|
|
373 print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n";
|
|
374 print R " }\n";
|
|
375 print R " }\n";
|
|
376 print R " }\n";
|
|
377 print R "}\n";
|
|
378 print R "if (length(vcf) > 0) {\n";
|
|
379 print R " write(file=args[4],paste(vcf,sep='\\n'))\n";
|
|
380 print R "}\n";
|
|
381 close R;
|
|
382 system("mkdir $wd/VCF/");
|
|
383 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
384 ${'thr'.$i} = threads->create('CallSNPs');
|
|
385 }
|
|
386 ## end the threads.
|
|
387 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
388 $targets_three->enqueue(undef);
|
|
389 }
|
|
390
|
|
391 for (my $i = 1; $i <= $opts{'p'}; $i++) {
|
|
392 ${'thr'.$i}->join();
|
|
393 }
|
|
394
|
|
395 ## BUILD FINAL VCF
|
|
396 open OUT, ">$outfile";
|
|
397 print OUT "##fileformat=VCFv4.1\n";
|
|
398 print OUT "##source=High_Ploidy_Genotyper_v.0.1\n";
|
|
399 print OUT "##genome_reference=$twobit\n";
|
|
400 if ($snpfile ne '') {
|
|
401 print OUT "##SNP_file=$snpfile\n";
|
|
402 }
|
|
403 foreach(keys(%thash)) {
|
|
404 print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n";
|
|
405 }
|
|
406 print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n";
|
|
407 print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n";
|
|
408 print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n";
|
|
409 print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n";
|
|
410 print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n";
|
|
411 print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n";
|
|
412 print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n";
|
|
413 print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n";
|
|
414 close OUT;
|
|
415 @i = ( 1 .. 22,'X','Y','M' );
|
|
416 foreach(@i) {
|
|
417 my $chr = "chr$_";
|
|
418 foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) {
|
|
419 my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf";
|
|
420 if (-e $v) {
|
|
421 system("cat '$v' >> '$outfile'");
|
|
422 }
|
|
423 }
|
|
424 }
|
|
425
|
|
426 ## clean up
|
|
427 system("rm -Rf '$wd'");
|
|
428
|
|
429 sub FetchFasta {
|
|
430 while(defined(my $line = $targets_one->dequeue())) {
|
|
431 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
|
|
432 # 2bit is zero based, non-including => decrease start by one
|
|
433 $startposition = $start - 1;
|
|
434 my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta";
|
|
435 system($command);
|
|
436 }
|
|
437 }
|
|
438
|
|
439 sub CountAlleles {
|
|
440 # local version of hashes
|
|
441 my $snp = \%dbsnp;
|
|
442 my %counts;
|
|
443 $counts{'A'} = '';
|
|
444 $counts{'C'} = '';
|
|
445 $counts{'G'} = '';
|
|
446 $counts{'T'} = '';
|
|
447 my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4);
|
|
448 my %options;
|
|
449 foreach(keys(%map)) {
|
|
450 my $r = $_;
|
|
451 foreach(keys(%map)) {
|
|
452 if ($_ eq $r) {
|
|
453 next;
|
|
454 }
|
|
455 $options{$r.'-'.$_} = '';
|
|
456 }
|
|
457 }
|
|
458 while (defined(my $line = $targets_two->dequeue())) {
|
|
459 $out = '';
|
|
460 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
|
|
461 ## get reference alleles
|
|
462 my %ref_alleles;
|
|
463 open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta";
|
|
464 my $head = <FASTA>;
|
|
465 my $seq = '';
|
|
466 while (<FASTA>) {
|
|
467 chomp;
|
|
468 $seq .= $_;
|
|
469 }
|
|
470 close FASTA;
|
|
471 # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration.
|
|
472 for (my $pos = 0; $pos < length($seq); $pos++) {
|
|
473 $ref_alleles{($pos+$start)} = substr($seq,$pos,1);
|
|
474 }
|
|
475 ## get counts.
|
|
476 my $target = "$chr:$start-$stop";
|
|
477 my $command = "igvtools count -w 1 --bases --query '$target' '$bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1";
|
|
478 system($command);
|
|
479 open WIG, "$wd/WIGS/$chr-$start-$stop.wig";
|
|
480 my $h = <WIG>;
|
|
481 $h = <WIG>;
|
|
482 $h = <WIG>;
|
|
483 my $target_counts = '';
|
|
484 while (<WIG>) {
|
|
485 chomp;
|
|
486 #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_);
|
|
487 my @p = split(/\t/,$_);
|
|
488 my $s = $p[1] + $p[2] + $p[3] + $p[4];
|
|
489 $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n";
|
|
490 ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty).
|
|
491 if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) {
|
|
492 ## for model of 'non-reference'
|
|
493 my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s;
|
|
494 $counts{$ref_alleles{$p[0]}} .= $frac.',';
|
|
495 $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n";
|
|
496 ## for each of the options background models
|
|
497 foreach(keys(%map)) {
|
|
498 if ($_ eq $ref_alleles{$p[0]}) {
|
|
499 next;
|
|
500 }
|
|
501 $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .',';
|
|
502 }
|
|
503
|
|
504 }
|
|
505 }
|
|
506 close WIG;
|
|
507 open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt";
|
|
508 flock(OUT, 2);
|
|
509 print OUT $out;
|
|
510 close OUT;
|
|
511 open OUT, ">$wd/WIGS/$chr.$start-$stop.txt";
|
|
512 print OUT $target_counts;
|
|
513 close OUT;
|
|
514
|
|
515 }
|
|
516 foreach(keys(%counts)) {
|
|
517 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
|
|
518 flock(OUT,2);
|
|
519 print OUT $counts{$_};
|
|
520 close OUT;
|
|
521 }
|
|
522 foreach(keys(%options)) {
|
|
523 open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt";
|
|
524 flock(OUT,2);
|
|
525 print OUT $options{$_};
|
|
526 close OUT;
|
|
527 }
|
|
528 }
|
|
529
|
|
530 sub GetDistribution {
|
|
531 while (defined(my $allele = $alleles->dequeue())) {
|
|
532 system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'");
|
|
533 open OUT, ">$wd/GetDistribution.$allele.R";
|
16
|
534 print OUT "\n";
|
11
|
535 print OUT "nt <- '$allele'\n";
|
|
536 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n";
|
|
537 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n";
|
|
538 print OUT "nr <- length(data)\n";
|
|
539 print OUT "avg <- mean(data)\n";
|
|
540 print OUT "sdd <- sd(data)\n";
|
17
|
541 print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
|
11
|
542 close OUT;
|
|
543 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");
|
|
544 }
|
|
545 }
|
|
546
|
|
547
|
|
548 sub CallSNPs {
|
|
549 while (defined(my $line = $targets_three->dequeue())) {
|
|
550 # split.
|
|
551 my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line);
|
|
552 my $file = "$wd/WIGS/$chr.$start-$stop.txt";
|
|
553 my $ofile = "$wd/VCF/$chr.$start-$stop.vcf";
|
|
554 system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'");
|
|
555 }
|
|
556
|
|
557 }
|
|
558
|