comparison DC_Genotyper.pl @ 17:3ba482a2dd0e draft

Uploaded
author geert-vandeweyer
date Mon, 29 Sep 2014 04:00:10 -0400
parents 7e4a9ee69f0b
children 8938f339ed37
comparison
equal deleted inserted replaced
16:7e4a9ee69f0b 17:3ba482a2dd0e
30 # s: dbsnp file 30 # s: dbsnp file
31 # m: minimal coverage (defaults 400x) 31 # m: minimal coverage (defaults 400x)
32 # P: ploidy 32 # P: ploidy
33 # a: outfile for allele distributions 33 # a: outfile for allele distributions
34 # v: vcf file output. 34 # v: vcf file output.
35 getopts('t:b:R:p:s:m:P:v:a:', \%opts) ; 35 # d: distribution plots pdf file
36 getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ;
36 37
37 ## variables 38 ## variables
38 my $twobit :shared; 39 my $twobit :shared;
39 my $igvgenome :shared; 40 my $igvgenome :shared;
40 if (!defined($opts{'R'})) { 41 if (!defined($opts{'R'})) {
168 169
169 ## load dbSNP inside target regions into shared structure. 170 ## load dbSNP inside target regions into shared structure.
170 ########################################################## 171 ##########################################################
171 my %dbsnp :shared; 172 my %dbsnp :shared;
172 if ($snpfile ne '') { 173 if ($snpfile ne '') {
174 ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package.
175 # as a work-around, use tabix, but this is slower.
173 #my $bcf = `which bcftools`; 176 #my $bcf = `which bcftools`;
174 #chomp($bcf); 177 #chomp($bcf);
175 #if ($bcf ne '') { 178 #if ($bcf ne '') {
176 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; 179 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt";
177 # system("$command"); 180 # system("$command");
194 if ($lc eq '0') { 197 if ($lc eq '0') {
195 open SNP, ">$wd/dbsnp.txt"; 198 open SNP, ">$wd/dbsnp.txt";
196 ## dummy line on chr zero 199 ## dummy line on chr zero
197 print SNP "chr0\t1\t.\tA\tT\n"; 200 print SNP "chr0\t1\t.\tA\tT\n";
198 close SNP; 201 close SNP;
202 }
199 } 203 }
200 else { 204 else {
201 open SNP, ">$wd/dbsnp.txt"; 205 open SNP, ">$wd/dbsnp.txt";
202 ## dummy line on chr zero 206 ## dummy line on chr zero to prevent R issues on empty file.
203 print SNP "chr0\t1\t.\tA\tT\n"; 207 print SNP "chr0\t1\t.\tA\tT\n";
204 close SNP; 208 close SNP;
205 } 209 }
206 210
207 ## now process the bam file. 211 ## now process the bam file.
208 mkdir "$wd/WIGS/"; 212 mkdir "$wd/WIGS/";
209 my $bam :shared; 213 my $bam :shared;
210 $bam = $opts{'b'}; 214 $bam = $opts{'b'};
211 my $bai = $bam.".bai"; 215 # igvtools cannot handle the .dat extension, so make symlink
212 if (!-e $bai) { 216 system("ln -s '$bam' '$wd/input.bam'");
213 print "BAI ($bai) missing for $bam : creating\n"; 217 system("cd $wd && samtools index input.bam");
214 system("samtools index $bam");
215 }
216 218
217 for (my $i = 1; $i <= $opts{'p'}; $i++) { 219 for (my $i = 1; $i <= $opts{'p'}; $i++) {
218 ${'thr'.$i} = threads->create('CountAlleles'); 220 ${'thr'.$i} = threads->create('CountAlleles');
219 } 221 }
220 ## end the threads. 222 ## end the threads.
254 256
255 ## group distributions into one file 257 ## group distributions into one file
256 #################################### 258 ####################################
257 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); 259 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5);
258 open OUT, ">".$opts{'a'}; 260 open OUT, ">".$opts{'a'};
259 print OUT "allele\tavg\tsd\n"; 261 print OUT "allele\tavg\tsd\tN\n";
260 foreach(keys(%map)) { 262 foreach(keys(%map)) {
261 my $r = $_; 263 my $r = $_;
262 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; 264 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt";
263 open IN, "$f"; 265 open IN, "$f";
264 my $a = <IN>; 266 my $a = <IN>;
265 chomp($a); 267 chomp($a);
266 #$dists{$r}{'avg'} = $a;
267 my $s = <IN>; 268 my $s = <IN>;
268 chomp($s); 269 chomp($s);
269 #$dists{$r}{'sd'} = $s; 270 my $n = <IN>;
271 chomp($n);
270 close IN; 272 close IN;
271 print OUT "$r\t$a\t$s\n"; 273 print OUT "$r\t$a\t$s\t$n\n";
272 foreach(keys(%map)) { 274 foreach(keys(%map)) {
273 if ($_ eq $r) { 275 if ($_ eq $r) {
274 next; 276 next;
275 } 277 }
276 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; 278 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt";
277 open IN, "$f"; 279 open IN, "$f";
278 my $a = <IN>; 280 my $a = <IN>;
279 chomp($a); 281 chomp($a);
280 my $s = <IN>; 282 my $s = <IN>;
281 chomp($s); 283 chomp($s);
284 my $n = <IN>;
285 chomp($n);
282 close IN; 286 close IN;
283 print OUT "$r-$_\t$a\t$s\n"; 287 print OUT "$r-$_\t$a\t$s\t$n\n";
284 } 288 }
285 } 289 }
286 close OUT; 290 close OUT;
291
292 ## make pdf with distribution plots
293 ###################################
294 open OUT, ">$wd/MakePlots.R";
295 print OUT "\n";
296 print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n";
297 print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n";
298 print OUT "par(mfrow=c(2, 2))\n";
299 print OUT "for (i in 1:nrow(dists)) {\n";
300 print OUT " if (dists[i,'avg'] > 0.5) {\n";
301 print OUT " x <- seq(0.85,1,length=1000))\n";
302 print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd']))\n";
303 print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
304 print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red'))\n";
305 print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
306 print OUT " } else {)\n";
307 print OUT " x <- seq(0,0.15,length=1000))\n";
308 print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd']))\n";
309 print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
310 print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red'))\n";
311 print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
312 print OUT " })\n";
313 print OUT "})\n";
314 print OUT "dev.off())\n";
315 close OUT;
316 system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1");
287 317
288 ## CALL SNPs 318 ## CALL SNPs
289 ############ 319 ############
290 # create the R script. 320 # create the R script.
291 open R, ">$wd/CallSNPs.R"; 321 open R, ">$wd/CallSNPs.R";
506 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; 536 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n";
507 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; 537 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n";
508 print OUT "nr <- length(data)\n"; 538 print OUT "nr <- length(data)\n";
509 print OUT "avg <- mean(data)\n"; 539 print OUT "avg <- mean(data)\n";
510 print OUT "sdd <- sd(data)\n"; 540 print OUT "sdd <- sd(data)\n";
511 #print OUT "if (avg > 0.5) {\n"; 541 print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
512 #print OUT " x <- seq(0.8,1,length=1000)\n";
513 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n";
514 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
515 #print OUT " abline(v=(avg-3*sdd),col='red')\n";
516 #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
517 #print OUT "} else {\n";
518 #print OUT " x <- seq(0,0.3,length=1000)\n";
519 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n";
520 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
521 #print OUT " abline(v=(avg+3*sdd),col='red')\n";
522 #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
523 #print OUT "}\n";
524 #print OUT "dev.off()\n";
525 print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
526 close OUT; 542 close OUT;
527 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); 543 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");
528 } 544 }
529 } 545 }
530 546