Mercurial > repos > geert-vandeweyer > dc_genotyper
comparison DC_Genotyper.pl @ 17:3ba482a2dd0e draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Mon, 29 Sep 2014 04:00:10 -0400 |
parents | 7e4a9ee69f0b |
children | 8938f339ed37 |
comparison
equal
deleted
inserted
replaced
16:7e4a9ee69f0b | 17:3ba482a2dd0e |
---|---|
30 # s: dbsnp file | 30 # s: dbsnp file |
31 # m: minimal coverage (defaults 400x) | 31 # m: minimal coverage (defaults 400x) |
32 # P: ploidy | 32 # P: ploidy |
33 # a: outfile for allele distributions | 33 # a: outfile for allele distributions |
34 # v: vcf file output. | 34 # v: vcf file output. |
35 getopts('t:b:R:p:s:m:P:v:a:', \%opts) ; | 35 # d: distribution plots pdf file |
36 getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ; | |
36 | 37 |
37 ## variables | 38 ## variables |
38 my $twobit :shared; | 39 my $twobit :shared; |
39 my $igvgenome :shared; | 40 my $igvgenome :shared; |
40 if (!defined($opts{'R'})) { | 41 if (!defined($opts{'R'})) { |
168 | 169 |
169 ## load dbSNP inside target regions into shared structure. | 170 ## load dbSNP inside target regions into shared structure. |
170 ########################################################## | 171 ########################################################## |
171 my %dbsnp :shared; | 172 my %dbsnp :shared; |
172 if ($snpfile ne '') { | 173 if ($snpfile ne '') { |
174 ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package. | |
175 # as a work-around, use tabix, but this is slower. | |
173 #my $bcf = `which bcftools`; | 176 #my $bcf = `which bcftools`; |
174 #chomp($bcf); | 177 #chomp($bcf); |
175 #if ($bcf ne '') { | 178 #if ($bcf ne '') { |
176 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; | 179 # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; |
177 # system("$command"); | 180 # system("$command"); |
194 if ($lc eq '0') { | 197 if ($lc eq '0') { |
195 open SNP, ">$wd/dbsnp.txt"; | 198 open SNP, ">$wd/dbsnp.txt"; |
196 ## dummy line on chr zero | 199 ## dummy line on chr zero |
197 print SNP "chr0\t1\t.\tA\tT\n"; | 200 print SNP "chr0\t1\t.\tA\tT\n"; |
198 close SNP; | 201 close SNP; |
202 } | |
199 } | 203 } |
200 else { | 204 else { |
201 open SNP, ">$wd/dbsnp.txt"; | 205 open SNP, ">$wd/dbsnp.txt"; |
202 ## dummy line on chr zero | 206 ## dummy line on chr zero to prevent R issues on empty file. |
203 print SNP "chr0\t1\t.\tA\tT\n"; | 207 print SNP "chr0\t1\t.\tA\tT\n"; |
204 close SNP; | 208 close SNP; |
205 } | 209 } |
206 | 210 |
207 ## now process the bam file. | 211 ## now process the bam file. |
208 mkdir "$wd/WIGS/"; | 212 mkdir "$wd/WIGS/"; |
209 my $bam :shared; | 213 my $bam :shared; |
210 $bam = $opts{'b'}; | 214 $bam = $opts{'b'}; |
211 my $bai = $bam.".bai"; | 215 # igvtools cannot handle the .dat extension, so make symlink |
212 if (!-e $bai) { | 216 system("ln -s '$bam' '$wd/input.bam'"); |
213 print "BAI ($bai) missing for $bam : creating\n"; | 217 system("cd $wd && samtools index input.bam"); |
214 system("samtools index $bam"); | |
215 } | |
216 | 218 |
217 for (my $i = 1; $i <= $opts{'p'}; $i++) { | 219 for (my $i = 1; $i <= $opts{'p'}; $i++) { |
218 ${'thr'.$i} = threads->create('CountAlleles'); | 220 ${'thr'.$i} = threads->create('CountAlleles'); |
219 } | 221 } |
220 ## end the threads. | 222 ## end the threads. |
254 | 256 |
255 ## group distributions into one file | 257 ## group distributions into one file |
256 #################################### | 258 #################################### |
257 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); | 259 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); |
258 open OUT, ">".$opts{'a'}; | 260 open OUT, ">".$opts{'a'}; |
259 print OUT "allele\tavg\tsd\n"; | 261 print OUT "allele\tavg\tsd\tN\n"; |
260 foreach(keys(%map)) { | 262 foreach(keys(%map)) { |
261 my $r = $_; | 263 my $r = $_; |
262 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; | 264 my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; |
263 open IN, "$f"; | 265 open IN, "$f"; |
264 my $a = <IN>; | 266 my $a = <IN>; |
265 chomp($a); | 267 chomp($a); |
266 #$dists{$r}{'avg'} = $a; | |
267 my $s = <IN>; | 268 my $s = <IN>; |
268 chomp($s); | 269 chomp($s); |
269 #$dists{$r}{'sd'} = $s; | 270 my $n = <IN>; |
271 chomp($n); | |
270 close IN; | 272 close IN; |
271 print OUT "$r\t$a\t$s\n"; | 273 print OUT "$r\t$a\t$s\t$n\n"; |
272 foreach(keys(%map)) { | 274 foreach(keys(%map)) { |
273 if ($_ eq $r) { | 275 if ($_ eq $r) { |
274 next; | 276 next; |
275 } | 277 } |
276 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; | 278 my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; |
277 open IN, "$f"; | 279 open IN, "$f"; |
278 my $a = <IN>; | 280 my $a = <IN>; |
279 chomp($a); | 281 chomp($a); |
280 my $s = <IN>; | 282 my $s = <IN>; |
281 chomp($s); | 283 chomp($s); |
284 my $n = <IN>; | |
285 chomp($n); | |
282 close IN; | 286 close IN; |
283 print OUT "$r-$_\t$a\t$s\n"; | 287 print OUT "$r-$_\t$a\t$s\t$n\n"; |
284 } | 288 } |
285 } | 289 } |
286 close OUT; | 290 close OUT; |
291 | |
292 ## make pdf with distribution plots | |
293 ################################### | |
294 open OUT, ">$wd/MakePlots.R"; | |
295 print OUT "\n"; | |
296 print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n"; | |
297 print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n"; | |
298 print OUT "par(mfrow=c(2, 2))\n"; | |
299 print OUT "for (i in 1:nrow(dists)) {\n"; | |
300 print OUT " if (dists[i,'avg'] > 0.5) {\n"; | |
301 print OUT " x <- seq(0.85,1,length=1000))\n"; | |
302 print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd']))\n"; | |
303 print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n"; | |
304 print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red'))\n"; | |
305 print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n"; | |
306 print OUT " } else {)\n"; | |
307 print OUT " x <- seq(0,0.15,length=1000))\n"; | |
308 print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd']))\n"; | |
309 print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n"; | |
310 print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red'))\n"; | |
311 print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n"; | |
312 print OUT " })\n"; | |
313 print OUT "})\n"; | |
314 print OUT "dev.off())\n"; | |
315 close OUT; | |
316 system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1"); | |
287 | 317 |
288 ## CALL SNPs | 318 ## CALL SNPs |
289 ############ | 319 ############ |
290 # create the R script. | 320 # create the R script. |
291 open R, ">$wd/CallSNPs.R"; | 321 open R, ">$wd/CallSNPs.R"; |
506 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; | 536 #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; |
507 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; | 537 print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; |
508 print OUT "nr <- length(data)\n"; | 538 print OUT "nr <- length(data)\n"; |
509 print OUT "avg <- mean(data)\n"; | 539 print OUT "avg <- mean(data)\n"; |
510 print OUT "sdd <- sd(data)\n"; | 540 print OUT "sdd <- sd(data)\n"; |
511 #print OUT "if (avg > 0.5) {\n"; | 541 print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; |
512 #print OUT " x <- seq(0.8,1,length=1000)\n"; | |
513 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; | |
514 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; | |
515 #print OUT " abline(v=(avg-3*sdd),col='red')\n"; | |
516 #print OUT " text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; | |
517 #print OUT "} else {\n"; | |
518 #print OUT " x <- seq(0,0.3,length=1000)\n"; | |
519 #print OUT " y <- dnorm(x,mean=avg,sd=sdd)\n"; | |
520 #print OUT " plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n"; | |
521 #print OUT " abline(v=(avg+3*sdd),col='red')\n"; | |
522 #print OUT " text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n"; | |
523 #print OUT "}\n"; | |
524 #print OUT "dev.off()\n"; | |
525 print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; | |
526 close OUT; | 542 close OUT; |
527 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); | 543 system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); |
528 } | 544 } |
529 } | 545 } |
530 | 546 |