diff DC_Genotyper.pl @ 17:3ba482a2dd0e draft

Uploaded
author geert-vandeweyer
date Mon, 29 Sep 2014 04:00:10 -0400
parents 7e4a9ee69f0b
children 8938f339ed37
line wrap: on
line diff
--- a/DC_Genotyper.pl	Mon Sep 29 02:03:50 2014 -0400
+++ b/DC_Genotyper.pl	Mon Sep 29 04:00:10 2014 -0400
@@ -32,7 +32,8 @@
 # P: ploidy
 # a: outfile for allele distributions
 # v: vcf file output.
-getopts('t:b:R:p:s:m:P:v:a:', \%opts) ;
+# d: distribution plots pdf file
+getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ;
 
 ## variables
 my $twobit :shared;
@@ -170,6 +171,8 @@
 ##########################################################
 my %dbsnp :shared;
 if ($snpfile ne '') {
+	## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package. 
+	# as a work-around, use tabix, but this is slower.
 	#my $bcf = `which bcftools`;
 	#chomp($bcf);
 	#if ($bcf ne '') {
@@ -196,10 +199,11 @@
 		## dummy line on chr zero
 		print SNP "chr0\t1\t.\tA\tT\n";
 		close SNP;
+	}
 }
 else {
 	open SNP, ">$wd/dbsnp.txt";
-	## dummy line on chr zero
+	## dummy line on chr zero to prevent R issues on empty file.
 	print SNP "chr0\t1\t.\tA\tT\n";
 	close SNP;
 }
@@ -208,11 +212,9 @@
 mkdir "$wd/WIGS/";
 my $bam :shared;
 $bam = $opts{'b'};
-my $bai = $bam.".bai";
-if (!-e $bai) {
-	print "BAI ($bai) missing for $bam : creating\n";
-	system("samtools index $bam"); 
-} 
+# igvtools cannot handle the .dat extension, so make symlink
+system("ln -s '$bam' '$wd/input.bam'");
+system("cd $wd && samtools index input.bam");
 
 for (my $i = 1; $i <= $opts{'p'}; $i++) {
 	${'thr'.$i} = threads->create('CountAlleles');
@@ -256,19 +258,19 @@
 ####################################
 my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5);
 open OUT, ">".$opts{'a'};
-print OUT "allele\tavg\tsd\n";
+print OUT "allele\tavg\tsd\tN\n";
 foreach(keys(%map)) {
 	my $r = $_;
 	my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt";
 	open IN, "$f";
 	my $a = <IN>;
 	chomp($a);
-	#$dists{$r}{'avg'} = $a;
 	my $s = <IN>;
 	chomp($s);
-	#$dists{$r}{'sd'} = $s;
+	my $n = <IN>;
+	chomp($n);
 	close IN;
-	print OUT "$r\t$a\t$s\n";
+	print OUT "$r\t$a\t$s\t$n\n";
 	foreach(keys(%map)) {
 		if ($_ eq $r) {
 			next;
@@ -279,12 +281,40 @@
 		chomp($a);
 		my $s = <IN>;
 		chomp($s);
+		my $n = <IN>;
+		chomp($n);
 		close IN;
-		print OUT "$r-$_\t$a\t$s\n";
+		print OUT "$r-$_\t$a\t$s\t$n\n";
 	}
 }
 close OUT;
 
+## make pdf with distribution plots
+###################################
+open OUT, ">$wd/MakePlots.R";
+print OUT "\n";
+print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n";
+print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n";
+print OUT "par(mfrow=c(2, 2))\n";
+print OUT "for (i in 1:nrow(dists)) {\n";
+print OUT "   if (dists[i,'avg'] > 0.5) {\n";
+print OUT "     x <- seq(0.85,1,length=1000))\n";
+print OUT "     y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd']))\n";
+print OUT "     plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
+print OUT "     abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red'))\n";
+print OUT "     text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
+print OUT "  } else {)\n";
+print OUT "   x <- seq(0,0.15,length=1000))\n";
+print OUT "   y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd']))\n";
+print OUT "   plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1))\n";
+print OUT "   abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red'))\n";
+print OUT "   text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1)))\n";
+print OUT "  })\n";
+print OUT "})\n";
+print OUT "dev.off())\n";
+close OUT;
+system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1");
+
 ## CALL SNPs
 ############
 # create the R script.
@@ -508,21 +538,7 @@
 		print OUT "nr <- length(data)\n";
 		print OUT "avg <- mean(data)\n";
 		print OUT "sdd <- sd(data)\n";
-		#print OUT "if (avg > 0.5) {\n";
-		#print OUT "  x <- seq(0.8,1,length=1000)\n";
-		#print OUT "  y <- dnorm(x,mean=avg,sd=sdd)\n";
-		#print OUT "  plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
-		#print OUT "  abline(v=(avg-3*sdd),col='red')\n";
-		#print OUT "  text(0.81,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
-		#print OUT "} else {\n";
-		#print OUT "  x <- seq(0,0.3,length=1000)\n";
-		#print OUT "  y <- dnorm(x,mean=avg,sd=sdd)\n";
-		#print OUT "  plot(x,y,main='Distribution in sample $bam for nt $allele',xlab='Allelic Ratio',type='l',lwd=1)\n";
-		#print OUT "  abline(v=(avg+3*sdd),col='red')\n";
-		#print OUT "  text(0.2,max(y-0.5),paste(c('avg: ',avg,'\\nsd: ',sdd,'\\nnrDataPoints:', nr,'\\n$hassnp\\nMin.Cov:',$mincov),sep=' ',collapse=''),adj=c(0,1))\n";
-		#print OUT "}\n";
-		#print OUT "dev.off()\n";
-		print OUT "write(c(avg,sdd),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
+		print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n";
 		close OUT; 
 		system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1");
 	}