Mercurial > repos > geert-vandeweyer > dc_genotyper
view DC_Genotyper.pl @ 21:36e1ba3b4d1b draft
updated perl-threading modules, due to cpan update
author | geert-vandeweyer |
---|---|
date | Thu, 07 Jan 2016 05:08:45 -0500 |
parents | 8262299f8f3c |
children |
line wrap: on
line source
#!/usr/bin/perl ############################# ## DEEP COVERAGE GENOTYPER ## ############################# # version : 0.0.1 # Principle: # 1. get allele counts on all positions in specified targets (bed) using igvtools. Only SNPs !! # 2. remove known dbsnp positions (bcf file) # 3. Get distribution of background noise (pcr/sequencing errors), by modelling allele fractions as normal distributions. # 4. Based on these distributions, check each position for significant change from the reference allele (based on allele fraction) # 5. For abberant positions, check each alternate allele to see if it passes the background signal. # 6. Generate VCF file. ################## ## LOAD MODULES ## ################## use threads; use threads::shared; use Thread::Queue; use Getopt::Std; #################### ## get paramaters ## #################### # t: target file # b: bam file # R: reference genome files for twobit and IGV. # p: number of threads. # s: dbsnp file # m: minimal coverage (defaults 400x) # P: ploidy # a: outfile for allele distributions # v: vcf file output. # d: distribution plots pdf file getopts('t:b:R:p:s:m:P:v:a:d:', \%opts) ; ## variables my $twobit :shared; my $igvgenome :shared; if (!defined($opts{'R'})) { die("Reference Genomes not specified\n"); } my @refgenomes = split(",",$opts{'R'}); if (!-e $refgenomes[0]) { die("'$refgenomes[0]' is not a valid file path."); } else { $twobit = $refgenomes[0]; } if (!-e $refgenomes[1]) { die("'$refgenomes[1]' is not a valid file path."); } else { $igvgenome = $refgenomes[1]; } my $mincov :shared; $mincov = 320; if (defined($opts{'m'})) { $mincov = $opts{'m'}; } my $ploidy :shared; if (defined($opts{'P'}) && $opts{'P'} =~ m/^\d+$/) { $ploidy = $opts{'P'}; } else { die("Ploidy (-P) was not specified or not an integer\n"); } if (defined($opts{'v'})) { $outfile = $opts{'v'}; } else { die("No output vcf-file specified.\n"); } if (!defined($opts{'a'})) { die("No output file specified for distribution details\n"); } ## create working dir. my $rand = int(rand(10000)); while (-d "/tmp/DC_Genotyper_$rand") { $rand = int(rand(10000)); } my $wd :shared; $wd = "/tmp/DC_Genotyper_$rand"; system("mkdir '$wd'"); my $snpfile :shared; my $hassnp :shared; $hassnp = 'NoDbSNP'; $snpfile = ''; if (defined($opts{'s'})) { $snpfile = $opts{'s'}; if (!-e $snpfile) { die("'$snpfile' is not a valid file path."); } my $mime = `file $snpfile`; if ($mime !~ m/compressed/) { print "$snpfile is not in compressed format. compressing & indexing the file now.\n"; #print "... this takes a while\n"; system("bgzip -c $snpfile > $wd/dbSNP.vcf.bgz"); system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); $snpfile = "$wd/dbSNP.vcf.bgz"; } elsif (!-e "$snpfile.tbi") { print "tabix index file is missing for '$snpfile'. creating now.\n"; ## check if I can write it out for future use $snpfile =~ m/(.*)([^\/]+)$/; my $d = $1; if (-w $d) { open OUT, ">$d/lock"; flock(OUT,2); system("cd $d && tabix -p vcf $snpfile"); close OUT; system("rm $d/lock"); } else { system("cp $snpfile /$wd/dbSNP.vcf.bgz"); system("cd $wd/ && tabix -p vcf dbSNP.vcf.bgz"); $snpfile = "$wd/dbSNP.vcf.bgz"; } } $hassnp = 'WithDbSNP'; } ## 1. Get FASTA and prepare output hashes: my $targets_one = Thread::Queue->new(); my $targets_two = Thread::Queue->new(); my $targets_three = Thread::Queue->new(); open IN, $opts{'t'} or die("Could not open $opts{'t'} file for reading"); if (-d "$wd/Fasta/") { system("rm $wd/Fasta/*"); } else { system("mkdir $wd/Fasta"); } ## create the threads. for (my $i = 1; $i<= $opts{'p'}; $i++) { ${'thr'.$i} = threads->create('FetchFasta'); } ## enqueue the targets. my %thash; while (<IN>) { chomp; my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$_); $targets_one->enqueue($_); $targets_two->enqueue($_); $targets_three->enqueue($_); $thash{$chr}{$start} = $stop; } close IN; ## end the threads. for (my $i = 1; $i <= $opts{'p'}; $i++) { $targets_one->enqueue(undef); } for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i}->join(); } ## load dbSNP inside target regions into shared structure. ########################################################## my %dbsnp :shared; if ($snpfile ne '') { ## BCFTOOLS query is very very fast, but not available so far in the default bcftools version included in samtools package. # as a work-around, use tabix, but this is slower. #my $bcf = `which bcftools`; #chomp($bcf); #if ($bcf ne '') { # my $command = "bcftools query -f '\%CHROM\\t\%POS\\t\%REF\\t\%ALT\\t\%ID\\n' -R '".$opts{'t'}."' '$snpfile' > $wd/dbsnp.txt"; # system("$command"); # open IN, "$wd/dbsnp.txt"; # while (<IN>) { # chomp; # my @p = split(/\t/,$_); # $dbsnp{$p[0].'-'.$p[1]} = $p[2].'-'.$p[3].'-'.$p[4]; # } # close IN; #} #else { # print "WARNING: BCFtools is not in the path. Skipping snp handling.\n"; # $snpfile = ''; # system("touch $wd/dbsnp.txt"); #} system("tabix $snpfile -B $opts{'t'} | cut -f 1-5 > $wd/dbsnp.txt"); my $lc = `cat $wd/dbsnp.txt | wc -l`; chomp($lc); if ($lc eq '0') { open SNP, ">$wd/dbsnp.txt"; ## dummy line on chr zero print SNP "chr0\t1\t.\tA\tT\n"; close SNP; } } else { open SNP, ">$wd/dbsnp.txt"; ## dummy line on chr zero to prevent R issues on empty file. print SNP "chr0\t1\t.\tA\tT\n"; close SNP; } open IN, "$wd/dbsnp.txt"; while (<IN>) { chomp; my @p = split(/\t/,$_); $dbsnp{$p[0].'-'.$p[1]} = $p[3].'-'.$p[4].'-'.$p[2]; } close IN; ## now process the bam file. mkdir "$wd/WIGS/"; my $bam :shared; $bam = $opts{'b'}; # igvtools cannot handle the .dat extension, so make symlink system("ln -s '$bam' '$wd/input.bam'"); system("cd $wd && samtools index input.bam"); for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i} = threads->create('CountAlleles'); } ## end the threads. for (my $i = 1; $i <= $opts{'p'}; $i++) { $targets_two->enqueue(undef); } for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i}->join(); } ## generate the distributions. ############################## my $alleles = Thread::Queue->new(); my %all = ('A' => 1,'C' => 2,'G' => 3, 'T' => 4); foreach(keys(%all)) { $alleles->enqueue($_); my $a = $_; foreach(keys(%all)) { if ($_ eq $a) { next; } $alleles->enqueue($a.'-'.$_); } } for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i} = threads->create('GetDistribution'); } ## end the threads. for (my $i = 1; $i <= $opts{'p'}; $i++) { $alleles->enqueue(undef); } for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i}->join(); } ## group distributions into one file #################################### my %map =('A' => 2,'C' => 3,'G' => 4, 'T' => 5); open OUT, ">".$opts{'a'}; print OUT "allele\tavg\tsd\tN\n"; foreach(keys(%map)) { my $r = $_; my $f = "$wd/model.$r.$mincov"."x.$hassnp.txt"; open IN, "$f"; my $a = <IN>; chomp($a); my $s = <IN>; chomp($s); my $n = <IN>; chomp($n); close IN; print OUT "$r\t$a\t$s\t$n\n"; foreach(keys(%map)) { if ($_ eq $r) { next; } my $f = "$wd/model.$r-$_.$mincov"."x.$hassnp.txt"; open IN, "$f"; my $a = <IN>; chomp($a); my $s = <IN>; chomp($s); my $n = <IN>; chomp($n); close IN; print OUT "$r-$_\t$a\t$s\t$n\n"; } } close OUT; ## make pdf with distribution plots ################################### open OUT, ">$wd/MakePlots.R"; print OUT "\n"; print OUT "dists <- read.table(file='".$opts{'a'}."', header=TRUE, as.is=TRUE)\n"; print OUT "pdf(file='".$opts{'d'}."',paper='a4',onefile=TRUE)\n"; print OUT "par(mfrow=c(2, 2))\n"; print OUT "for (i in 1:nrow(dists)) {\n"; print OUT " if (dists[i,'avg'] > 0.5) {\n"; print OUT " x <- seq(0.85,1,length=1000)\n"; print OUT " y <- dnorm(x,mean=dists[i,'avg'],sd=dists[i,'sd'])\n"; print OUT " plot(x,y,main=paste('Distribution for allele \"',dists[i,'allele'],'\"',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n"; print OUT " abline(v=(dists[i,'avg']-3*dists[i,'sd']),col='red')\n"; print OUT " text(0.855,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n"; print OUT " } else {\n"; print OUT " x <- seq(0,0.15,length=1000)\n"; print OUT " y <- dnorm(x,dists[i,'avg'],sd=dists[i,'sd'])\n"; print OUT " plot(x,y,main=paste('Distribution for \"',dists[i,'allele'],'\" variation',sep=''),xlab='Allelic Ratio',type='l',lwd=1)\n"; print OUT " abline(v=(dists[i,'avg']+3*dists[i,'sd']),col='red')\n"; print OUT " text(0.1,max(y-0.5),paste(c('avg: ',round(dists[i,'avg'],digits=5),'\\nsd: ',round(dists[i,'sd'],digits=5),'\\nN: ',dists[i,'N']),sep=' ',collapse=''),adj=c(0,1))\n"; print OUT " }\n"; print OUT "}\n"; print OUT "dev.off()\n"; close OUT; system("cd $wd && Rscript MakePlots.R > /dev/null 2>&1"); ## CALL SNPs ############ # create the R script. open R, ">$wd/CallSNPs.R"; print R "\n"; print R "args <- commandArgs(trailingOnly = TRUE)\n"; print R "counts <- read.table(file = args[1],header = FALSE, as.is = TRUE)\n"; print R "ploidy <- as.integer(args[3])\n"; print R "chr <- args[2]\n"; print R "snps <- read.table(file=args[5],header=FALSE,as.is=TRUE)\n"; print R "colnames(snps) <- c('chr','pos','id','ref','alt')\n"; print R "colnames(counts) <- c('pos','ref','A','C','G','T','TotalDepth')\n"; print R "dists <- read.table(file='".$opts{'a'}."',header=TRUE,as.is=TRUE)\n"; print R 'rownames(dists) = dists$allele'."\n"; print R 'dists <- dists[,-1]'."\n"; print R "vcf <- c()\n"; print R "lower <- c()\n"; print R "higher <- c()\n"; print R "for (i in 1:(ploidy)) {\n"; print R " lower[length(lower)+1] <- (2*i-1)/(2*ploidy)\n"; print R " higher[length(higher)+1] <- (2*i+1)/(2*ploidy)\n"; print R "}\n"; print R "for (i in 1:nrow(counts)) {\n"; print R " if (counts[i,'TotalDepth'] == 0) next\n"; print R " # significantly different from reference?\n"; print R " z <- ((counts[i,counts[i,'ref']]/counts[i,'TotalDepth']) - dists[counts[i,'ref'],'avg']) / dists[counts[i,'ref'],'sd']\n"; print R " if (abs(z) > 3) {\n"; print R " # test all alterate alleles to see which one is significant.\n"; print R " for (j in c('A','C','G','T')) {\n"; print R " if (j == counts[i,'ref']) next\n"; print R " z <- ((counts[i,j]/counts[i,'TotalDepth']) - dists[paste(counts[i,'ref'],'-',j,sep=''),'avg']) / dists[paste(counts[i,'ref'],'-',j,sep=''),'sd']\n"; print R " if (abs(z) > 3){\n"; print R " filter <- 'PASS'\n"; print R " phred <- round(-10*log(pnorm(-abs(z))),digits=0)\n"; print R " if (phred > 9999) phred <- 9999\n"; print R " frac <- counts[i,j]/counts[i,'TotalDepth']\n"; print R " for (k in 1:ploidy) {\n"; print R " if (frac >= lower[k] && frac < higher[k]) {\n"; print R " sample <- paste(paste(paste(rep(0,(ploidy-k)),sep='',collapse='/'),paste(rep(1,k),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; print R " af <- k/ploidy\n"; print R " break\n"; print R " }\n"; print R " }\n"; print R " if (frac < lower[1]) {\n"; print R " sample <- paste(paste(paste(rep(0,(ploidy-1)),sep='',collapse='/'),paste(rep(1,1),sep='',collapse='/'),sep='/',collapse=''),':',counts[i,counts[i,'ref']],',',counts[i,j],sep='',collapse='')\n"; print R " af <- 1/ploidy\n"; print R " filter <- 'LowFraction'\n"; print R " }\n"; print R " if (counts[i,'TotalDepth'] < $mincov) {\n"; print R " filter <- 'LowCoverage'\n"; print R " }\n"; print R " info <- paste('DP=',counts[i,'TotalDepth'],';AF=',round(af,digits=5),';AR=',round(frac,digits=5),sep='')\n"; print R " snpids <- which(snps\$chr == chr & snps\$pos == counts[i,'pos'])\n"; print R " id <- '.'\n"; print R " if (length(snpids) > 0) id <- snps[snpids[1],'id']\n"; print R " vcf[length(vcf)+1] <- paste(chr,counts[i,'pos'],id,counts[i,'ref'],j,phred,filter,info,'GT:AD',sample,sep='\\t',collapse='')\n"; print R " }\n"; print R " }\n"; print R " }\n"; print R "}\n"; print R "if (length(vcf) > 0) {\n"; print R " write(file=args[4],paste(vcf,sep='\\n'))\n"; print R "}\n"; close R; system("mkdir $wd/VCF/"); for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i} = threads->create('CallSNPs'); } ## end the threads. for (my $i = 1; $i <= $opts{'p'}; $i++) { $targets_three->enqueue(undef); } for (my $i = 1; $i <= $opts{'p'}; $i++) { ${'thr'.$i}->join(); } ## BUILD FINAL VCF open OUT, ">$outfile"; print OUT "##fileformat=VCFv4.1\n"; print OUT "##source=High_Ploidy_Genotyper_v.0.1\n"; print OUT "##genome_reference=$twobit\n"; if ($snpfile ne '') { print OUT "##SNP_file=$snpfile\n"; } foreach(keys(%thash)) { print OUT "##contig=<ID=$_,assembly=hg19,species=\"Homo Sapiens\">\n"; } print OUT "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Total Depth\">\n"; print OUT "##INFO=<ID=AF,Number=1,Type=Float,Description=\"Allele Frequency\">\n"; print OUT "##INFO=<ID=AR,Number=1,Type=Float,Description=\"Allelic Ratio\">\n"; print OUT "##FILTER=<ID=LowFraction,Description=\"Allelic Fraction under 1/2*$ploidy\">\n"; print OUT "##FILTER=<ID=LowCoverage,Description=\"Total Depth is lower than threshold of $mincov\">\n"; print OUT "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"; print OUT "##FORMAT=<ID=AD,Number=2,type=Integer,Description,\"Allelic Depth\">\n"; print OUT "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE\n"; close OUT; @i = ( 1 .. 22,'X','Y','M' ); foreach(@i) { my $chr = "chr$_"; foreach(sort {$a <=> $b} keys(%{$thash{$chr}})) { my $v = "$wd/VCF/$chr.$_-".$thash{$chr}{$_}.".vcf"; if (-e $v) { system("cat '$v' >> '$outfile'"); } } } ## clean up system("rm -Rf '$wd'"); sub FetchFasta { while(defined(my $line = $targets_one->dequeue())) { my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); # 2bit is zero based, non-including => decrease start by one $startposition = $start - 1; my $command = "twoBitToFa -seq=$chr -start=$startposition -end=$stop -noMask $twobit $wd/Fasta/$chr-$start-$stop.fasta"; system($command); } } sub CountAlleles { # local version of hashes my $snp = \%dbsnp; my %counts; $counts{'A'} = ''; $counts{'C'} = ''; $counts{'G'} = ''; $counts{'T'} = ''; my %map =('A' => 1,'C' => 2,'G' => 3, 'T' => 4); my %options; foreach(keys(%map)) { my $r = $_; foreach(keys(%map)) { if ($_ eq $r) { next; } $options{$r.'-'.$_} = ''; } } while (defined(my $line = $targets_two->dequeue())) { $out = ''; my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); ## get reference alleles my %ref_alleles; open FASTA, "$wd/Fasta/$chr-$start-$stop.fasta"; my $head = <FASTA>; my $seq = ''; while (<FASTA>) { chomp; $seq .= $_; } close FASTA; # this generates a hash of the reference alleles once, instead of substr-calls in every bam, on every iteration. for (my $pos = 0; $pos < length($seq); $pos++) { $ref_alleles{($pos+$start)} = substr($seq,$pos,1); } ## get counts. my $target = "$chr:$start-$stop"; my $command = "igvtools count -w 1 --bases --query '$target' '$wd/input.bam' '$wd/WIGS/$chr-$start-$stop.wig' '$igvgenome' > /dev/null 2>&1"; system($command); open WIG, "$wd/WIGS/$chr-$start-$stop.wig"; my $h = <WIG>; $h = <WIG>; $h = <WIG>; my $target_counts = ''; while (<WIG>) { chomp; #my ($pos, $a, $c, $g, $t , $n) = split(/\t/,$_); my @p = split(/\t/,$_); my $s = $p[1] + $p[2] + $p[3] + $p[4]; $target_counts .= "$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\t$s\n"; ## skip positions with coverage < minimal coverage, and positions in dbsnp if specified (if not specified, snp hash is empty). if ($s > $mincov && !defined($snp->{$chr.'-'.$p[0]})) { ## for model of 'non-reference' my $frac = $p[$map{$ref_alleles{$p[0]}}] / $s; $counts{$ref_alleles{$p[0]}} .= $frac.','; $out .= "$target\t$p[0]\t$ref_alleles{$p[0]}\t$p[1]\t$p[2]\t$p[3]\t$p[4]\n"; ## for each of the options background models foreach(keys(%map)) { if ($_ eq $ref_alleles{$p[0]}) { next; } $options{$ref_alleles{$p[0]}.'-'.$_} .= ($p[$map{$_}] / $s) .','; } } } close WIG; open OUT, ">>$wd/allcounts.$mincov"."x.$hassnp.txt"; flock(OUT, 2); print OUT $out; close OUT; open OUT, ">$wd/WIGS/$chr.$start-$stop.txt"; print OUT $target_counts; close OUT; } foreach(keys(%counts)) { open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; flock(OUT,2); print OUT $counts{$_}; close OUT; } foreach(keys(%options)) { open OUT, ">>$wd/counts_$_.$mincov"."x.$hassnp.txt"; flock(OUT,2); print OUT $options{$_}; close OUT; } } sub GetDistribution { while (defined(my $allele = $alleles->dequeue())) { system("sed -i 's/.\$//' '$wd/counts_$allele.$mincov"."x.$hassnp.txt'"); open OUT, ">$wd/GetDistribution.$allele.R"; print OUT "\n"; print OUT "nt <- '$allele'\n"; #print OUT "pdf(file='$wd/Distribution.$allele.$mincov"."x.$hassnp.pdf',paper='a4')\n"; print OUT "data <- scan(file='$wd/counts_$allele.$mincov"."x.$hassnp.txt',sep=',')\n"; print OUT "nr <- length(data)\n"; print OUT "avg <- mean(data)\n"; print OUT "sdd <- sd(data)\n"; print OUT "write(c(avg,sdd,nr),file='$wd/model.$allele.$mincov"."x.$hassnp.txt',ncolumns=1)\n"; close OUT; system("cd $wd && Rscript GetDistribution.$allele.R >/dev/null 2>&1"); } } sub CallSNPs { while (defined(my $line = $targets_three->dequeue())) { # split. my ($chr,$start,$stop,$name,$score,$strand) = split(/\t/,$line); my $file = "$wd/WIGS/$chr.$start-$stop.txt"; my $ofile = "$wd/VCF/$chr.$start-$stop.vcf"; system("cd $wd && Rscript CallSNPs.R '$file' '$chr' '$ploidy' '$ofile' '$wd/dbsnp.txt'"); } }