Mercurial > repos > yusuf > miseq_bam_variants
diff call_miseq_variants @ 0:1a23ea467feb default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Thu, 26 Mar 2015 09:36:17 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/call_miseq_variants Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,122 @@ +#!/usr/bin/env perl + +# Locked down procedure for variant calling +use strict; +use warnings; +use File::Basename; +#@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n"; + +# parse configuration file +my $dirname= dirname(__FILE__); +my $tool_data = shift @ARGV; +my %config; +if( not -e "$tool_data/miseq_bam_variants.loc" ){ + system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc"); +} +open CONFIG, '<',"$tool_data/miseq_bam_variants.loc"; +while(<CONFIG>){ + (my $key, my $value) = split(/\s+/, $_ ); + $config{$key} = $value; +} +my $dbs_dir = $config{"reference_dbs"}; +my $name_dir = $config{"named_regions_dir"}; +my $capt_dir = $config{"capture_kits_dir"}; +close CONFIG; + +my $sample_name = shift @ARGV; +my $outdir = shift @ARGV; +my $bam = shift @ARGV; +my $bed = $capt_dir . (shift @ARGV); +my $gene_regions = $name_dir . (shift @ARGV); +my $fasta = $dbs_dir . (shift @ARGV); +my $max_processes = shift @ARGV; + +my $log = "$outdir/$sample_name.call_variants.log.txt"; +my $gatk = "$outdir/$sample_name.gatk_haplotypecaller"; +my $freeBayes = "$outdir/$sample_name.freeBayes"; +my $atlas2_indel = "$outdir/$sample_name.atlas2_indel"; +my $phase = "$outdir/$sample_name.phase"; +my $cover = "$outdir/$sample_name.read_coverage"; +my $flagstat = "$outdir/$sample_name.flagstat"; + +my @cmds; +@cmds = ( "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log", + "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log", + "samtools phase -F $bam > $phase.txt 2>> $log"); +&run_cmds($max_processes, @cmds); + +@cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt", + "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt"); +&run_cmds($max_processes, @cmds); + +my $combined = "$outdir/$sample_name.combined"; +system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log"; +system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log"; +system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log"; + +# Check the logs for anything untowards +# e.g. FreeBayes: jumping is disabled +# or any line saying "error" +my @errors; +open(LOG, $log) + or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n"; +while(<LOG>){ + if(/error/i or /jumping is disabled/){ + push @errors, $_; + } +} +close(LOG); + +if(@errors){ + print "Errors were detected in the genotyping log:\n", @errors; + exit 1; # failure +} +exit 0; #success + +sub run_cmds{ + + my ($max_cmds, @cmd) = @_; + + my ($num_children, $pid); + + for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ + # initialize the number of child processes at 1, and increment it by one + #while it is less than $max_cmds + + my $cmd = shift (@cmds); + if($pid = fork) { + # do nothing if parent + } elsif(defined $pid) { # $pid is zero here if defined + #print STDERR $cmd, "\n"; + system $cmd; + exit; + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + + while(@cmds) { + undef $pid; + FORK: { + my $cmd = shift (@cmds); + if($pid = fork) { + # parent here + $num_children++; + wait; + $num_children--; + next; + + } elsif(defined $pid) { # $pid is zero here if defined + #print STDERR $cmd, "\n"; + system $cmd; + exit; + + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + } + wait while $num_children--; +}