Mercurial > repos > yusuf > miseq_bam_variants
view call_miseq_variants @ 0:1a23ea467feb default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Thu, 26 Mar 2015 09:36:17 -0600 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/env perl # Locked down procedure for variant calling use strict; use warnings; use File::Basename; #@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n"; # parse configuration file my $dirname= dirname(__FILE__); my $tool_data = shift @ARGV; my %config; if( not -e "$tool_data/miseq_bam_variants.loc" ){ system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc"); } open CONFIG, '<',"$tool_data/miseq_bam_variants.loc"; while(<CONFIG>){ (my $key, my $value) = split(/\s+/, $_ ); $config{$key} = $value; } my $dbs_dir = $config{"reference_dbs"}; my $name_dir = $config{"named_regions_dir"}; my $capt_dir = $config{"capture_kits_dir"}; close CONFIG; my $sample_name = shift @ARGV; my $outdir = shift @ARGV; my $bam = shift @ARGV; my $bed = $capt_dir . (shift @ARGV); my $gene_regions = $name_dir . (shift @ARGV); my $fasta = $dbs_dir . (shift @ARGV); my $max_processes = shift @ARGV; my $log = "$outdir/$sample_name.call_variants.log.txt"; my $gatk = "$outdir/$sample_name.gatk_haplotypecaller"; my $freeBayes = "$outdir/$sample_name.freeBayes"; my $atlas2_indel = "$outdir/$sample_name.atlas2_indel"; my $phase = "$outdir/$sample_name.phase"; my $cover = "$outdir/$sample_name.read_coverage"; my $flagstat = "$outdir/$sample_name.flagstat"; my @cmds; @cmds = ( "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log", "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log", "samtools phase -F $bam > $phase.txt 2>> $log"); &run_cmds($max_processes, @cmds); @cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt", "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt"); &run_cmds($max_processes, @cmds); my $combined = "$outdir/$sample_name.combined"; system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log"; system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log"; system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log"; # Check the logs for anything untowards # e.g. FreeBayes: jumping is disabled # or any line saying "error" my @errors; open(LOG, $log) or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n"; while(<LOG>){ if(/error/i or /jumping is disabled/){ push @errors, $_; } } close(LOG); if(@errors){ print "Errors were detected in the genotyping log:\n", @errors; exit 1; # failure } exit 0; #success sub run_cmds{ my ($max_cmds, @cmd) = @_; my ($num_children, $pid); for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ # initialize the number of child processes at 1, and increment it by one #while it is less than $max_cmds my $cmd = shift (@cmds); if($pid = fork) { # do nothing if parent } elsif(defined $pid) { # $pid is zero here if defined #print STDERR $cmd, "\n"; system $cmd; exit; } else { #weird fork error die "Can't fork: $!\n"; } } while(@cmds) { undef $pid; FORK: { my $cmd = shift (@cmds); if($pid = fork) { # parent here $num_children++; wait; $num_children--; next; } elsif(defined $pid) { # $pid is zero here if defined #print STDERR $cmd, "\n"; system $cmd; exit; } else { #weird fork error die "Can't fork: $!\n"; } } } wait while $num_children--; }