view call_miseq_variants @ 0:1a23ea467feb default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

# Locked down procedure for variant calling
use strict;
use warnings;
use File::Basename;
#@ARGV == 7 or die "Usage: $0 <sample name> <output dir> <deduped_realigned_input.bam> <target_regions.bed> <gene_regions.bed> <reference.fasta> <max num processes>\n";

# parse configuration file
my $dirname= dirname(__FILE__);
my $tool_data = shift @ARGV;
my %config;
if( not -e "$tool_data/miseq_bam_variants.loc" ){
  system("cp $dirname/tool-data/miseq_bam_variants.loc $tool_data/miseq_bam_variants.loc");
}
open CONFIG, '<',"$tool_data/miseq_bam_variants.loc";
while(<CONFIG>){
  (my $key, my $value) = split(/\s+/, $_ );
  $config{$key} = $value;
}
my $dbs_dir = $config{"reference_dbs"};
my $name_dir = $config{"named_regions_dir"};
my $capt_dir = $config{"capture_kits_dir"};
close CONFIG;

my $sample_name = shift @ARGV;
my $outdir = shift @ARGV;
my $bam = shift @ARGV;
my $bed = $capt_dir . (shift @ARGV);
my $gene_regions = $name_dir . (shift @ARGV);
my $fasta = $dbs_dir . (shift @ARGV);
my $max_processes = shift @ARGV;

my $log = "$outdir/$sample_name.call_variants.log.txt";
my $gatk = "$outdir/$sample_name.gatk_haplotypecaller";
my $freeBayes = "$outdir/$sample_name.freeBayes";
my $atlas2_indel = "$outdir/$sample_name.atlas2_indel";
my $phase = "$outdir/$sample_name.phase";
my $cover = "$outdir/$sample_name.read_coverage";
my $flagstat = "$outdir/$sample_name.flagstat";

my @cmds;
@cmds = (   "$dirname/gatk_haplotypecaller_parallel $bam ".($max_processes > 5 ? int($max_processes*2/3+0.5) : 2)." $gatk.vcf -R $fasta --emitRefConfidence GVCF -variant_index_type LINEAR -variant_index_parameter 128000 &>> $log",
            "$dirname/freebayes_parallel $bam ".($max_processes > 5 ? int(($max_processes-1)/3+0.5) : 2)." $freeBayes.vcf -j -C 2 -i -f $fasta -X &>> $log",
            "samtools phase -F $bam > $phase.txt 2>> $log");
&run_cmds($max_processes, @cmds);

@cmds = ("$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $gatk.vcf -o $gatk.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt",
         "$dirname/vcf2hgvs_table -q -d 2 -p 0.05 -h 0.1 -u $dbs_dir/internal_runs.sorted.vcf.gz -m $dbs_dir/mappability/hg19.crgRefSeqExomeMappability75mer.txt -x $bed -i $freeBayes.vcf -o $freeBayes.hgvs.txt -s $dbs_dir/dbsnp_1000g_esp6500.latest.vcf.gz -r $dbs_dir/hg19.fa -e $dbs_dir/hg19_refGene_gencode_ultrasensitive.gtf -b $gene_regions -z $phase.txt");
&run_cmds($max_processes, @cmds);

my $combined = "$outdir/$sample_name.combined";
system "$dirname/combine_hgvs_tables -q true $combined.hgvs.txt GATKHaplotypeCaller $gatk.hgvs.txt FreeBayes $freeBayes.hgvs.txt >> $log";
system "$dirname/hgvs_collapse_transcripts $combined.hgvs.txt $combined.collapsed.hgvs.txt 1 >> $log";
system "$dirname/split_hgvs_by_confidence $combined.collapsed.hgvs.txt $combined.collapsed.confident.hgvs.txt $combined.collapsed.marginal.hgvs.txt 2 &>> $log";

# Check the logs for anything untowards
# e.g. FreeBayes: jumping is disabled
#  or any line saying "error"
my @errors;
open(LOG, $log)
  or die "Could not check log file for error message, therefore the validity of the genotypes cannot be guaranteed\n";
while(<LOG>){
  if(/error/i or /jumping is disabled/){
    push @errors, $_;
  }
}
close(LOG);

if(@errors){
  print "Errors were detected in the genotyping log:\n", @errors; 
  exit 1; # failure
}
exit 0; #success

sub run_cmds{

  my ($max_cmds, @cmd) = @_;
  
  my ($num_children, $pid); 
  
        for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
        # initialize the number of child processes at 1, and increment it by one
        #while it is less than $max_cmds

                my $cmd = shift (@cmds);
                if($pid = fork) {
                        # do nothing if parent
                } elsif(defined $pid) { # $pid is zero here if defined
                        #print STDERR $cmd, "\n";
                        system $cmd;
                        exit;
                } else {
                        #weird fork error
                        die "Can't fork: $!\n";
                }
        }

        while(@cmds) {
                undef $pid;
                FORK: {
                        my $cmd = shift (@cmds);
                        if($pid = fork) {
                                # parent here
                                $num_children++;
                                wait;
                                $num_children--;
                                next;

                        } elsif(defined $pid) { # $pid is zero here if defined
                                #print STDERR $cmd, "\n";
                                system $cmd;
                                exit;

                        } else {
                                #weird fork error
                                die "Can't fork: $!\n";
                        }
                }
        }
        wait while $num_children--;
}