view freebayes_parallel @ 0:1a23ea467feb default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
line wrap: on
line source

#!/usr/bin/env perl

use strict;
use warnings;
use File::Temp;

@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n"; 

my $in_bam = shift @ARGV;
my $num_procs = shift @ARGV;
my $out_vcf = shift @ARGV;

$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;
# Lines look some thing like "@SQ	SN:chr1	LN:249250621"
open(SAM_HEADERS, "samtools view -H $in_bam |")
  or die "Cannot run samtools on $in_bam: $!\n";
my %seq2length;
my %seqname2orig_order;
while(<SAM_HEADERS>){
  if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){
    $seq2length{$1} = $2;
    $seqname2orig_order{$1} = $.;
  }
}
close(SAM_HEADERS);

# Make sure the index  is available and has the name FreeBayes expects
my $bamfile_stem = $in_bam;
$bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one
if(not -e "$bamfile_stem.bai"){
  if(not -e "$in_bam.bai"){
    system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n"; 
  }
  system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n";
}

my @cmds;
my @tmp_outfiles;
my %tmpfile2orig_order;
# Sort contigs from largest to smallest for scheduling efficiency
for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){
  my ($tmp_fh, $tmp_filename) = tmpnam();
  push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}";
  push @tmp_outfiles, $tmp_filename;
  $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name};
}

open(OUT_VCF, ">$out_vcf")
  or die "Cannot open $out_vcf for writing: $!\n";

run_cmds($num_procs, @cmds);

# Grab output header from first temp output file
open(H, $tmp_outfiles[0])
  or die "Cannot open $tmp_outfiles[0] for reading: $!\n";
while(<H>){
  last if not /^#/; # end of headers
  # mod for self-referencing meta-data
  if(/^##commandline=/){
    print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n";
  }
  else{
    # Otherwise verbatim
    print OUT_VCF $_;
  }

}
close(H);

# Concatenate the temporary results into a final outfile
for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){
  open(TMP_VCF, $tmp_outfile) 
    or die "Cannot open $tmp_outfile for reading: $!\n";
  while(<TMP_VCF>){
    print OUT_VCF unless /^#/;
  }
  close(TMP_VCF);
}
close(OUT_VCF);
&cleanup;

sub cleanup{
  unlink @tmp_outfiles;
}
sub run_cmds{

  my ($max_cmds, @cmd) = @_;

  my ($num_children, $pid);

        for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
        # initialize the number of child processes at 1, and increment it by one
        #while it is less than $max_cmds

                my $cmd = shift (@cmds);
                if($pid = fork) {
                        # do nothing if parent
                } elsif(defined $pid) { # $pid is zero here if defined
                        print STDERR $cmd, "\n";
                        system $cmd;
                        exit;
                } else {
                        #weird fork error
                        die "Can't fork: $!\n";
                }
        }

        while(@cmds) {
                undef $pid;
                FORK: {
                        my $cmd = shift (@cmds);
                        if($pid = fork) {
                                # parent here
                                $num_children++;
                                wait;
                                $num_children--;
                                next;

                        } elsif(defined $pid) { # $pid is zero here if defined
                                print STDERR $cmd, "\n";
                                system $cmd;
                                exit;

                        } else {
                                #weird fork error
                                die "Can't fork: $!\n";
                        }
                }
        }
        wait while $num_children--;
}