Mercurial > repos > yusuf > miseq_bam_variants
diff freebayes_parallel @ 0:1a23ea467feb default tip
intial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Thu, 26 Mar 2015 09:36:17 -0600 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/freebayes_parallel Thu Mar 26 09:36:17 2015 -0600 @@ -0,0 +1,132 @@ +#!/usr/bin/env perl + +use strict; +use warnings; +use File::Temp; + +@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n"; + +my $in_bam = shift @ARGV; +my $num_procs = shift @ARGV; +my $out_vcf = shift @ARGV; + +$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup; +# Lines look some thing like "@SQ SN:chr1 LN:249250621" +open(SAM_HEADERS, "samtools view -H $in_bam |") + or die "Cannot run samtools on $in_bam: $!\n"; +my %seq2length; +my %seqname2orig_order; +while(<SAM_HEADERS>){ + if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){ + $seq2length{$1} = $2; + $seqname2orig_order{$1} = $.; + } +} +close(SAM_HEADERS); + +# Make sure the index is available and has the name FreeBayes expects +my $bamfile_stem = $in_bam; +$bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one +if(not -e "$bamfile_stem.bai"){ + if(not -e "$in_bam.bai"){ + system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n"; + } + system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n"; +} + +my @cmds; +my @tmp_outfiles; +my %tmpfile2orig_order; +# Sort contigs from largest to smallest for scheduling efficiency +for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){ + my ($tmp_fh, $tmp_filename) = tmpnam(); + push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}"; + push @tmp_outfiles, $tmp_filename; + $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name}; +} + +open(OUT_VCF, ">$out_vcf") + or die "Cannot open $out_vcf for writing: $!\n"; + +run_cmds($num_procs, @cmds); + +# Grab output header from first temp output file +open(H, $tmp_outfiles[0]) + or die "Cannot open $tmp_outfiles[0] for reading: $!\n"; +while(<H>){ + last if not /^#/; # end of headers + # mod for self-referencing meta-data + if(/^##commandline=/){ + print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n"; + } + else{ + # Otherwise verbatim + print OUT_VCF $_; + } + +} +close(H); + +# Concatenate the temporary results into a final outfile +for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){ + open(TMP_VCF, $tmp_outfile) + or die "Cannot open $tmp_outfile for reading: $!\n"; + while(<TMP_VCF>){ + print OUT_VCF unless /^#/; + } + close(TMP_VCF); +} +close(OUT_VCF); +&cleanup; + +sub cleanup{ + unlink @tmp_outfiles; +} +sub run_cmds{ + + my ($max_cmds, @cmd) = @_; + + my ($num_children, $pid); + + for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){ + # initialize the number of child processes at 1, and increment it by one + #while it is less than $max_cmds + + my $cmd = shift (@cmds); + if($pid = fork) { + # do nothing if parent + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + + while(@cmds) { + undef $pid; + FORK: { + my $cmd = shift (@cmds); + if($pid = fork) { + # parent here + $num_children++; + wait; + $num_children--; + next; + + } elsif(defined $pid) { # $pid is zero here if defined + print STDERR $cmd, "\n"; + system $cmd; + exit; + + } else { + #weird fork error + die "Can't fork: $!\n"; + } + } + } + wait while $num_children--; +} +