diff freebayes_parallel @ 0:1a23ea467feb default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/freebayes_parallel	Thu Mar 26 09:36:17 2015 -0600
@@ -0,0 +1,132 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use File::Temp;
+
+@ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n"; 
+
+my $in_bam = shift @ARGV;
+my $num_procs = shift @ARGV;
+my $out_vcf = shift @ARGV;
+
+$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;
+# Lines look some thing like "@SQ	SN:chr1	LN:249250621"
+open(SAM_HEADERS, "samtools view -H $in_bam |")
+  or die "Cannot run samtools on $in_bam: $!\n";
+my %seq2length;
+my %seqname2orig_order;
+while(<SAM_HEADERS>){
+  if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){
+    $seq2length{$1} = $2;
+    $seqname2orig_order{$1} = $.;
+  }
+}
+close(SAM_HEADERS);
+
+# Make sure the index  is available and has the name FreeBayes expects
+my $bamfile_stem = $in_bam;
+$bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one
+if(not -e "$bamfile_stem.bai"){
+  if(not -e "$in_bam.bai"){
+    system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n"; 
+  }
+  system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n";
+}
+
+my @cmds;
+my @tmp_outfiles;
+my %tmpfile2orig_order;
+# Sort contigs from largest to smallest for scheduling efficiency
+for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){
+  my ($tmp_fh, $tmp_filename) = tmpnam();
+  push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}";
+  push @tmp_outfiles, $tmp_filename;
+  $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name};
+}
+
+open(OUT_VCF, ">$out_vcf")
+  or die "Cannot open $out_vcf for writing: $!\n";
+
+run_cmds($num_procs, @cmds);
+
+# Grab output header from first temp output file
+open(H, $tmp_outfiles[0])
+  or die "Cannot open $tmp_outfiles[0] for reading: $!\n";
+while(<H>){
+  last if not /^#/; # end of headers
+  # mod for self-referencing meta-data
+  if(/^##commandline=/){
+    print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n";
+  }
+  else{
+    # Otherwise verbatim
+    print OUT_VCF $_;
+  }
+
+}
+close(H);
+
+# Concatenate the temporary results into a final outfile
+for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){
+  open(TMP_VCF, $tmp_outfile) 
+    or die "Cannot open $tmp_outfile for reading: $!\n";
+  while(<TMP_VCF>){
+    print OUT_VCF unless /^#/;
+  }
+  close(TMP_VCF);
+}
+close(OUT_VCF);
+&cleanup;
+
+sub cleanup{
+  unlink @tmp_outfiles;
+}
+sub run_cmds{
+
+  my ($max_cmds, @cmd) = @_;
+
+  my ($num_children, $pid);
+
+        for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
+        # initialize the number of child processes at 1, and increment it by one
+        #while it is less than $max_cmds
+
+                my $cmd = shift (@cmds);
+                if($pid = fork) {
+                        # do nothing if parent
+                } elsif(defined $pid) { # $pid is zero here if defined
+                        print STDERR $cmd, "\n";
+                        system $cmd;
+                        exit;
+                } else {
+                        #weird fork error
+                        die "Can't fork: $!\n";
+                }
+        }
+
+        while(@cmds) {
+                undef $pid;
+                FORK: {
+                        my $cmd = shift (@cmds);
+                        if($pid = fork) {
+                                # parent here
+                                $num_children++;
+                                wait;
+                                $num_children--;
+                                next;
+
+                        } elsif(defined $pid) { # $pid is zero here if defined
+                                print STDERR $cmd, "\n";
+                                system $cmd;
+                                exit;
+
+                        } else {
+                                #weird fork error
+                                die "Can't fork: $!\n";
+                        }
+                }
+        }
+        wait while $num_children--;
+}
+