annotate freebayes_parallel @ 0:1a23ea467feb default tip

intial commit
author Yusuf Ali <ali@yusuf.email>
date Thu, 26 Mar 2015 09:36:17 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use File::Temp;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 @ARGV >= 3 or die "Usage: $0 <input.bam> <num processes> <output.vcf> [freebayes options]\nRuns FreeBayes 0.8.7 separately for each reference chromosome, with as many concurrent processes as specified\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 my $in_bam = shift @ARGV;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 my $num_procs = shift @ARGV;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my $out_vcf = shift @ARGV;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 $SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 # Lines look some thing like "@SQ SN:chr1 LN:249250621"
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 open(SAM_HEADERS, "samtools view -H $in_bam |")
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 or die "Cannot run samtools on $in_bam: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 my %seq2length;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my %seqname2orig_order;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 while(<SAM_HEADERS>){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 if(/^\@SQ\s+SN:(\S+)\s+LN:(\d+)/){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 $seq2length{$1} = $2;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 $seqname2orig_order{$1} = $.;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 close(SAM_HEADERS);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 # Make sure the index is available and has the name FreeBayes expects
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $bamfile_stem = $in_bam;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 $bamfile_stem =~ s/\.[^.]+$//; # remove last file extension, if there is one
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 if(not -e "$bamfile_stem.bai"){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 if(not -e "$in_bam.bai"){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 system("samtools index $in_bam") >> 8 and die "Cannot index $in_bam: samtools had exit startus $?\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 system("ln -s $in_bam.bai $bamfile_stem.bai")>> 8 and die "Cannot link exsiting BAM index $in_bam.bai to FreeBayes required file name $bamfile_stem.bai: ln had exit status $?\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 my @cmds;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 my @tmp_outfiles;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 my %tmpfile2orig_order;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 # Sort contigs from largest to smallest for scheduling efficiency
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 for my $seq_name (sort {$seq2length{$b} <=> $seq2length{$a}} keys %seq2length){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 my ($tmp_fh, $tmp_filename) = tmpnam();
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 push @cmds, "/export/achri_data/programs/freebayes0.8.7 -b $in_bam -v $tmp_filename " . join(" ", @ARGV) . " -r $seq_name:1..$seq2length{$seq_name}";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 push @tmp_outfiles, $tmp_filename;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 $tmpfile2orig_order{$tmp_filename} = $seqname2orig_order{$seq_name};
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 open(OUT_VCF, ">$out_vcf")
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 or die "Cannot open $out_vcf for writing: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 run_cmds($num_procs, @cmds);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 # Grab output header from first temp output file
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 open(H, $tmp_outfiles[0])
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 or die "Cannot open $tmp_outfiles[0] for reading: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 while(<H>){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 last if not /^#/; # end of headers
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 # mod for self-referencing meta-data
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 if(/^##commandline=/){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 print OUT_VCF "##commandline=$0 $in_bam $num_procs $out_vcf ".join(" ", @ARGV)."\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 else{
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 # Otherwise verbatim
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 print OUT_VCF $_;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 close(H);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 # Concatenate the temporary results into a final outfile
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 for my $tmp_outfile (sort {$tmpfile2orig_order{$a} <=> $tmpfile2orig_order{$b}} @tmp_outfiles){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 open(TMP_VCF, $tmp_outfile)
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 or die "Cannot open $tmp_outfile for reading: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 while(<TMP_VCF>){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 print OUT_VCF unless /^#/;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 close(TMP_VCF);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 close(OUT_VCF);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 &cleanup;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 sub cleanup{
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 unlink @tmp_outfiles;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 sub run_cmds{
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 my ($max_cmds, @cmd) = @_;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 my ($num_children, $pid);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 # initialize the number of child processes at 1, and increment it by one
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 #while it is less than $max_cmds
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 my $cmd = shift (@cmds);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 if($pid = fork) {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 # do nothing if parent
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 } elsif(defined $pid) { # $pid is zero here if defined
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 print STDERR $cmd, "\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 system $cmd;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 exit;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 } else {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 #weird fork error
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 die "Can't fork: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 while(@cmds) {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 undef $pid;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 FORK: {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 my $cmd = shift (@cmds);
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 if($pid = fork) {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 # parent here
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 $num_children++;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 wait;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 $num_children--;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 next;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 } elsif(defined $pid) { # $pid is zero here if defined
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 print STDERR $cmd, "\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 system $cmd;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 exit;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 } else {
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 #weird fork error
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 die "Can't fork: $!\n";
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 wait while $num_children--;
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 }
1a23ea467feb intial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132