annotate prep_bams_parallel @ 0:6b673ffd9e38 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:29:45 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use File::Basename;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use strict;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use warnings;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 @ARGV > 5 or die "Usage: $0 <num parallel processes> <output log> <reference.fa> <temporary directory> <deduped_realigned_bam_prefix> <input1.bam> [input2.bam] ...\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 my $num_procs = shift @ARGV;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 my $log = shift @ARGV;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my $ref = shift @ARGV;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 my $tmpdir = shift @ARGV;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 my $outfile_prefix = shift @ARGV;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 my $progdir = dirname($0);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 # Split the input data into separate files for each chromosome, so we can process them in parallel
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my $chr_files_hashref = &generate_chr_split_bams($tmpdir, @ARGV);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my (@cmds, @logs);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 for my $chr (keys %$chr_files_hashref){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 push @cmds, "$progdir/prep_bams $log.$chr $ref $tmpdir $outfile_prefix.$chr.bam ".$chr_files_hashref->{$chr};
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 push @logs, "$log.$chr";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 # Run the chrs in parallel to the desrired degree
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 &run_cmds($num_procs, @cmds);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 # Remove the split input files, as they were only needed temporarily
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 for my $chr_file (values %$chr_files_hashref){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 unlink $chr_file or warn "Cannot remove temporary file $chr_file: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 # Merge the logs
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 open(LOG, ">$log")
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 or die "Could not open output log $log for writing: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 for my $l (@logs){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 open(L, $l)
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 or warn "Log file $l was expected but not found, data processing may have been incomplete.\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 print LOG <L>;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 close(L);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 unlink $l;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 close(LOG);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 # Merge the input BAMsinto one file, then split them into per contig BAM files for downstream parallelization
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 sub generate_chr_split_bams{
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 my ($tempdir, @input_bams) = @_;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 my $temp_input_bam = $input_bams[0];
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 if(@input_bams > 1){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 $temp_input_bam = "$tempdir/prep_bams_parallel_$$.bam";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 system ("samtools merge $temp_input_bam ".join(" ", @input_bams)) >> 8
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 and die "Could not successfully run samtools merge: exit status was $?\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 if(not -e "$temp_input_bam.bai" or -z "$temp_input_bam.bai"){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 system ("samtools index $temp_input_bam") >> 8
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 and die "Could not successfully run samtools index: exit status was $?\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60 my @output_contigs;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 open(SAM, "samtools view -H $temp_input_bam |")
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 or die "Cannot run samtools view on $temp_input_bam: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 while(<SAM>){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 if(/^\@SQ\tSN:(\S+)/){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 push @output_contigs, $1;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 close(SAM);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 my @split_bamfiles;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 my %contig2outfile;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 for my $output_contig (@output_contigs){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 open(SAM, "samtools view -h $temp_input_bam '$output_contig'|")
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 and die "Cannot execute samtools view successfully: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 my @headers;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 # read through the headers, then if we get at least one mapped read, open a file to print it
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 while(<SAM>){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 last if not /^\@/; # first real record
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 push @headers, $_;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 next if not defined $_; # got to EOF before a real record
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 my $outfile = "$tempdir/prep_bams_parallel_$$.".quotemeta($output_contig).".bam";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 open(OUT, "| samtools view -b -S - > $outfile")
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 or die "Cannot open $outfile for writing: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 print OUT @headers, $_;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 while(<SAM>){ # Copy over all lines. Not using list context as this may be a huge dataset and would needlessly take up memory
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 print OUT $_;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 close(SAM);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 close(OUT);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 unlink $temp_input_bam or warn "Could not remove $temp_input_bam: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 sub run_cmds {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 my ($max_cmds, @cmd) = @_;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 my ($num_children, $pid);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 # Initialize the number of child processes at 1, and increment it by one
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 # While it is less than $max_cmds
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 my $cmd = shift (@cmds);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 if($pid = fork) {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 # do nothing if parent
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 } elsif(defined $pid) { # $pid is zero here if defined
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 print STDERR $cmd, "\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 system $cmd;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 exit;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 } else {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 #weird fork error
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 die "Can't fork: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 while(@cmds) {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 undef $pid;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 FORK: {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 my $cmd = shift (@cmds);
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 if($pid = fork) {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 # parent here
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 $num_children++;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 wait;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 $num_children--;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 next;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 } elsif(defined $pid) { # $pid is zero here if defined
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 print STDERR $cmd, "\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 system $cmd;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 exit;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 } else {
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 #weird fork error
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 die "Can't fork: $!\n";
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 wait while $num_children--;
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 }
6b673ffd9e38 initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142