annotate bwape_parallel_optimized @ 0:8f4d4b4e8b04 default tip

init commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:41:11 -0600
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use strict;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use warnings;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use File::Basename;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 use File::Temp qw(:POSIX);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 use File::Spec qw(tmpdir);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9 @ARGV == 6 or die "Usage: $0 <# threads> <ref.fasta> <read1.fq[.gz]> <read2.fq[.gz]> <out bam prefix>\nMerges, then pair aligns reads using BWA 0.6.2\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11 my %config;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 my $dirname = dirname(__FILE__);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 my $tool_dir = shift @ARGV;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 #read config file
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 if(not -e "$tool_dir/miseq_fastq_bam.loc"){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16 system("cp $dirname/tool-data/miseq_fastq_bam.loc $tool_dir/miseq_fastq_bam.loc");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 open CONFIG, '<', "$tool_dir/miseq_fastq_bam.loc";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 while(<CONFIG>){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 (my $key, my $value) = split(/\s+/,$_);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 $config{$key} = $value;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 close CONFIG;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 my $threads = shift @ARGV;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 my $ref = $config{"ref_genome_dir"} . (shift @ARGV);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 my $read1 = shift @ARGV;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 my $read2 = shift @ARGV;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 my $outbam = shift @ARGV;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 my $log = "$outbam.align.log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31 $SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 open(STDERR, ">$log");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 #my $tmpdir = File::Spec->tmpdir();
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 my $tmpdir = ".";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 $ENV{"TMP_DIR"} = $tmpdir;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37 #my ($label, $dirs, $suffix) = fileparse($read1, ".fastq", ".fq");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 if(not -e "$ref.sa"){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 # TODO: Use NFS lock to avoid concurrent indexing of the same file?
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41 system("bwa index $ref &>> $log") >> 8 and die "bwa index had non-zero ($?) exit status for $ref, aborting\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 # Split the input into temporary chunks for parallel processing
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 my $tot_read1_size = -s $read1;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 # Estimate expansion of compressed FASTQ files. About 3.5 from sample data we have
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 my $compressed_reads = 0;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48 if(`file $read1` =~ /gzip/i){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 $compressed_reads = 1;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 $tot_read1_size *= 3.5;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 my $chunk_read_size = $tot_read1_size/$threads;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 my (@read1_chunk_tmpfiles, @read2_chunk_tmpfiles);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55 # run initial alignment of each end in parallel
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my $pid;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 my $read_so_far = 0;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 my $read_chunk_tmpfile_prefix = $$; #tmpnam();
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 my $tmpbam = basename($read_chunk_tmpfile_prefix);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 # Check the average num of lines in a chunk to ensure chunks are lined up between forward and reverse
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 my $lines_per_chunk = 0;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 open(READS1, ($compressed_reads ? "gzip -cd $read1 |" : $read1))
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 or die "Cannot ", ($compressed_reads ? "gunzip" : "read"), " $read1: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 while(<READS1>){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 $read_so_far += length($_);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 # FASTQ records have 4 lines...only split if at the end of a record
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68 if($.%4 == 1 and $read_so_far > $chunk_read_size){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 last;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 $lines_per_chunk++;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 close(READS1);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 my $chunk_count = 0;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 my @align_cmds;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 if($pid = fork) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 # parent here
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 my $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 open(READS1, ($compressed_reads ? "gzip -cd $read1 |" : $read1))
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80 or die "Cannot ", ($compressed_reads ? "gunzip" : "read"), " $read1: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 while(<READS1>){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82 # FASTQ records have 4 lines...only split if at the end of a record
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 if($. % $lines_per_chunk == 1){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 if(defined $read1_chunk_tmpfile){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 push @align_cmds, "bwa aln -n 0.1 -f $read1_chunk_tmpfile.sai $ref $read1_chunk_tmpfile &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 $read1_chunk_tmpfile = $read_chunk_tmpfile_prefix.".".int($chunk_count++).".forward";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 push @read1_chunk_tmpfiles, $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 open(FASTQ_CHUNK, ">$read1_chunk_tmpfile")
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 or die "Cannot open $read1_chunk_tmpfile for writing: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 print FASTQ_CHUNK $_;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 close(FASTQ_CHUNK);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 push @align_cmds, "bwa aln -n 0.1 -f $read1_chunk_tmpfile.sai $ref $read1_chunk_tmpfile &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 wait;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98 elsif(defined $pid) { # $pid is zero in child process if defined
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99 my $read2_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 open(READS2, ($compressed_reads ? "gzip -cd $read2 |" : $read2))
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 or die "Cannot ", ($compressed_reads ? "gunzip" : "read"), " $read2: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 while(<READS2>){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 # FASTQ records have 4 lines...only split if at the end of a record
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 if($. % $lines_per_chunk == 1){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 $read2_chunk_tmpfile = $read_chunk_tmpfile_prefix.".".int($chunk_count++).".reverse";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 open(FASTQ_CHUNK, ">$read2_chunk_tmpfile")
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 or die "Cannot open $read2_chunk_tmpfile for writing: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 print FASTQ_CHUNK $_;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 close(FASTQ_CHUNK);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 exit(0);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 else {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 die "Can't fork: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 # repeat the commands for the reverse reads
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 my @final_align_cmds = (@align_cmds, map {my $r = $_; $r =~ s/\.forward/.reverse/g; $r} @align_cmds);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 &run_cmds($threads, 5, @final_align_cmds);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122 my @pair_cmds;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123 my @bam_chunk_sorted_tmpfiles;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 for my $read1_chunk_tmpfile (@read1_chunk_tmpfiles){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125 my $read2_chunk_tmpfile = $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 $read2_chunk_tmpfile =~ s/\.forward$/.reverse/;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 my $bam_chunk_tmpfile = $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 $bam_chunk_tmpfile =~ s/\.forward$/.pe.bam/;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 my $bam_chunk_sorted_tmpfile_prefix = $bam_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 $bam_chunk_sorted_tmpfile_prefix =~ s/\.bam$/.sorted/;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131 push @bam_chunk_sorted_tmpfiles, "$bam_chunk_sorted_tmpfile_prefix.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 #push @cmds, "/export/achri_data/programs/bwa-0.6.2/bwa sampe -r \"\@RG\tID:NA\tPL:illumina\tPU:NA\tLB:NA\tSM:$outbam\" $ref $read1_chunk_tmpfile.sai $read2_chunk_tmpfile.sai $read1_chunk_tmpfile $read2_chunk_tmpfile | samtools view -S -b - | samtools sort - $bam_chunk_sorted_tmpfile_prefix 2>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 push @pair_cmds, "bwa sampe -r \"\@RG\tID:NA\tPL:illumina\tPU:NA\tLB:NA\tSM:$outbam\" $ref $read1_chunk_tmpfile.sai $read2_chunk_tmpfile.sai $read1_chunk_tmpfile $read2_chunk_tmpfile | samtools view -S -b - > $bam_chunk_tmpfile; samtools sort $bam_chunk_tmpfile $bam_chunk_sorted_tmpfile_prefix &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 &run_cmds($threads, 5, @pair_cmds);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 system("samtools merge $tmpdir/$tmpbam.bam ".join(" ", @bam_chunk_sorted_tmpfiles)."&>> $log");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 system("samtools index $tmpdir/$tmpbam.bam &>> $log") >> 8 and die "samtools index had non-zero ($?) exit status for $tmpdir/$tmpbam.bam, aborting\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140 # post-process the alignments
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 if($pid = fork){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 system("samtools flagstat $tmpdir/$tmpbam.bam > $outbam.before_dedup.flagstat.txt 2>> $log");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 wait(); # for rmdup to finish
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145 elsif(defined $pid) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 system("samtools rmdup $tmpdir/$tmpbam.bam $tmpdir/$tmpbam.rmdup.bam &>> $log")>>8 and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 exit(0);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149 else{ # Can't fork, do in serial
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 system("samtools flagstat $tmpdir/$tmpbam.bam > $outbam.before_dedup.flagstat.txt 2>> $log");
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 system("samtools rmdup $tmpdir/$tmpbam.bam $tmpdir/$tmpbam.rmdup.bam &>> $log")>>8 and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 die "Samtools rmdup did not generate the expected output file" if not -e "$tmpdir/$tmpbam.rmdup.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 die "Samtools generated a blank output file" if -z "$tmpdir/$tmpbam.rmdup.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 system "samtools index $tmpdir/$tmpbam.rmdup.bam &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 # Peak into the BAM to see if it's got Illumina encoded quality values, which will require an extra flag for GATK
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 my $is_illumina_encoded_qual = 1;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 my $num_mapped = 0;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 if(open(SAMTOOLS, "samtools view $tmpdir/$tmpbam.rmdup.bam |")){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 while(<SAMTOOLS>){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 my @F = split /\t/, $_;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 next unless $F[2] ne "*"; # ignore unmapped reads
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 my @quals = map {ord($_)} split(//, $F[10]);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 if(grep {$_ < 64} @quals){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 $is_illumina_encoded_qual = 0; last;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 last if ++$num_mapped > 100; # only look at the first 100 mapped reads
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 } # take our chances if samtools call failed that it's not illumina encoded, which will fail quickly in GATK anyways
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 close(SAMTOOLS);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 system "java -Xmx4G -jar $diname/GenomeAnalysisTK.jar -nt $threads -I $tmpdir/$tmpbam.rmdup.bam -R $ref -T RealignerTargetCreator -o $tmpdir/$tmpbam.rmdup.gatk_realigner.intervals &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 die "GATK did not generate the expected intervals file\n" if not -e "$tmpdir/$tmpbam.rmdup.gatk_realigner.intervals";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 system "java -Xmx4G -jar $dirname/GenomeAnalysisTK.jar -I $tmpdir/$tmpbam.rmdup.bam -R $ref -T IndelRealigner -targetIntervals $tmpdir/$tmpbam.rmdup.gatk_realigner.intervals -o $outbam.bam ".
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 ($is_illumina_encoded_qual?"--fix_misencoded_quality_scores":"")." &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 die "GATK did not generate the expected realigned BAM output file\n" if not -e "$outbam.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 die "GATK generated a blank output file\n" if -z "$outbam.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 my $outfile_bai = "$outbam.bai";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 if(not -e $outfile_bai){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 if(not -e "$outbam.bam.bai"){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 system "samtools index $outbam.bam &>> $log";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 system "cp $outbam.bam.bai $outfile_bai";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 else{
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187 system "cp $outfile_bai $outbam.bam.bai"; # some tools expect name.bai, some expect name.bam.bai, so provide both
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 &cleanup;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 sub cleanup{
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 print STDERR "Cleaning up temp files for aborted bwa run: @_" if @_;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 for my $read1_chunk_tmpfile (@read1_chunk_tmpfiles){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 my $read2_chunk_tmpfile = $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 $read2_chunk_tmpfile =~ s/\.forward$/.reverse/;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 my $bam_chunk_tmpfile = $read1_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 $bam_chunk_tmpfile =~ s/\.forward$/.pe.bam/;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 unlink $read1_chunk_tmpfile, "$read1_chunk_tmpfile.sai", $read2_chunk_tmpfile, "$read2_chunk_tmpfile.sai", $bam_chunk_tmpfile;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 unlink @bam_chunk_sorted_tmpfiles;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 unlink "$tmpdir/$tmpbam.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 unlink "$tmpdir/$tmpbam.bam.bai";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 unlink "$tmpdir/$tmpbam.rmdup.bam";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 unlink "$tmpdir/$tmpbam.rmdup.bam.bai";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 unlink "$tmpdir/$tmpbam.rmdup.gatk_realigner.intervals";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 sub run_cmds{
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 my ($max_cmds, $sleep_time, @cmds) = @_;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 my ($num_children, $pid);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 for($num_children = 1; $num_children < $max_cmds && @cmds; $num_children++){
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 my $cmd = shift (@cmds);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 if($pid = fork) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 sleep $sleep_time; # seems to be a major system time spike if all commands launched in quick succession.
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 } elsif(defined $pid) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 #print STDERR "$cmd\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220 system $cmd;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 #print STDERR "Finished $cmd\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222 exit;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 } else {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224 die "Can't fork: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228 while(@cmds) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 undef $pid;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 FORK: {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 my $cmd = shift (@cmds);
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 if($pid = fork) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233 $num_children++;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 wait;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 $num_children--;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 next;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 } elsif(defined $pid) {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 system $cmd;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 exit;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 } else {
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 die "Can't fork: $!\n";
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 wait while $num_children--;
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 }
8f4d4b4e8b04 init commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249