Mercurial > repos > yusuf > dedup_realign_bam
changeset 0:6b673ffd9e38 default tip
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:29:45 -0600 |
parents | |
children | |
files | DedupRealignBAM.xml GenomeAnalysisTK.jar prep_bams prep_bams_parallel tool-data/dedup_realign_bam.loc |
diffstat | 5 files changed, 244 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
<?xml version="1.0"?>

<!-- Galaxy tool wrapper around the prep_bams script: optionally merges
     multiple BAM inputs, deduplicates with samtools, and realigns indel
     regions with GATK, producing a single cleaned BAM plus a log. -->
<tool id="dedup_realign_bam_1" name="Deduplicate and realign a BAM file(s)">
  <description>using samtools and GATK</description>
  <version_string>echo 1.0.0</version_string>
  <!-- Argument order must match what prep_bams shifts off @ARGV:
       tool-data dir, log, reference FASTA, temp dir, output BAM, then the
       input BAM(s); extra BAMs from the repeat block are appended last. -->
  <command interpreter="perl">prep_bams $__tool_data_path__ $messages $ref_genome\.fa ${__new_file_path__} $prepped_bam_file $input_bam_file
    #for $i in $inputs
    ${i.input}
    #end for
  </command>
  <inputs>
    <param name="ref_genome" type="genomebuild" label="Reference genome" help="against which the reads were mapped"/>
    <param format="bam" name="input_bam_file" type="data" label="Source BAM (mapped reads) file" help="Need to merge multiple input BAM files for one sample? Use controls below."/>
    <!-- Optional additional BAMs; when present, prep_bams merges them with
         the primary input before deduplication. -->
    <repeat name="inputs" title="Input BAM Files">
      <param name="input" label="Additional BAM file to merge" type="data" format="bam" />
    </repeat>

  </inputs>
  <outputs>
    <data name="prepped_bam_file" format="bam" type="data" label="Deduped and realigned mapped reads"/>
    <data name="messages" format="text" type="data" label="Prep process log messages"/>
  </outputs>

  <tests/>

  <help>
    This tool runs a set of processes to first optionally merge BAMs, then deduplicate (samtools) and realign (GATK) the reads mapped to a reference genome.
    This is important for genotyping studies. While these steps could be run independently in a workflow,
    an enormous amount of intermediate data files are generated. This tool cleans up those intermediate files.
  </help>

</tool>
#!/usr/bin/env perl

# prep_bams: optionally merge several BAM files, deduplicate the result with
# samtools rmdup, then realign reads around indels with GATK, removing every
# intermediate file on both success and abnormal exit.
#
# Usage: prep_bams <tool_data_dir> <output log> <reference.fa> <tmp dir>
#                  <deduped_realigned.bam> <input1.bam> [input2.bam] ...

use strict;
use warnings;
use File::Basename;

# Load key/value pairs (e.g. dbs_dir) from the tool's .loc config file,
# copying the packaged default into place on first run.
my %config;
my $dirname  = dirname(__FILE__);
my $tool_dir = shift @ARGV;

if(not -e "$tool_dir/dedup_realign_bam.loc"){
    system("cp $dirname/tool-data/dedup_realign_bam.loc $tool_dir/dedup_realign_bam.loc");
}
open(my $config_fh, '<', "$tool_dir/dedup_realign_bam.loc")
    or die "Cannot open config file $tool_dir/dedup_realign_bam.loc: $!\n";
while(<$config_fh>){
    next if /^#/;
    my ($key, $value) = split /\s+/, $_;
    $config{$key} = $value if defined $key and defined $value;
}
close($config_fh);

@ARGV > 4 or die "Usage: $0 <output log> <reference.fa> <temporary directory> <deduped_realigned.bam> <input1.bam> [input2.bam] ...\n";

my $log     = shift @ARGV;
my $ref     = $config{"dbs_dir"} . shift @ARGV;  # reference FASTA lives under the configured dbs_dir
my $tmpdir  = shift @ARGV;
my $outfile = shift @ARGV;

# Ensure temp files are removed even when we die or are signalled.
$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;

if(@ARGV > 1){
    # Multiple inputs: reconcile headers first, then merge into one temp BAM.
    system("merge_bam_headers $log $tmpdir/tmp$$.headers.sam ".join(" ", @ARGV))>>8
        and die "merge_bam_headers failed with exit status ", ($?>>8), ", please check the log\n";
    system("samtools merge -h $tmpdir/tmp$$.headers.sam $tmpdir/tmp$$.bam ".join(" ", @ARGV)." 2>> $log")>>8
        and die "Samtools merge failed with exit status ", ($?>>8), ", please check the log\n";
    system("samtools flagstat $tmpdir/tmp$$.bam > $outfile.before_dedup.flagstat.txt");
    system("samtools rmdup $tmpdir/tmp$$.bam $tmpdir/tmp$$.rmdup.bam 2>> $log")>>8
        and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
    unlink "$tmpdir/tmp$$.bam";
}
else{
    system("samtools flagstat $ARGV[0] > $outfile.before_dedup.flagstat.txt");
    system("samtools rmdup $ARGV[0] $tmpdir/tmp$$.rmdup.bam 2>> $log")>>8
        and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
}
die "Samtools rmdup did not generate the expected output file" if not -e "$tmpdir/tmp$$.rmdup.bam";
die "Samtools generated a blank output file" if -z "$tmpdir/tmp$$.rmdup.bam";

system "samtools index $tmpdir/tmp$$.rmdup.bam";

# Two-pass GATK indel realignment: find candidate intervals, then realign.
system "java -Xmx4G -jar $dirname/GenomeAnalysisTK.jar -I $tmpdir/tmp$$.rmdup.bam -R $ref -T RealignerTargetCreator -o $tmpdir/tmp$$.rmdup.gatk_realigner.intervals 2>> $log";
die "GATK did not generate the expected intervals file" if not -e "$tmpdir/tmp$$.rmdup.gatk_realigner.intervals";
system "java -Xmx4G -jar $dirname/GenomeAnalysisTK.jar -I $tmpdir/tmp$$.rmdup.bam -R $ref -T IndelRealigner -targetIntervals $tmpdir/tmp$$.rmdup.gatk_realigner.intervals -o $outfile 2>> $log";
die "GATK did not generate the expected realigned BAM output file" if not -e $outfile;
die "GATK generated a blank output file" if -z $outfile;

# Some tools expect name.bai, some expect name.bam.bai, so provide both.
my $outfile_bai = $outfile;
$outfile_bai =~ s/\.bam$/.bai/;
if(not -e $outfile_bai){
    if(not -e "$outfile.bai"){
        system "samtools index $outfile";
    }
    system "cp $outfile.bai $outfile_bai";
}
else{
    system "cp $outfile_bai $outfile.bai";
}
&cleanup;

# Print any handler message (empty on normal exit), then remove every
# temporary file this run may have created.
sub cleanup{
    print STDERR @_;
    # BUGFIX: path was "$tmpdir/$tmpdir/tmp$$.headers.sam" (directory doubled),
    # so the merged-header temp file was never actually removed.
    unlink "$tmpdir/tmp$$.headers.sam";
    unlink "$tmpdir/tmp$$.bam";
    unlink "$tmpdir/tmp$$.rmdup.bam";
    unlink "$tmpdir/tmp$$.rmdup.bam.bai";
    unlink "$tmpdir/tmp$$.rmdup.gatk_realigner.intervals";
}
#!/usr/bin/env perl

# prep_bams_parallel: split the input BAM(s) into one BAM per contig, run
# prep_bams on each contig concurrently (bounded process pool), then merge
# the per-contig logs into the single requested log file.
#
# Usage: prep_bams_parallel <num parallel processes> <output log> <reference.fa>
#        <temporary directory> <deduped_realigned_bam_prefix> <input1.bam> [input2.bam] ...

use strict;
use warnings;
use File::Basename;

@ARGV > 5 or die "Usage: $0 <num parallel processes> <output log> <reference.fa> <temporary directory> <deduped_realigned_bam_prefix> <input1.bam> [input2.bam] ...\n";

my $num_procs      = shift @ARGV;
my $log            = shift @ARGV;
my $ref            = shift @ARGV;
my $tmpdir         = shift @ARGV;
my $outfile_prefix = shift @ARGV;

my $progdir = dirname($0);

# Split the input data into separate files for each chromosome, so we can
# process them in parallel.
my $chr_files_hashref = &generate_chr_split_bams($tmpdir, @ARGV);

my (@cmds, @logs);
for my $chr (keys %$chr_files_hashref){
    # NOTE(review): prep_bams expects a tool-data directory as its first
    # argument before the log; this invocation omits it — confirm against
    # the prep_bams interface actually deployed.
    push @cmds, "$progdir/prep_bams $log.$chr $ref $tmpdir $outfile_prefix.$chr.bam ".$chr_files_hashref->{$chr};
    push @logs, "$log.$chr";
}

# Run the chromosomes in parallel to the desired degree.
&run_cmds($num_procs, @cmds);

# Remove the split input files, as they were only needed temporarily.
for my $chr_file (values %$chr_files_hashref){
    unlink $chr_file or warn "Cannot remove temporary file $chr_file: $!\n";
}

# Concatenate the per-contig logs into the single requested log file.
open(my $log_fh, '>', $log)
    or die "Could not open output log $log for writing: $!\n";
for my $chr_log (@logs){
    if(open(my $in_fh, '<', $chr_log)){
        print $log_fh <$in_fh>;
        close($in_fh);
    }
    else{
        warn "Log file $chr_log was expected but not found, data processing may have been incomplete.\n";
    }
    unlink $chr_log;
}
close($log_fh);

# Merge the input BAMs into one file, then split it into per-contig BAM files
# for downstream parallelization. Returns a hashref: contig name -> BAM path.
# Contigs with no mapped reads are skipped.
sub generate_chr_split_bams{
    my ($tempdir, @input_bams) = @_;

    my $merged_is_temporary = @input_bams > 1;
    my $temp_input_bam = $input_bams[0];
    if($merged_is_temporary){
        $temp_input_bam = "$tempdir/prep_bams_parallel_$$.bam";
        system("samtools merge $temp_input_bam ".join(" ", @input_bams)) >> 8
            and die "Could not successfully run samtools merge: exit status was $?\n";
    }
    # Region queries below require a BAM index.
    if(not -e "$temp_input_bam.bai" or -z "$temp_input_bam.bai"){
        system("samtools index $temp_input_bam") >> 8
            and die "Could not successfully run samtools index: exit status was $?\n";
    }

    # Collect contig names from the @SQ lines of the BAM header.
    my @output_contigs;
    open(my $header_fh, '-|', "samtools view -H $temp_input_bam")
        or die "Cannot run samtools view on $temp_input_bam: $!\n";
    while(<$header_fh>){
        if(/^\@SQ\tSN:(\S+)/){
            push @output_contigs, $1;
        }
    }
    close($header_fh);

    my %contig2outfile;
    for my $output_contig (@output_contigs){
        # BUGFIX: the original used "open(...) and die", which aborted on
        # success and silently continued on failure.
        open(my $sam_fh, '-|', "samtools view -h $temp_input_bam '$output_contig'")
            or die "Cannot execute samtools view successfully: $!\n";
        my @headers;
        # Read through the headers; when the loop exits early, $_ holds the
        # first real alignment record.
        while(<$sam_fh>){
            last if not /^\@/;
            push @headers, $_;
        }
        next if not defined $_;    # reached EOF before any alignment record

        # BUGFIX: quotemeta() embedded backslashes into the filesystem path;
        # sanitize unusual contig characters instead.
        (my $safe_contig = $output_contig) =~ s/[^\w.-]/_/g;
        my $outfile = "$tempdir/prep_bams_parallel_$$.$safe_contig.bam";
        open(my $out_fh, '|-', "samtools view -b -S - > $outfile")
            or die "Cannot open $outfile for writing: $!\n";
        print $out_fh @headers, $_;
        while(<$sam_fh>){
            # Copy line by line; the dataset may be huge, so avoid reading it
            # all into memory in list context.
            print $out_fh $_;
        }
        close($sam_fh);
        close($out_fh);
        # BUGFIX: the contig -> file mapping was never recorded.
        $contig2outfile{$output_contig} = $outfile;
    }

    # BUGFIX: only delete the merged BAM if we created it; the original
    # unconditionally unlinked, destroying the caller's input file when a
    # single input BAM was supplied.
    if($merged_is_temporary){
        unlink $temp_input_bam or warn "Could not remove $temp_input_bam: $!\n";
        unlink "$temp_input_bam.bai";
    }

    # BUGFIX: the original fell off the end without returning the mapping,
    # so the caller's %$chr_files_hashref dereference could not work.
    return \%contig2outfile;
}

# Run the given shell commands, keeping at most $max_cmds child processes
# alive at any time.
sub run_cmds {
    # BUGFIX: the original unpacked @_ into @cmd but then consumed the
    # file-scoped @cmds, mutating the caller's array; use the local copy.
    my ($max_cmds, @queue) = @_;

    my ($num_children, $pid);

    # Prime the pool with the initial batch of children.
    for($num_children = 1; $num_children < $max_cmds && @queue; $num_children++){
        my $cmd = shift @queue;
        if($pid = fork){
            # parent: nothing to do, child is running
        } elsif(defined $pid){   # $pid is zero here: we are the child
            print STDERR $cmd, "\n";
            system $cmd;
            exit;
        } else {
            die "Can't fork: $!\n";
        }
    }

    # Launch one replacement child each time a running child finishes.
    while(@queue){
        undef $pid;
        my $cmd = shift @queue;
        if($pid = fork){
            # parent: wait for any child to finish before looping again
            $num_children++;
            wait;
            $num_children--;
            next;
        } elsif(defined $pid){   # child
            print STDERR $cmd, "\n";
            system $cmd;
            exit;
        } else {
            die "Can't fork: $!\n";
        }
    }

    # Reap all remaining children before returning.
    wait while $num_children--;
}