changeset 0:6b673ffd9e38 default tip

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:29:45 -0600
parents
children
files DedupRealignBAM.xml GenomeAnalysisTK.jar prep_bams prep_bams_parallel tool-data/dedup_realign_bam.loc
diffstat 5 files changed, 244 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DedupRealignBAM.xml	Wed Mar 25 13:29:45 2015 -0600
@@ -0,0 +1,32 @@
<?xml version="1.0"?>

<!-- Galaxy tool wrapper: optionally merges, then deduplicates (samtools)
     and realigns (GATK) BAM files via the bundled prep_bams Perl script. -->
<tool id="dedup_realign_bam_1" name="Deduplicate and realign a BAM file(s)">
  <description>using samtools and GATK</description>
  <!-- NOTE(review): Galaxy's usual tag for this is <version_command>;
       confirm <version_string> is honoured by the target Galaxy version. -->
  <version_string>echo 1.0.0</version_string>
  <!-- Argument order must match what prep_bams shifts off @ARGV:
       tool-data dir, log, reference FASTA name, temp dir, output BAM,
       then one or more input BAMs (the repeat below appends extras). -->
  <command interpreter="perl">prep_bams $__tool_data_path__ $messages $ref_genome\.fa ${__new_file_path__} $prepped_bam_file $input_bam_file
      #for $i in $inputs
        ${i.input}
      #end for 
  </command>
  <inputs>
    <param name="ref_genome" type="genomebuild" label="Reference genome" help="against which the reads were mapped"/>
    <param format="bam" name="input_bam_file" type="data" label="Source BAM (mapped reads) file" help="Need to merge multiple input BAM files for one sample? Use controls below."/>
    <repeat name="inputs" title="Input BAM Files">
      <param name="input" label="Additional BAM file to merge" type="data" format="bam" />
    </repeat>

  </inputs>
  <outputs>
    <data name="prepped_bam_file" format="bam" type="data" label="Deduped and realigned mapped reads"/>
    <data name="messages" format="text" type="data" label="Prep process log messages"/>
  </outputs>

  <tests/>

  <help>
  This tool runs a set of processes to first optionally merge BAMs, then deduplicate (samtools) and realign (GATK) the reads mapped to a reference genome.  
  This is important for genotyping studies.  While these steps could be run independently in a workflow,
  an enormous amount of intermediate data files are generated.  This tool cleans up those intermediate files.
 </help>

</tool>
Binary file GenomeAnalysisTK.jar has changed
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/prep_bams	Wed Mar 25 13:29:45 2015 -0600
@@ -0,0 +1,69 @@
#!/usr/bin/env perl

# Galaxy driver script: optionally merge multiple BAMs, then deduplicate
# (samtools rmdup) and realign around indels (GATK) against a reference
# genome.  Intermediate files live in the supplied temporary directory and
# are removed by cleanup() on normal exit or on a fatal signal.

use strict;
use warnings;
use File::Basename;

# Load key/value configuration (e.g. dbs_dir) from the tool-data .loc file,
# seeding it from the copy shipped with the tool on first run.
my %config;
my $dirname = dirname(__FILE__);
my $tool_dir = shift @ARGV;

if(not -e "$tool_dir/dedup_realign_bam.loc"){
  system("cp $dirname/tool-data/dedup_realign_bam.loc $tool_dir/dedup_realign_bam.loc");
}
open CONFIG, '<', "$tool_dir/dedup_realign_bam.loc"
  or die "Cannot open configuration file $tool_dir/dedup_realign_bam.loc: $!\n";
while(<CONFIG>){
  next if $_ =~ /^#/;
  (my $key, my $value) = split(/\s+/, $_);
  $config{$key} = $value;
}
close CONFIG;

@ARGV > 4 or die "Usage: $0 <output log> <reference.fa> <temporary directory> <deduped_realigned.bam> <input1.bam> [input2.bam] ...\n";

my $log = shift @ARGV;
# Fail early with a clear message rather than an "uninitialized value"
# warning if the .loc file did not define the reference database directory.
defined $config{"dbs_dir"}
  or die "Configuration file $tool_dir/dedup_realign_bam.loc does not define dbs_dir\n";
my $ref = $config{"dbs_dir"} . shift @ARGV;  # reference FASTA lives under the configured dbs dir
my $tmpdir = shift @ARGV;
my $outfile = shift @ARGV;
# Remove temporary files on abnormal termination as well as normal exit.
$SIG{__DIE__} = $SIG{INT} = $SIG{HUP} = $SIG{TERM} = $SIG{ABRT} = \&cleanup;
if(@ARGV > 1){
  # Multiple inputs: build a merged header, merge the BAMs, then dedup.
  system ("merge_bam_headers $log $tmpdir/tmp$$.headers.sam ".join(" ", @ARGV))>>8 and die "merge_bam_headers failed with exit status ", ($?>>8), ", please check the log\n";
  system("samtools merge -h $tmpdir/tmp$$.headers.sam $tmpdir/tmp$$.bam ".join(" ", @ARGV)." 2>> $log")>>8 and die "Samtools merge failed with exit status ", ($?>>8), ", please check the log\n";
  system("samtools flagstat $tmpdir/tmp$$.bam > $outfile.before_dedup.flagstat.txt");
  system("samtools rmdup $tmpdir/tmp$$.bam $tmpdir/tmp$$.rmdup.bam 2>> $log")>>8 and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
  unlink "$tmpdir/tmp$$.bam";
}
else{
  # Single input: deduplicate it directly.
  system("samtools flagstat $ARGV[0] > $outfile.before_dedup.flagstat.txt");
  system("samtools rmdup $ARGV[0] $tmpdir/tmp$$.rmdup.bam 2>> $log")>>8 and die "Samtools rmdup failed with exit status ", ($?>>8), ", please check the log\n";
}
die "Samtools rmdup did not generate the expected output file" if not -e "$tmpdir/tmp$$.rmdup.bam";
die "Samtools generated a blank output file" if -z "$tmpdir/tmp$$.rmdup.bam";
system "samtools index $tmpdir/tmp$$.rmdup.bam";
# Two-pass GATK indel realignment: find candidate intervals, then realign.
system "java -Xmx4G -jar $dirname/GenomeAnalysisTK.jar -I $tmpdir/tmp$$.rmdup.bam -R $ref -T RealignerTargetCreator -o $tmpdir/tmp$$.rmdup.gatk_realigner.intervals 2>> $log";
die "GATK did not generate the expected intervals file" if not -e "$tmpdir/tmp$$.rmdup.gatk_realigner.intervals";
system "java -Xmx4G -jar $dirname/GenomeAnalysisTK.jar -I $tmpdir/tmp$$.rmdup.bam -R $ref -T IndelRealigner -targetIntervals $tmpdir/tmp$$.rmdup.gatk_realigner.intervals -o $outfile 2>> $log";
die "GATK did not generate the expected realigned BAM output file" if not -e $outfile;
die "GATK generated a blank output file" if -z $outfile;
# Some tools expect name.bai, some expect name.bam.bai, so provide both.
my $outfile_bai = $outfile;
$outfile_bai =~ s/\.bam$/.bai/;
if(not -e $outfile_bai){
  if(not -e "$outfile.bai"){
    system "samtools index $outfile";
  }
  system "cp $outfile.bai $outfile_bai";
}
else{
  system "cp $outfile_bai $outfile.bai";
}
&cleanup;
+
# Remove all temporary working files.  Installed as the die/signal handler
# and also called at normal exit; any handler arguments (e.g. a die
# message) are echoed to STDERR first.  unlink on a missing file is a
# harmless no-op, so every candidate temp file is removed unconditionally.
sub cleanup{
  print STDERR @_;
  # Fixed: this path previously doubled the directory
  # ("$tmpdir/$tmpdir/tmp$$.headers.sam"), so the merged-header SAM
  # produced by merge_bam_headers was never actually deleted.
  unlink "$tmpdir/tmp$$.headers.sam";
  unlink "$tmpdir/tmp$$.bam";
  unlink "$tmpdir/tmp$$.rmdup.bam";
  unlink "$tmpdir/tmp$$.rmdup.bam.bai";
  unlink "$tmpdir/tmp$$.rmdup.gatk_realigner.intervals";
}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/prep_bams_parallel	Wed Mar 25 13:29:45 2015 -0600
@@ -0,0 +1,142 @@
#!/usr/bin/env perl

# Parallel wrapper around prep_bams: splits the input BAM(s) into one file
# per contig, runs prep_bams on each contig with a bounded number of
# concurrent processes, then merges the per-contig logs into one log file.

use File::Basename;
use strict;
use warnings;

@ARGV > 5 or die "Usage: $0 <num parallel processes> <output log> <reference.fa> <temporary directory> <deduped_realigned_bam_prefix> <input1.bam> [input2.bam] ...\n";

my $num_procs = shift @ARGV;
my $log = shift @ARGV;
my $ref = shift @ARGV;
my $tmpdir = shift @ARGV;
my $outfile_prefix = shift @ARGV;

my $progdir = dirname($0);

# Split the input data into separate files for each chromosome, so we can
# process them in parallel (contig name -> split BAM path).
my $chr_files_hashref = &generate_chr_split_bams($tmpdir, @ARGV);

# NOTE(review): prep_bams expects a tool-data directory as its *first*
# argument, before the log file; the command built below omits it, which
# would shift every argument by one -- verify against prep_bams' usage.
my (@cmds, @logs);
for my $chr (keys %$chr_files_hashref){
  push @cmds, "$progdir/prep_bams $log.$chr $ref $tmpdir $outfile_prefix.$chr.bam ".$chr_files_hashref->{$chr};
  push @logs, "$log.$chr";
}

# Run the chrs in parallel to the desired degree
&run_cmds($num_procs, @cmds);

# Remove the split input files, as they were only needed temporarily
for my $chr_file (values %$chr_files_hashref){
  unlink $chr_file or warn "Cannot remove temporary file $chr_file: $!\n";
}

# Concatenate the per-contig logs into the single requested log file.
open(LOG, ">$log")
  or die "Could not open output log $log for writing: $!\n";
for my $l (@logs){
  if(not open(L, $l)){
    # Fixed: previously only warned and then read/closed/unlinked the
    # never-opened handle anyway; skip missing logs entirely instead.
    warn "Log file $l was expected but not found, data processing may have been incomplete.\n";
    next;
  }
  print LOG <L>;
  close(L);
  unlink $l;
}
close(LOG);
+
# Merge the input BAMs into one file, then split them into per-contig BAM
# files for downstream parallelization.  Returns a hashref mapping each
# contig that has at least one mapped read to the path of its split BAM.
sub generate_chr_split_bams{
  my ($tempdir, @input_bams) = @_;

  # With several inputs, merge them into a temporary BAM that we own; with
  # a single input we read it directly (and must NOT delete it later).
  my $merged_here = @input_bams > 1;
  my $temp_input_bam = $input_bams[0];
  if($merged_here){
    $temp_input_bam = "$tempdir/prep_bams_parallel_$$.bam";
    system ("samtools merge $temp_input_bam ".join(" ", @input_bams)) >> 8
      and die "Could not successfully run samtools merge: exit status was $?\n";
  }
  # Region queries below require a BAM index; (re)build it if absent/empty.
  if(not -e "$temp_input_bam.bai" or -z "$temp_input_bam.bai"){
    system ("samtools index $temp_input_bam") >> 8
      and die "Could not successfully run samtools index: exit status was $?\n";
  }
  # Collect the contig names from the @SQ header lines.
  my @output_contigs;
  open(SAM, "samtools view -H $temp_input_bam |")
    or die "Cannot run samtools view on $temp_input_bam: $!\n";
  while(<SAM>){
    if(/^\@SQ\tSN:(\S+)/){
      push @output_contigs, $1;
    }
  }
  close(SAM);

  my %contig2outfile;
  for my $output_contig (@output_contigs){
    # Fixed: this open previously used "and die", which died on *success*.
    open(SAM, "samtools view -h $temp_input_bam '$output_contig'|")
      or die "Cannot execute samtools view successfully: $!\n";
    my @headers;
    # read through the headers, then if we get at least one mapped read, open a file to print it
    while(<SAM>){
      last if not /^\@/; # first real record
      push @headers, $_;
    }
    if(not defined $_){ # got to EOF before a real record
      close(SAM);      # fixed: handle was previously leaked on this path
      next;
    }

    # quotemeta guards against shell-significant characters in contig names
    my $outfile = "$tempdir/prep_bams_parallel_$$.".quotemeta($output_contig).".bam";
    open(OUT, "| samtools view -b -S - > $outfile")
      or die "Cannot open $outfile for writing: $!\n";
    print OUT @headers, $_;
    while(<SAM>){ # Copy over all lines.  Not using list context as this may be a huge dataset and would needlessly take up memory
      print OUT $_;
    }
    close(SAM);
    close(OUT);
    # Fixed: the mapping was previously never recorded.
    $contig2outfile{$output_contig} = $outfile;
  }
  # Fixed: only delete the merged temporary; when there was a single input
  # BAM, $temp_input_bam is the caller's own file and must be preserved.
  if($merged_here){
    unlink $temp_input_bam or warn "Could not remove $temp_input_bam: $!\n";
  }
  # Fixed: previously fell off the end and returned unlink's status, so the
  # caller's "hashref" was not a hash reference at all.
  return \%contig2outfile;
}
+
# Run a list of shell commands with at most $max_cmds of them executing
# concurrently.  Each command is echoed to STDERR, then executed via
# system() in a forked child.  Blocks until every command has finished.
#
# Fixed: the original unpacked its arguments into @cmd but then consumed
# the file-level global @cmds, so it only worked by accident when the
# caller happened to pass that same global.  This version uses only its
# own parameter list.
sub run_cmds {
  my ($max_cmds, @cmd_queue) = @_;

  my $num_children = 0;
  while(@cmd_queue){
    # At the concurrency limit: reap one finished child before forking more.
    if($num_children >= $max_cmds){
      wait;
      $num_children--;
    }
    my $cmd = shift @cmd_queue;
    my $pid = fork;
    if(not defined $pid){
      # weird fork error
      die "Can't fork: $!\n";
    }
    elsif($pid == 0){
      # child: run the command and exit without touching the queue
      print STDERR $cmd, "\n";
      system $cmd;
      exit;
    }
    $num_children++; # parent: track the outstanding child
  }
  # Reap any children still running before returning.
  wait while $num_children-- > 0;
}
+
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/dedup_realign_bam.loc	Wed Mar 25 13:29:45 2015 -0600
@@ -0,0 +1,1 @@
+dbs_dir	/export/achri_galaxy/dbs/