changeset 0:6b2e640c8c6d

initial commit
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 13:31:40 -0600
parents
children a0f4b5618eee
files DepthReports.xml depth_reports tool-data/depth_report.loc
diffstat 3 files changed, 462 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/DepthReports.xml	Wed Mar 25 13:31:40 2015 -0600
@@ -0,0 +1,43 @@
+<?xml version="1.0"?>
+
+<tool id="depth_reports_1" name="Compute depth of sequencing coverage">
+  <description>over targeted regions in a genome</description>
+  <version_string>depth_reports -v</version_string>
+  <command interpreter="perl">depth_reports $__tool_data_path__ -q $input_bam ${target_bed}.bed $out_summary_table $out_poor_coverage_bed $out_depth_graph $out_depth_table $out_mapped_percentile_table $dud_bed</command>
+  <inputs>
+    <param format="bam" name="input_bam" type="data" label="Mapped alignment file"/>
+    <param name="target_bed" type="select" display="radio" dynamic_options="kit_fileOptions()" label="The enrichment (capture) kit used for the sequencing experiment"/>
+    <param format="bed" name="dud_bed" type="data" label="Targeted regions to ignore (known duds)" optional="true" default="/dev/null"/>
+  </inputs>
+  <outputs>
+    <data name="out_summary_table" format="achri_depth_summary_table" label="Depth of coverage summary (mean, median, etc.)"/>
+    <data name="out_poor_coverage_bed" format="bed" label="Low coverage targeted regions"/>
+    <data name="out_depth_graph" format="png" label="Cumulative depth of coverage graph"/>
+    <data name="out_depth_table" format="tabular" label="Number of bases at each depth of coverage"/>
+    <data name="out_mapped_percentile_table" format="tabular" label="Mapped read depth percentiles"/>
+  </outputs>
+
+  <!-- the following code populates the  selection from the public capture kit BED datasets available in the local Galaxy installation -->
+  <code file="capture_kits.py"/>
+
+  <tests>
+    <test>
+     <param name="input_bam" value="depth_test.bam" ftype="bam"/>
+     <param name="target_bed" value="brcas.bed" ftype="bed"/>
+     <output name="out_summary_table">
+       <assert_contents>
+         <has_text text="targeted nucleotide bases: 155091"/>
+         <has_text text="bases mapped to targeted regions: 11473773"/>
+         <has_text text="bases with less than 20-fold coverage: 19046"/>
+       </assert_contents>
+     </output>
+     <output name="out_depth_graph" ftype="png"/>
+    </test>
+  </tests>
+
+  <help>
+This tool reports several statistics describing the depth of coverage of a next-gen sequencing run over its targeted reference genome. 
+These results can be used to assess the quality of the sequencing for SNP calls, etc.
+  </help>
+
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/depth_reports	Wed Mar 25 13:31:40 2015 -0600
@@ -0,0 +1,418 @@
+#!/usr/bin/env perl
+
+use Bio::DB::Sam;
+use GD::Graph::bars;
+use File::Basename;
+use strict;
+use warnings;
+use vars qw($good_homo_coverage);
+
+$good_homo_coverage = 3;
+
+if(@ARGV == 1 and $ARGV[0] eq "-v"){
+  print "Version 1.0\n";
+  exit;
+}
+
+# configuration file parsing/loading
+my %config;
+my $dirname = dirname(__FILE__);
+my $tool_data = shift @ARGV;
+if(not -e "$tool_data/depth_report.loc"){
+  system("cp $dirname/tool-data/depth_report.loc $tool_data/depth_report.loc");
+}
+
+open CONFIG, '<', "$tool_data/depth_report.loc";
+while(<CONFIG>){
+	(my $key, my $value) = split(/\s+/, $_ );
+	$config{$key} = $value;
+}
+close CONFIG;
+
+my $quiet = 0;
+if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
+  $quiet = 1;
+  shift @ARGV;
+}
+
+my $good_coverage_basic = 20;
+@ARGV == 7 or @ARGV == 8 or @ARGV == 9 
+    or die "Usage: $0 [-q(uiet)] <mapped reads.bam> <targeted regions.bed> <out_summary_table> <out_poor_coverage_bed> <out_depth_graph> <out_depth_table> <out_mapped_percentile_table> [input_dud_regions.bed] [chr#]\n";
+
+if(not -e $ARGV[0]){
+    die "The specified BAM read alignment file ($ARGV[0]) does not exist\n";
+} 
+if(not -r $ARGV[0]){
+    die "The specified BAM read alignment file ($ARGV[0]) is not readable\n";
+} 
+
+if(@ARGV == 7 or $ARGV[7] eq "None"){
+  $ARGV[7] = "/dev/null";
+}
+if(not -r $ARGV[7]){
+    die "The specified dud regions BED file ($ARGV[7]) is not readable\n";
+} 
+
+my $target_contig;
+if(@ARGV == 9){
+  $target_contig = $ARGV[8]; # for debugging or whole genomes
+}
+
+my $bed_file = $config{"capture_kits_dir"} . $ARGV[1];
+my $targeted_total = 0;
+my $targeted_regions = 0;
+my $targeted_coverage = 0;
+my %regions; # contigname => %region{start+-end}
+open(BED, $bed_file )
+    or die "Cannot open target regions BED file $bed_file for reading: $!\n"; 
+
+open(STATS, ">$ARGV[2]")
+    or die "Cannot open summary table $ARGV[2] for writing: $!\n";
+open(POOR, ">$ARGV[3]")
+    or die "Cannot open poor coverage BED file $ARGV[3] for writing: $!\n";
+open(HISTOGRAM, ">$ARGV[4]")
+    or die "Cannot open cumulative depth graph $ARGV[4] for writing: $!\n";
+open(COVER, ">$ARGV[5]")
+    or die "Cannot open read depth table $ARGV[5] for writing: $!\n";
+open(PERCENTILE, ">$ARGV[6]")
+    or die "Cannot open percentile table $ARGV[6] for writing: $!\n";
+print PERCENTILE "# Percentile of mapped bases\tNum reference targeted positions covered\n";
+
+print STATS "# Summary statistics for BAM file $ARGV[0] using targeted regions from $bed_file\n" unless $quiet;
+
+print STDERR "Reading in target regions from BED file\n" unless $quiet;
+while(<BED>){
+    next if /^\s*(?:track\s|#)/;    
+    tr/\r//d;
+    chomp;
+    my @fields = split /\t/, $_;
+    next if defined $target_contig and $fields[0] ne $target_contig;
+    if(not exists $regions{$fields[0]}){
+	$regions{$fields[0]} = {};
+    }
+    $regions{$fields[0]}->{$fields[1].":".$fields[2]} = 0;
+    $targeted_regions++;
+    $targeted_total += $fields[2]-$fields[1]+1;
+}
+close(BED);
+
+
+print STDERR "Reading in dud regions from BED file\n" unless $quiet;
+my %duds; # contigname => %region{start+-end}
+my $duds_total = 0;
+open(DUDS, $ARGV[7])
+    or die "Cannot open dud regions BED file $ARGV[7] for reading: $!\n";
+while(<DUDS>){
+    next if /^\s*#/;    
+    chomp;
+    my @fields = split /\t/, $_;
+    next if defined $target_contig and $fields[0] ne $target_contig;
+    if(not exists $regions{$fields[0]}->{$fields[1].$fields[5].$fields[2]}){
+	die "The duds file includes a region ($fields[0]:$fields[1]$fields[5]$fields[2]) ",
+	    "not listed in the targeted regions BED file, ",
+	    "please make sure the two are synched.\n";
+    }
+    if(not exists $duds{$fields[0]}){
+	$duds{$fields[0]} = {};
+    }
+    $duds{$fields[0]}->{$fields[1].$fields[5].$fields[2]} = 1;
+    $duds_total += $fields[2]-$fields[1]+1;
+}
+close(DUDS);
+
+
+my $tick_count = int($targeted_total/100); # for progress bar
+
+# Read the alignment info
+# Load the BAM file
+my $sam = Bio::DB::Sam->new(-bam => $ARGV[0], -autoindex => 1);
+my $bam_header = $sam->bam->header;
+my $contig_names = $bam_header->target_name;
+
+# Make sure the BED file and reference sequence refer to the same contigs
+for my $contig_name (@$contig_names){
+    next if defined $target_contig and $contig_name ne $target_contig; 
+    if(not exists $regions{$contig_name}){
+	print STATS "#Warning: The BED file makes no reference to the BAM's $contig_name reference sequence, ",
+	            "no reporting will be done for this contig's mappings\n" unless $quiet;
+    }
+}
+
+print STATS "Total number of targeted reference contigs\t", scalar(keys %regions), "\n";
+print STATS "Total number of targeted regions\t$targeted_regions\n";
+print STATS "Total number of targeted nucleotide bases\t$targeted_total\n";
+print STATS "Total number of targeted bases considered pre-emptively as duds\t$duds_total\n";
+
+# Heuristic
+my $isMale = &isMale($sam);
+#print STDERR "Male: $isMale\n" unless $quiet;
+
+# Check each BED region for coverage stats
+print STDERR "Reading coverage data from BAM file (each dot represents $tick_count bases)\n" unless $quiet;
+print STDERR "0%       10%       20%       30%       40%       50%       60%       70%       80%       90%       100%\n" unless $quiet;
+my $base_count = 0;
+my @coverages;
+for my $contig_name (sort keys %regions){
+    next if defined $target_contig and $contig_name ne $target_contig; 
+    my $c_name = $contig_name;
+    if(not grep {$_ eq $c_name} @$contig_names){
+        $c_name =~ s/^chr//;
+        if(not grep {$_ eq $c_name} @$contig_names){
+            print STATS "#Warning: The BAM file makes no reference to the BED's $contig_name reference sequence, ",
+                    "no reporting will be done for this contig's mappings\n" unless $quiet;
+            next;
+        }
+    }
+    my $good_coverage = $good_coverage_basic;
+    $good_coverage = $good_homo_coverage if $contig_name =~ /X$/ and $isMale; # males are hemizygous for X, so adjust the poor coverage threshold accordingly
+    for my $range_key (keys %{$regions{$contig_name}}){
+	#print STDERR "Processing range $range_key\n";
+	my ($range_min, $range_max) = $range_key =~ /^(\d+):(\d+)$/;
+	my @cov = $sam->get_features_by_location(-seq_id => $c_name,
+						 -type   => "coverage",
+						 -start  => $range_min,
+						 -end    => $range_max);
+	if(not @cov){
+            if(not $quiet){
+	        die "No BAM data for $c_name (", $range_min,",",
+	            $range_max, ").  No coverage data returned\n";
+            }
+            next;
+	}
+	@cov = $cov[0]->coverage;
+
+	if(@cov == 0){
+	    @cov = (0) x ($range_max-$range_min+1);
+	}
+
+	# Gather min, Q1, median, Q3, max
+	$regions{$contig_name}->{$range_key} = [];
+	my $ignore = exists $duds{$contig_name}->{$range_key};
+        my $low_coverage_start = -1;
+        my @low_covs;
+	for(my $i = 0; $i <= $#cov; $i++){
+	    $targeted_coverage += $cov[$i];
+	    if(not $ignore){
+		$coverages[$base_count++] = $cov[$i];
+                if($cov[$i] >= $good_coverage){
+                  if(@low_covs){
+                    &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start);
+                    @low_covs = (); # clear!
+                  }
+                }
+                else{
+                  if(not @low_covs){
+                    # starting a low coverage section
+                    $low_coverage_start = $i;
+                  }
+                  # else continuing a low coverage section
+                  push @low_covs, $cov[$i];
+                }
+	    }
+	    print STDERR "." if not $quiet and $base_count%$tick_count == 0;
+	}
+        if(@low_covs){
+          &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start);
+        }
+    }
+}
+print STDERR "$base_count/",($targeted_total-$duds_total),"\n" unless $quiet;
+
+print STATS "Total number of bases mapped to targeted regions\t$targeted_coverage\n";
+
+my $percentile_size = int($targeted_coverage/100);
+
+# Generate a cumulative distribution of coverage
+print STDERR "Sorting mapped read depth statistics\n" unless $quiet;
+@coverages = sort {$a <=> $b} @coverages;
+
+print STATS "Minimum coverage in targeted regions\t$coverages[0]\n";
+print STATS "Median coverage in targeted regions\t", $coverages[int($#coverages/2)], "\n";
+print STATS "Mean coverage in targeted regions\t", ($targeted_coverage/$targeted_total), "\n";
+print STATS "Maximum coverage in targeted regions\t", $coverages[$#coverages], "\n";
+
+# Estimate the false negative rates
+print STDERR "Estimating SNP discovery sensitivity...\n" unless $quiet;
+print COVER "# Mapped read depth\tNumber of bases\n";
+my $last_coverage = 0;
+my $coverage_count = 0;
+my $homo_misses = 0;
+my $het_misses = 0;
+my @depth_labels;
+my @cumu_pct;
+$#coverages = int($#coverages*0.98); # Look at percentiles 0-98, otherwise chart is too big due to long tail
+for my $c (@coverages){
+    if($c != $last_coverage){
+	if($last_coverage != 0){
+	    # fill in any missing values for the x axis of the cumulative distribution graph
+	    for(my $i = $#depth_labels+1; $i < $last_coverage; $i++){
+		push @depth_labels, ($i%10?"":$i);
+		push @cumu_pct, $cumu_pct[$i-1];
+	    }
+	}
+	push @depth_labels, ($last_coverage%10?"":$last_coverage); # label every 10 bars 
+	if($last_coverage == 0){
+	    push @cumu_pct, $coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage
+	}
+	else{
+	    push @cumu_pct, $cumu_pct[$#cumu_pct]+$coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage
+	}
+	print COVER "$last_coverage\t$coverage_count\n";
+	if($c <= $good_coverage_basic){
+	    $homo_misses += false_neg_homo_count($last_coverage, $coverage_count);
+	    $het_misses += false_neg_het_count($last_coverage, $coverage_count);
+	}
+	$coverage_count = 0;
+	$last_coverage = $c;
+    }
+    $coverage_count++;
+}
+$homo_misses = int($homo_misses+0.5);
+$het_misses = int($het_misses+0.5);
+print STATS "Estimated percentage of homozygous SNPs missed\t", $homo_misses/($targeted_total*0.000358*0.35)*100, "\n";
+print STATS "Estimated number of homozygous SNPs missed\t$homo_misses\n";
+print STATS "Estimated percentage of heterozygous SNPs missed\t", $het_misses/($targeted_total*0.000358*0.65)*100, "\n";
+print STATS "Estimated number of heterozygous SNPs missed\t$het_misses\n";
+
+print STDERR "Generating coverage percentiles and false negative/false positive estimates\n" unless $quiet;
+
+my $percentile_reporting = 1;
+my $lt_fold_10 = 0;
+my $lt_fold_20 = 0;
+my $poor_cutoff_percentile = 5;
+my $poor_cutoff_coverage = 0;
+my $running_cov_tot;
+for (my $i = 0; $i <= $#coverages; $i++){
+    $running_cov_tot += $coverages[$i];
+    if($coverages[$i] < $good_coverage_basic){
+	$lt_fold_20++;
+	if($coverages[$i] < $good_homo_coverage){
+	    $lt_fold_10++;
+	}
+    }
+    if($running_cov_tot >= $percentile_reporting*$percentile_size){
+	print PERCENTILE "$percentile_reporting\t$i\n";
+	$percentile_reporting++;
+    }
+}
+close(PERCENTILE);
+
+print STATS "Total bases with less than $good_homo_coverage-fold coverage\t$lt_fold_10\n";
+print STATS "Total bases with less than $good_coverage_basic-fold coverage\t$lt_fold_20\n";
+close(STATS);
+
+print STDERR "Generating read depth cumulative distribution graph\n" unless $quiet;
+# Using three colors to show trouble coverage levels
+my @low_coverage = @cumu_pct[0..$good_homo_coverage];
+my @medium_coverage = (("0") x ($good_homo_coverage-1), @cumu_pct[$good_homo_coverage..($good_coverage_basic-1)]);
+for(my $i = 0; $i < $good_coverage_basic; $i++){
+    $cumu_pct[$i] = 0;
+}
+my $graph = new GD::Graph::bars(($#cumu_pct*2)+100, 600);
+$graph->set(x_label => "Mapped read depth",
+	    y_label => "Percentage of target reference",
+	    title => "Cumulative distribution of mapped read depth (0 to 98th percentiles)",
+	    cumulate => 1,
+	    transparent => 0,
+	    bar_spacing => 0,
+	    bar_width => 2,
+	    fgclr => "black",
+	    dclrs => ["red", "yellow", "green"],
+	    x_ticks => 0,
+	    long_ticks => 1,
+	    tick_length => 0,
+	    y_tick_number => 10,
+	    y_number_format =>'%d%%',
+	    box_axis => 0)
+    or die "While setting up chart", $graph->error;
+my $gd = $graph->plot([\@depth_labels, \@low_coverage, \@medium_coverage, \@cumu_pct]) or die $graph->error;
+binmode HISTOGRAM;
+print HISTOGRAM $gd->png;
+close(HISTOGRAM);
+
+# args: read depth, count of bases with this read depth
+sub false_neg_homo_count{
+    if($_[0] < $good_homo_coverage){          # 0.000358 based on doi:10.1371/journal.pgen.1000160
+	return 0.000358*0.35*$_[1]; # 35% of SNPs are homo based on own observations
+    }
+    elsif($_[0] > $good_coverage_basic){
+	return 0;
+    }
+    return (-0.118*log($_[0])+0.27)*0.000358*$_[1];
+}
+
+sub false_neg_het_count{
+    if($_[0] < $good_homo_coverage){
+	return 0.000358*0.65*$_[1]; # 65% of SNPs are het based on own observations
+    }
+    elsif($_[0] >= $good_coverage_basic){
+	return 0;
+    } 
+    return (-0.0004*($_[0]**2)-0.00048*$_[0]+0.2483)*0.000358*$_[1];
+}
+
+sub print_low{
+   my ($contig_name, $low_covs_ref, $base_pos,  $low_offset) = @_;
+   # exiting a low coverage section, print it, split into poor het call regions, and poor homo call regions
+   my $start = $base_pos+$low_offset;
+   my $last_i = 0;
+   for(my $i = 1; $i <= $#{$low_covs_ref}; $i++){
+     if($low_covs_ref->[$i] >= $good_homo_coverage){
+       if($low_covs_ref->[$last_i] < $good_homo_coverage){
+         my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)];
+         print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n";
+         $last_i = $i;
+       }
+       if($i == $#{$low_covs_ref}){
+         my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i];
+         print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n";
+       }
+       # else continuation of a good homo coverage region
+     }
+     else{
+       if($low_covs_ref->[$last_i] >= $good_homo_coverage or $i == $#{$low_covs_ref}){
+         my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)];
+         print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n";
+         $last_i = $i;
+       } 
+       if($i == $#{$low_covs_ref}){
+         my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i];
+         print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n";
+       }
+       # else continuation of a poor homo region
+     }
+   }
+}
+
+sub isMale{
+   my ($sam) = @_;
+   # Local observation:
+   # A robust measure across whole genome and exome kits (generally not capturing any Y genes) 
+   # to detect female is an exceptionally low ratio for the number of reads mapped to chrY:9200000-9300000
+   # (which contains several testes-specific genes), and chrY:13800000-13900000 (which is highly repetitive)
+   # in hg19 (UCSC).
+   # This holds regardless of the read depth for the experiment, so should be robust.  Females have 
+   # an average ratio of 0.000583843, with a std dev of 0.00069961.  We'll set the threshold to 0.004334299 (mu + 3*sigma)
+   # to be 99% sure it's not a female sample (I know normal dist isn't the correct model here, but it's close enough).  -Paul G. 2013-11-01
+   my $chrYName = "chrY";
+   if(not grep {$_ eq "chrY"} $sam->seq_ids){
+     if(grep {$_ eq "Y"} $sam->seq_ids){
+       $chrYName = "Y";
+     }
+     else{ # not human-ish?
+       return 0;
+     }
+   }
+   my $low_count_region = 0;
+   #print STDERR "Checking sex by chrY data...\n" unless $quiet;
+   $sam->fetch("$chrYName:9200000-9300000", sub {$low_count_region++});   
+   my $high_count_region = 0;
+   $sam->fetch("$chrYName:13800000-13900000", sub {$high_count_region++});   
+   if($high_count_region){
+     #print STDERR "$low_count_region/$high_count_region = ", $low_count_region/$high_count_region, "\n" unless $quiet;
+   }
+   else{
+     #print STDERR "No relevant chrY data\n" unless $quiet;
+   }
+   return $high_count_region ? ($low_count_region/$high_count_region > 0.004334299 ? 1 : 0) : 0; 
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tool-data/depth_report.loc	Wed Mar 25 13:31:40 2015 -0600
@@ -0,0 +1,1 @@
+capture_kits_dir	/export/achri_galaxy/dbs/CaptureKits/