Mercurial > repos > yusuf > depth_report
changeset 0:6b2e640c8c6d
initial commit
author | Yusuf Ali <ali@yusuf.email> |
---|---|
date | Wed, 25 Mar 2015 13:31:40 -0600 |
parents | |
children | a0f4b5618eee |
files | DepthReports.xml depth_reports tool-data/depth_report.loc |
diffstat | 3 files changed, 462 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/DepthReports.xml Wed Mar 25 13:31:40 2015 -0600 @@ -0,0 +1,43 @@ +<?xml version="1.0"?> + +<tool id="depth_reports_1" name="Compute depth of sequencing coverage"> + <description>over targeted regions in a genome</description> + <version_string>depth_reports -v</version_string> + <command interpreter="perl">depth_reports $__tool_data_path__ -q $input_bam ${target_bed}.bed $out_summary_table $out_poor_coverage_bed $out_depth_graph $out_depth_table $out_mapped_percentile_table $dud_bed</command> + <inputs> + <param format="bam" name="input_bam" type="data" label="Mapped alignment file"/> + <param name="target_bed" type="select" display="radio" dynamic_options="kit_fileOptions()" label="The enrichment (capture) kit used for the sequencing experiment"/> + <param format="bed" name="dud_bed" type="data" label="Targeted regions to ignore (known duds)" optional="true" default="/dev/null"/> + </inputs> + <outputs> + <data name="out_summary_table" format="achri_depth_summary_table" label="Depth of coverage summary (mean, median, etc.)"/> + <data name="out_poor_coverage_bed" format="bed" label="Low coverage targeted regions"/> + <data name="out_depth_graph" format="png" label="Cumulative depth of coverage graph"/> + <data name="out_depth_table" format="tabular" label="Number of bases at each depth of coverage"/> + <data name="out_mapped_percentile_table" format="tabular" label="Mapped read depth percentiles"/> + </outputs> + + <!-- the following code populates the selection from the public capture kit BED datasets available in the local Galaxy installation --> + <code file="capture_kits.py"/> + + <tests> + <test> + <param name="input_bam" value="depth_test.bam" ftype="bam"/> + <param name="target_bed" value="brcas.bed" ftype="bed"/> + <output name="out_summary_table"> + <assert_contents> + <has_text text="targeted nucleotide bases: 155091"/> + <has_text text="bases mapped to targeted regions: 11473773"/> + <has_text text="bases with less than 20-fold coverage: 19046"/> + </assert_contents> + </output> + <output name="out_depth_graph" ftype="png"/> + </test> + </tests> + + <help> +This tool reports several statistics describing the depth of coverage of a next-gen sequencing run over its targeted reference genome. +These results can be used to assess the quality of the sequencing for SNP calls, etc. + </help> + +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/depth_reports Wed Mar 25 13:31:40 2015 -0600 @@ -0,0 +1,418 @@ +#!/usr/bin/env perl + +use Bio::DB::Sam; +use GD::Graph::bars; +use File::Basename; +use strict; +use warnings; +use vars qw($good_homo_coverage); + +$good_homo_coverage = 3; + +if(@ARGV == 1 and $ARGV[0] eq "-v"){ + print "Version 1.0\n"; + exit; +} + +# configuration file parsing/loading +my %config; +my $dirname = dirname(__FILE__); +my $tool_data = shift @ARGV; +if(not -e "$tool_data/depth_report.loc"){ + system("cp $dirname/tool-data/depth_report.loc $tool_data/depth_report.loc"); +} + +open CONFIG, '<', "$tool_data/depth_report.loc"; +while(<CONFIG>){ + (my $key, my $value) = split(/\s+/, $_ ); + $config{$key} = $value; +} +close CONFIG; + +my $quiet = 0; +if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){ + $quiet = 1; + shift @ARGV; +} + +my $good_coverage_basic = 20; +@ARGV == 7 or @ARGV == 8 or @ARGV == 9 + or die "Usage: $0 [-q(uiet)] <mapped reads.bam> <targeted regions.bed> <out_summary_table> <out_poor_coverage_bed> <out_depth_graph> <out_depth_table> <out_mapped_percentile_table> [input_dud_regions.bed] [chr#]\n"; + +if(not -e $ARGV[0]){ + die "The specified BAM read alignment file ($ARGV[0]) does not exist\n"; +} +if(not -r $ARGV[0]){ + die "The specified BAM read alignment file ($ARGV[0]) is not readable\n"; +} + +if(@ARGV == 7 or $ARGV[7] eq "None"){ + $ARGV[7] = "/dev/null"; +} +if(not -r $ARGV[7]){ + die "The specified dud regions BED file ($ARGV[7]) is not readable\n"; +} + +my $target_contig; +if(@ARGV == 9){ + $target_contig = $ARGV[8]; # for debugging or whole genomes +} + +my $bed_file = $config{"capture_kits_dir"} . $ARGV[1]; +my $targeted_total = 0; +my $targeted_regions = 0; +my $targeted_coverage = 0; +my %regions; # contigname => %region{start+-end} +open(BED, $bed_file ) + or die "Cannot open target regions BED file $bed_file for reading: $!\n"; + +open(STATS, ">$ARGV[2]") + or die "Cannot open summary table $ARGV[2] for writing: $!\n"; +open(POOR, ">$ARGV[3]") + or die "Cannot open poor coverage BED file $ARGV[3] for writing: $!\n"; +open(HISTOGRAM, ">$ARGV[4]") + or die "Cannot open cumulative depth graph $ARGV[4] for writing: $!\n"; +open(COVER, ">$ARGV[5]") + or die "Cannot open read depth table $ARGV[5] for writing: $!\n"; +open(PERCENTILE, ">$ARGV[6]") + or die "Cannot open percentile table $ARGV[6] for writing: $!\n"; +print PERCENTILE "# Percentile of mapped bases\tNum reference targeted positions covered\n"; + +print STATS "# Summary statistics for BAM file $ARGV[0] using targeted regions from $bed_file\n" unless $quiet; + +print STDERR "Reading in target regions from BED file\n" unless $quiet; +while(<BED>){ + next if /^\s*(?:track\s|#)/; + tr/\r//d; + chomp; + my @fields = split /\t/, $_; + next if defined $target_contig and $fields[0] ne $target_contig; + if(not exists $regions{$fields[0]}){ + $regions{$fields[0]} = {}; + } + $regions{$fields[0]}->{$fields[1].":".$fields[2]} = 0; + $targeted_regions++; + $targeted_total += $fields[2]-$fields[1]+1; +} +close(BED); + + +print STDERR "Reading in dud regions from BED file\n" unless $quiet; +my %duds; # contigname => %region{start+-end} +my $duds_total = 0; +open(DUDS, $ARGV[7]) + or die "Cannot open dud regions BED file $ARGV[7] for reading: $!\n"; +while(<DUDS>){ + next if /^\s*#/; + chomp; + my @fields = split /\t/, $_; + next if defined $target_contig and $fields[0] ne $target_contig; + if(not exists $regions{$fields[0]}->{$fields[1].$fields[5].$fields[2]}){ + die "The duds file includes a region ($fields[0]:$fields[1]$fields[5]$fields[2]) ", + "not listed in the targeted regions BED file, ", + "please make sure the two are synched.\n"; + } + if(not exists $duds{$fields[0]}){ + $duds{$fields[0]} = {}; + } + $duds{$fields[0]}->{$fields[1].$fields[5].$fields[2]} = 1; + $duds_total += $fields[2]-$fields[1]+1; +} +close(DUDS); + + +my $tick_count = int($targeted_total/100); # for progress bar + +# Read the alignment info +# Load the BAM file +my $sam = Bio::DB::Sam->new(-bam => $ARGV[0], -autoindex => 1); +my $bam_header = $sam->bam->header; +my $contig_names = $bam_header->target_name; + +# Make sure the BED file and reference sequence refer to the same contigs +for my $contig_name (@$contig_names){ + next if defined $target_contig and $contig_name ne $target_contig; + if(not exists $regions{$contig_name}){ + print STATS "#Warning: The BED file makes no reference to the BAM's $contig_name reference sequence, ", + "no reporting will be done for this contig's mappings\n" unless $quiet; + } +} + +print STATS "Total number of targeted reference contigs\t", scalar(keys %regions), "\n"; +print STATS "Total number of targeted regions\t$targeted_regions\n"; +print STATS "Total number of targeted nucleotide bases\t$targeted_total\n"; +print STATS "Total number of targeted bases considered pre-emptively as duds\t$duds_total\n"; + +# Heuristic +my $isMale = &isMale($sam); +#print STDERR "Male: $isMale\n" unless $quiet; + +# Check each BED region for coverage stats +print STDERR "Reading coverage data from BAM file (each dot represents $tick_count bases)\n" unless $quiet; +print STDERR "0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet; +my $base_count = 0; +my @coverages; +for my $contig_name (sort keys %regions){ + next if defined $target_contig and $contig_name ne $target_contig; + my $c_name = $contig_name; + if(not grep {$_ eq $c_name} @$contig_names){ + $c_name =~ s/^chr//; + if(not grep {$_ eq $c_name} @$contig_names){ + print STATS "#Warning: The BAM file makes no reference to the BED's $contig_name reference sequence, ", + "no reporting will be done for this contig's mappings\n" unless $quiet; + next; + } + } + my $good_coverage = $good_coverage_basic; + $good_coverage = $good_homo_coverage if $contig_name =~ /X$/ and $isMale; # males are hemizygous for X, so adjust the poor coverage threshold accordingly + for my $range_key (keys %{$regions{$contig_name}}){ + #print STDERR "Processing range $range_key\n"; + my ($range_min, $range_max) = $range_key =~ /^(\d+):(\d+)$/; + my @cov = $sam->get_features_by_location(-seq_id => $c_name, + -type => "coverage", + -start => $range_min, + -end => $range_max); + if(not @cov){ + if(not $quiet){ + die "No BAM data for $c_name (", $range_min,",", + $range_max, "). No coverage data returned\n"; + } + next; + } + @cov = $cov[0]->coverage; + + if(@cov == 0){ + @cov = (0) x ($range_max-$range_min+1); + } + + # Gather min, Q1, median, Q3, max + $regions{$contig_name}->{$range_key} = []; + my $ignore = exists $duds{$contig_name}->{$range_key}; + my $low_coverage_start = -1; + my @low_covs; + for(my $i = 0; $i <= $#cov; $i++){ + $targeted_coverage += $cov[$i]; + if(not $ignore){ + $coverages[$base_count++] = $cov[$i]; + if($cov[$i] >= $good_coverage){ + if(@low_covs){ + &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start); + @low_covs = (); # clear! + } + } + else{ + if(not @low_covs){ + # starting a low coverage section + $low_coverage_start = $i; + } + # else continuing a low coverage section + push @low_covs, $cov[$i]; + } + } + print STDERR "." if not $quiet and $base_count%$tick_count == 0; + } + if(@low_covs){ + &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start); + } + } +} +print STDERR "$base_count/",($targeted_total-$duds_total),"\n" unless $quiet; + +print STATS "Total number of bases mapped to targeted regions\t$targeted_coverage\n"; + +my $percentile_size = int($targeted_coverage/100); + +# Generate a cumulative distribution of coverage +print STDERR "Sorting mapped read depth statistics\n" unless $quiet; +@coverages = sort {$a <=> $b} @coverages; + +print STATS "Minimum coverage in targeted regions\t$coverages[0]\n"; +print STATS "Median coverage in targeted regions\t", $coverages[int($#coverages/2)], "\n"; +print STATS "Mean coverage in targeted regions\t", ($targeted_coverage/$targeted_total), "\n"; +print STATS "Maximum coverage in targeted regions\t", $coverages[$#coverages], "\n"; + +# Estimate the false negative rates +print STDERR "Estimating SNP discovery sensitivity...\n" unless $quiet; +print COVER "# Mapped read depth\tNumber of bases\n"; +my $last_coverage = 0; +my $coverage_count = 0; +my $homo_misses = 0; +my $het_misses = 0; +my @depth_labels; +my @cumu_pct; +$#coverages = int($#coverages*0.98); # Look at percentiles 0-98, otherwise chart is too big due to long tail +for my $c (@coverages){ + if($c != $last_coverage){ + if($last_coverage != 0){ + # fill in any missing values for the x axis of the cumulative distribution graph + for(my $i = $#depth_labels+1; $i < $last_coverage; $i++){ + push @depth_labels, ($i%10?"":$i); + push @cumu_pct, $cumu_pct[$i-1]; + } + } + push @depth_labels, ($last_coverage%10?"":$last_coverage); # label every 10 bars + if($last_coverage == 0){ + push @cumu_pct, $coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage + } + else{ + push @cumu_pct, $cumu_pct[$#cumu_pct]+$coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage + } + print COVER "$last_coverage\t$coverage_count\n"; + if($c <= $good_coverage_basic){ + $homo_misses += false_neg_homo_count($last_coverage, $coverage_count); + $het_misses += false_neg_het_count($last_coverage, $coverage_count); + } + $coverage_count = 0; + $last_coverage = $c; + } + $coverage_count++; +} +$homo_misses = int($homo_misses+0.5); +$het_misses = int($het_misses+0.5); +print STATS "Estimated percentage of homozygous SNPs missed\t", $homo_misses/($targeted_total*0.000358*0.35)*100, "\n"; +print STATS "Estimated number of homozygous SNPs missed\t$homo_misses\n"; +print STATS "Estimated percentage of heterozygous SNPs missed\t", $het_misses/($targeted_total*0.000358*0.65)*100, "\n"; +print STATS "Estimated number of heterozygous SNPs missed\t$het_misses\n"; + +print STDERR "Generating coverage percentiles and false negative/false positive estimates\n" unless $quiet; + +my $percentile_reporting = 1; +my $lt_fold_10 = 0; +my $lt_fold_20 = 0; +my $poor_cutoff_percentile = 5; +my $poor_cutoff_coverage = 0; +my $running_cov_tot; +for (my $i = 0; $i <= $#coverages; $i++){ + $running_cov_tot += $coverages[$i]; + if($coverages[$i] < $good_coverage_basic){ + $lt_fold_20++; + if($coverages[$i] < $good_homo_coverage){ + $lt_fold_10++; + } + } + if($running_cov_tot >= $percentile_reporting*$percentile_size){ + print PERCENTILE "$percentile_reporting\t$i\n"; + $percentile_reporting++; + } +} +close(PERCENTILE); + +print STATS "Total bases with less than $good_homo_coverage-fold coverage\t$lt_fold_10\n"; +print STATS "Total bases with less than $good_coverage_basic-fold coverage\t$lt_fold_20\n"; +close(STATS); + +print STDERR "Generating read depth cumulative distribution graph\n" unless $quiet; +# Using three colors to show trouble coverage levels +my @low_coverage = @cumu_pct[0..$good_homo_coverage]; +my @medium_coverage = (("0") x ($good_homo_coverage-1), @cumu_pct[$good_homo_coverage..($good_coverage_basic-1)]); +for(my $i = 0; $i < $good_coverage_basic; $i++){ + $cumu_pct[$i] = 0; +} +my $graph = new GD::Graph::bars(($#cumu_pct*2)+100, 600); +$graph->set(x_label => "Mapped read depth", + y_label => "Percentage of target reference", + title => "Cumulative distribution of mapped read depth (0 to 98th percentiles)", + cumulate => 1, + transparent => 0, + bar_spacing => 0, + bar_width => 2, + fgclr => "black", + dclrs => ["red", "yellow", "green"], + x_ticks => 0, + long_ticks => 1, + tick_length => 0, + y_tick_number => 10, + y_number_format =>'%d%%', + box_axis => 0) + or die "While setting up chart", $graph->error; +my $gd = $graph->plot([\@depth_labels, \@low_coverage, \@medium_coverage, \@cumu_pct]) or die $graph->error; +binmode HISTOGRAM; +print HISTOGRAM $gd->png; +close(HISTOGRAM); + +# args: read depth, count of bases with this read depth +sub false_neg_homo_count{ + if($_[0] < $good_homo_coverage){ # 0.000358 based on doi:10.1371/journal.pgen.1000160 + return 0.000358*0.35*$_[1]; # 35% of SNPs are homo based on own observations + } + elsif($_[0] > $good_coverage_basic){ + return 0; + } + return (-0.118*log($_[0])+0.27)*0.000358*$_[1]; +} + +sub false_neg_het_count{ + if($_[0] < $good_homo_coverage){ + return 0.000358*0.65*$_[1]; # 65% of SNPs are het based on own observations + } + elsif($_[0] >= $good_coverage_basic){ + return 0; + } + return (-0.0004*($_[0]**2)-0.00048*$_[0]+0.2483)*0.000358*$_[1]; +} + +sub print_low{ + my ($contig_name, $low_covs_ref, $base_pos, $low_offset) = @_; + # exiting a low coverage section, print it, split into poor het call regions, and poor homo call regions + my $start = $base_pos+$low_offset; + my $last_i = 0; + for(my $i = 1; $i <= $#{$low_covs_ref}; $i++){ + if($low_covs_ref->[$i] >= $good_homo_coverage){ + if($low_covs_ref->[$last_i] < $good_homo_coverage){ + my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)]; + print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n"; + $last_i = $i; + } + if($i == $#{$low_covs_ref}){ + my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i]; + print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n"; + } + # else continuation of a good homo coverage region + } + else{ + if($low_covs_ref->[$last_i] >= $good_homo_coverage or $i == $#{$low_covs_ref}){ + my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)]; + print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n"; + $last_i = $i; + } + if($i == $#{$low_covs_ref}){ + my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i]; + print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n"; + } + # else continuation of a poor homo region + } + } +} + +sub isMale{ + my ($sam) = @_; + # Local observation: + # A robust measure across whole genome and exome kits (generally not capturing any Y genes) + # to detect female is an exceptionally low ratio for the number of reads mapped to chrY:9200000-9300000 + # (which contains several testes-specific genes), and chrY:13800000-13900000 (which is highly repetitive) + # in hg19 (UCSC). + # This holds regardless of the read depth for the experiment, so should be robust. Females have + # an average ratio of 0.000583843, with a std dev of 0.00069961. We'll set the threshold to 0.004334299 (mu + 3*sigma) + # to be 99% sure it's not a female sample (I know normal dist isn't the correct model here, but it's close enough). -Paul G. 2013-11-01 + my $chrYName = "chrY"; + if(not grep {$_ eq "chrY"} $sam->seq_ids){ + if(grep {$_ eq "Y"} $sam->seq_ids){ + $chrYName = "Y"; + } + else{ # not human-ish? + return 0; + } + } + my $low_count_region = 0; + #print STDERR "Checking sex by chrY data...\n" unless $quiet; + $sam->fetch("$chrYName:9200000-9300000", sub {$low_count_region++}); + my $high_count_region = 0; + $sam->fetch("$chrYName:13800000-13900000", sub {$high_count_region++}); + if($high_count_region){ + #print STDERR "$low_count_region/$high_count_region = ", $low_count_region/$high_count_region, "\n" unless $quiet; + } + else{ + #print STDERR "No relevant chrY data\n" unless $quiet; + } + return $high_count_region ? ($low_count_region/$high_count_region > 0.004334299 ? 1 : 0) : 0; +}