annotate depth_reports @ 1:a0f4b5618eee default tip

abstracted capture kits
author Yusuf Ali <ali@yusuf.email>
date Wed, 25 Mar 2015 15:47:16 -0600
parents 6b2e640c8c6d
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
1 #!/usr/bin/env perl
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
2
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
3 use Bio::DB::Sam;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
4 use GD::Graph::bars;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
5 use File::Basename;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
6 use strict;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
7 use warnings;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
8 use vars qw($good_homo_coverage);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
9
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
10 $good_homo_coverage = 3;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
11
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
12 if(@ARGV == 1 and $ARGV[0] eq "-v"){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
13 print "Version 1.0\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
14 exit;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
15 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
16
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
17 # configuration file parsing/loading
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
18 my %config;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
19 my $dirname = dirname(__FILE__);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
20 my $tool_data = shift @ARGV;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
21 if(not -e "$tool_data/depth_report.loc"){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
22 system("cp $dirname/tool-data/depth_report.loc $tool_data/depth_report.loc");
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
23 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
24
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
25 open CONFIG, '<', "$tool_data/depth_report.loc";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
26 while(<CONFIG>){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
27 (my $key, my $value) = split(/\s+/, $_ );
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
28 $config{$key} = $value;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
29 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
30 close CONFIG;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
31
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
32 my $quiet = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
33 if(@ARGV and $ARGV[0] =~ /^-q(?:uiet)?$/){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
34 $quiet = 1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
35 shift @ARGV;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
36 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
37
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
38 my $good_coverage_basic = 20;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
39 @ARGV == 7 or @ARGV == 8 or @ARGV == 9
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
40 or die "Usage: $0 [-q(uiet)] <mapped reads.bam> <targeted regions.bed> <out_summary_table> <out_poor_coverage_bed> <out_depth_graph> <out_depth_table> <out_mapped_percentile_table> [input_dud_regions.bed] [chr#]\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
41
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
42 if(not -e $ARGV[0]){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
43 die "The specified BAM read alignment file ($ARGV[0]) does not exist\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
44 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
45 if(not -r $ARGV[0]){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
46 die "The specified BAM read alignment file ($ARGV[0]) is not readable\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
47 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
48
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
49 if(@ARGV == 7 or $ARGV[7] eq "None"){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
50 $ARGV[7] = "/dev/null";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
51 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
52 if(not -r $ARGV[7]){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
53 die "The specified dud regions BED file ($ARGV[7]) is not readable\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
54 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
55
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
56 my $target_contig;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
57 if(@ARGV == 9){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
58 $target_contig = $ARGV[8]; # for debugging or whole genomes
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
59 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
60
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
61 my $bed_file = $config{"capture_kits_dir"} . $ARGV[1];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
62 my $targeted_total = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
63 my $targeted_regions = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
64 my $targeted_coverage = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
65 my %regions; # contigname => %region{start+-end}
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
66 open(BED, $bed_file )
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
67 or die "Cannot open target regions BED file $bed_file for reading: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
68
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
69 open(STATS, ">$ARGV[2]")
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
70 or die "Cannot open summary table $ARGV[2] for writing: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
71 open(POOR, ">$ARGV[3]")
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
72 or die "Cannot open poor coverage BED file $ARGV[3] for writing: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
73 open(HISTOGRAM, ">$ARGV[4]")
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
74 or die "Cannot open cumulative depth graph $ARGV[4] for writing: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
75 open(COVER, ">$ARGV[5]")
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
76 or die "Cannot open read depth table $ARGV[5] for writing: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
77 open(PERCENTILE, ">$ARGV[6]")
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
78 or die "Cannot open percentile table $ARGV[6] for writing: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
79 print PERCENTILE "# Percentile of mapped bases\tNum reference targeted positions covered\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
80
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
81 print STATS "# Summary statistics for BAM file $ARGV[0] using targeted regions from $bed_file\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
82
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
83 print STDERR "Reading in target regions from BED file\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
84 while(<BED>){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
85 next if /^\s*(?:track\s|#)/;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
86 tr/\r//d;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
87 chomp;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
88 my @fields = split /\t/, $_;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
89 next if defined $target_contig and $fields[0] ne $target_contig;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
90 if(not exists $regions{$fields[0]}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
91 $regions{$fields[0]} = {};
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
92 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
93 $regions{$fields[0]}->{$fields[1].":".$fields[2]} = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
94 $targeted_regions++;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
95 $targeted_total += $fields[2]-$fields[1]+1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
96 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
97 close(BED);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
98
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
99
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
100 print STDERR "Reading in dud regions from BED file\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
101 my %duds; # contigname => %region{start+-end}
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
102 my $duds_total = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
103 open(DUDS, $ARGV[7])
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
104 or die "Cannot open dud regions BED file $ARGV[7] for reading: $!\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
105 while(<DUDS>){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
106 next if /^\s*#/;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
107 chomp;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
108 my @fields = split /\t/, $_;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
109 next if defined $target_contig and $fields[0] ne $target_contig;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
110 if(not exists $regions{$fields[0]}->{$fields[1].$fields[5].$fields[2]}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
111 die "The duds file includes a region ($fields[0]:$fields[1]$fields[5]$fields[2]) ",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
112 "not listed in the targeted regions BED file, ",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
113 "please make sure the two are synched.\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
114 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
115 if(not exists $duds{$fields[0]}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
116 $duds{$fields[0]} = {};
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
117 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
118 $duds{$fields[0]}->{$fields[1].$fields[5].$fields[2]} = 1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
119 $duds_total += $fields[2]-$fields[1]+1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
120 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
121 close(DUDS);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
122
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
123
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
124 my $tick_count = int($targeted_total/100); # for progress bar
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
125
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
126 # Read the alignment info
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
127 # Load the BAM file
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
128 my $sam = Bio::DB::Sam->new(-bam => $ARGV[0], -autoindex => 1);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
129 my $bam_header = $sam->bam->header;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
130 my $contig_names = $bam_header->target_name;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
131
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
132 # Make sure the BED file and reference sequence refer to the same contigs
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
133 for my $contig_name (@$contig_names){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
134 next if defined $target_contig and $contig_name ne $target_contig;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
135 if(not exists $regions{$contig_name}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
136 print STATS "#Warning: The BED file makes no reference to the BAM's $contig_name reference sequence, ",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
137 "no reporting will be done for this contig's mappings\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
138 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
139 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
140
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
141 print STATS "Total number of targeted reference contigs\t", scalar(keys %regions), "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
142 print STATS "Total number of targeted regions\t$targeted_regions\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
143 print STATS "Total number of targeted nucleotide bases\t$targeted_total\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
144 print STATS "Total number of targeted bases considered pre-emptively as duds\t$duds_total\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
145
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
146 # Heuristic
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
147 my $isMale = &isMale($sam);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
148 #print STDERR "Male: $isMale\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
149
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
150 # Check each BED region for coverage stats
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
151 print STDERR "Reading coverage data from BAM file (each dot represents $tick_count bases)\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
152 print STDERR "0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
153 my $base_count = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
154 my @coverages;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
155 for my $contig_name (sort keys %regions){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
156 next if defined $target_contig and $contig_name ne $target_contig;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
157 my $c_name = $contig_name;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
158 if(not grep {$_ eq $c_name} @$contig_names){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
159 $c_name =~ s/^chr//;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
160 if(not grep {$_ eq $c_name} @$contig_names){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
161 print STATS "#Warning: The BAM file makes no reference to the BED's $contig_name reference sequence, ",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
162 "no reporting will be done for this contig's mappings\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
163 next;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
164 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
165 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
166 my $good_coverage = $good_coverage_basic;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
167 $good_coverage = $good_homo_coverage if $contig_name =~ /X$/ and $isMale; # males are hemizygous for X, so adjust the poor coverage threshold accordingly
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
168 for my $range_key (keys %{$regions{$contig_name}}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
169 #print STDERR "Processing range $range_key\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
170 my ($range_min, $range_max) = $range_key =~ /^(\d+):(\d+)$/;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
171 my @cov = $sam->get_features_by_location(-seq_id => $c_name,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
172 -type => "coverage",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
173 -start => $range_min,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
174 -end => $range_max);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
175 if(not @cov){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
176 if(not $quiet){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
177 die "No BAM data for $c_name (", $range_min,",",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
178 $range_max, "). No coverage data returned\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
179 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
180 next;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
181 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
182 @cov = $cov[0]->coverage;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
183
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
184 if(@cov == 0){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
185 @cov = (0) x ($range_max-$range_min+1);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
186 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
187
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
188 # Gather min, Q1, median, Q3, max
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
189 $regions{$contig_name}->{$range_key} = [];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
190 my $ignore = exists $duds{$contig_name}->{$range_key};
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
191 my $low_coverage_start = -1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
192 my @low_covs;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
193 for(my $i = 0; $i <= $#cov; $i++){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
194 $targeted_coverage += $cov[$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
195 if(not $ignore){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
196 $coverages[$base_count++] = $cov[$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
197 if($cov[$i] >= $good_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
198 if(@low_covs){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
199 &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
200 @low_covs = (); # clear!
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
201 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
202 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
203 else{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
204 if(not @low_covs){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
205 # starting a low coverage section
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
206 $low_coverage_start = $i;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
207 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
208 # else continuing a low coverage section
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
209 push @low_covs, $cov[$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
210 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
211 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
212 print STDERR "." if not $quiet and $base_count%$tick_count == 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
213 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
214 if(@low_covs){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
215 &print_low($contig_name, \@low_covs, $range_min, $low_coverage_start);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
216 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
217 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
218 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
219 print STDERR "$base_count/",($targeted_total-$duds_total),"\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
220
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
221 print STATS "Total number of bases mapped to targeted regions\t$targeted_coverage\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
222
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
223 my $percentile_size = int($targeted_coverage/100);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
224
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
225 # Generate a cumulative distribution of coverage
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
226 print STDERR "Sorting mapped read depth statistics\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
227 @coverages = sort {$a <=> $b} @coverages;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
228
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
229 print STATS "Minimum coverage in targeted regions\t$coverages[0]\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
230 print STATS "Median coverage in targeted regions\t", $coverages[int($#coverages/2)], "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
231 print STATS "Mean coverage in targeted regions\t", ($targeted_coverage/$targeted_total), "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
232 print STATS "Maximum coverage in targeted regions\t", $coverages[$#coverages], "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
233
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
234 # Estimate the false negative rates
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
235 print STDERR "Estimating SNP discovery sensitivity...\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
236 print COVER "# Mapped read depth\tNumber of bases\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
237 my $last_coverage = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
238 my $coverage_count = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
239 my $homo_misses = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
240 my $het_misses = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
241 my @depth_labels;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
242 my @cumu_pct;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
243 $#coverages = int($#coverages*0.98); # Look at percentiles 0-98, otherwise chart is too big due to long tail
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
244 for my $c (@coverages){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
245 if($c != $last_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
246 if($last_coverage != 0){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
247 # fill in any missing values for the x axis of the cumulative distribution graph
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
248 for(my $i = $#depth_labels+1; $i < $last_coverage; $i++){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
249 push @depth_labels, ($i%10?"":$i);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
250 push @cumu_pct, $cumu_pct[$i-1];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
251 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
252 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
253 push @depth_labels, ($last_coverage%10?"":$last_coverage); # label every 10 bars
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
254 if($last_coverage == 0){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
255 push @cumu_pct, $coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
256 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
257 else{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
258 push @cumu_pct, $cumu_pct[$#cumu_pct]+$coverage_count/($targeted_total-$duds_total)*100; # cumulative percentage
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
259 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
260 print COVER "$last_coverage\t$coverage_count\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
261 if($c <= $good_coverage_basic){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
262 $homo_misses += false_neg_homo_count($last_coverage, $coverage_count);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
263 $het_misses += false_neg_het_count($last_coverage, $coverage_count);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
264 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
265 $coverage_count = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
266 $last_coverage = $c;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
267 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
268 $coverage_count++;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
269 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
270 $homo_misses = int($homo_misses+0.5);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
271 $het_misses = int($het_misses+0.5);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
272 print STATS "Estimated percentage of homozygous SNPs missed\t", $homo_misses/($targeted_total*0.000358*0.35)*100, "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
273 print STATS "Estimated number of homozygous SNPs missed\t$homo_misses\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
274 print STATS "Estimated percentage of heterozygous SNPs missed\t", $het_misses/($targeted_total*0.000358*0.65)*100, "\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
275 print STATS "Estimated number of heterozygous SNPs missed\t$het_misses\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
276
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
277 print STDERR "Generating coverage percentiles and false negative/false positive estimates\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
278
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
279 my $percentile_reporting = 1;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
280 my $lt_fold_10 = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
281 my $lt_fold_20 = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
282 my $poor_cutoff_percentile = 5;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
283 my $poor_cutoff_coverage = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
284 my $running_cov_tot;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
285 for (my $i = 0; $i <= $#coverages; $i++){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
286 $running_cov_tot += $coverages[$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
287 if($coverages[$i] < $good_coverage_basic){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
288 $lt_fold_20++;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
289 if($coverages[$i] < $good_homo_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
290 $lt_fold_10++;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
291 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
292 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
293 if($running_cov_tot >= $percentile_reporting*$percentile_size){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
294 print PERCENTILE "$percentile_reporting\t$i\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
295 $percentile_reporting++;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
296 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
297 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
298 close(PERCENTILE);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
299
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
300 print STATS "Total bases with less than $good_homo_coverage-fold coverage\t$lt_fold_10\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
301 print STATS "Total bases with less than $good_coverage_basic-fold coverage\t$lt_fold_20\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
302 close(STATS);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
303
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
304 print STDERR "Generating read depth cumulative distribution graph\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
305 # Using three colors to show trouble coverage levels
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
306 my @low_coverage = @cumu_pct[0..$good_homo_coverage];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
307 my @medium_coverage = (("0") x ($good_homo_coverage-1), @cumu_pct[$good_homo_coverage..($good_coverage_basic-1)]);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
308 for(my $i = 0; $i < $good_coverage_basic; $i++){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
309 $cumu_pct[$i] = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
310 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
311 my $graph = new GD::Graph::bars(($#cumu_pct*2)+100, 600);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
312 $graph->set(x_label => "Mapped read depth",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
313 y_label => "Percentage of target reference",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
314 title => "Cumulative distribution of mapped read depth (0 to 98th percentiles)",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
315 cumulate => 1,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
316 transparent => 0,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
317 bar_spacing => 0,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
318 bar_width => 2,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
319 fgclr => "black",
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
320 dclrs => ["red", "yellow", "green"],
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
321 x_ticks => 0,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
322 long_ticks => 1,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
323 tick_length => 0,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
324 y_tick_number => 10,
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
325 y_number_format =>'%d%%',
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
326 box_axis => 0)
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
327 or die "While setting up chart", $graph->error;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
328 my $gd = $graph->plot([\@depth_labels, \@low_coverage, \@medium_coverage, \@cumu_pct]) or die $graph->error;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
329 binmode HISTOGRAM;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
330 print HISTOGRAM $gd->png;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
331 close(HISTOGRAM);
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
332
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
333 # args: read depth, count of bases with this read depth
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
334 sub false_neg_homo_count{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
335 if($_[0] < $good_homo_coverage){ # 0.000358 based on doi:10.1371/journal.pgen.1000160
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
336 return 0.000358*0.35*$_[1]; # 35% of SNPs are homo based on own observations
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
337 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
338 elsif($_[0] > $good_coverage_basic){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
339 return 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
340 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
341 return (-0.118*log($_[0])+0.27)*0.000358*$_[1];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
342 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
343
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
344 sub false_neg_het_count{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
345 if($_[0] < $good_homo_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
346 return 0.000358*0.65*$_[1]; # 65% of SNPs are het based on own observations
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
347 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
348 elsif($_[0] >= $good_coverage_basic){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
349 return 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
350 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
351 return (-0.0004*($_[0]**2)-0.00048*$_[0]+0.2483)*0.000358*$_[1];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
352 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
353
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
354 sub print_low{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
355 my ($contig_name, $low_covs_ref, $base_pos, $low_offset) = @_;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
356 # exiting a low coverage section, print it, split into poor het call regions, and poor homo call regions
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
357 my $start = $base_pos+$low_offset;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
358 my $last_i = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
359 for(my $i = 1; $i <= $#{$low_covs_ref}; $i++){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
360 if($low_covs_ref->[$i] >= $good_homo_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
361 if($low_covs_ref->[$last_i] < $good_homo_coverage){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
362 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
363 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
364 $last_i = $i;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
365 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
366 if($i == $#{$low_covs_ref}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
367 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
368 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
369 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
370 # else continuation of a good homo coverage region
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
371 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
372 else{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
373 if($low_covs_ref->[$last_i] >= $good_homo_coverage or $i == $#{$low_covs_ref}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
374 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..($i-1)];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
375 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t1\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
376 $last_i = $i;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
377 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
378 if($i == $#{$low_covs_ref}){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
379 my @sorted_low_covs = sort {$a <=> $b} @{$low_covs_ref}[$last_i..$i];
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
380 print POOR "$contig_name\t",$start+$last_i,"\t", $start+$i, "\t", $sorted_low_covs[0]."-".$sorted_low_covs[$#sorted_low_covs], "\t0\n";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
381 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
382 # else continuation of a poor homo region
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
383 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
384 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
385 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
386
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
387 sub isMale{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
388 my ($sam) = @_;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
389 # Local observation:
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
390 # A robust measure across whole genome and exome kits (generally not capturing any Y genes)
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
391 # to detect female is an exceptionally low ratio for the number of reads mapped to chrY:9200000-9300000
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
392 # (which contains several testes-specific genes), and chrY:13800000-13900000 (which is highly repetitive)
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
393 # in hg19 (UCSC).
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
394 # This holds regardless of the read depth for the experiment, so should be robust. Females have
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
395 # an average ratio of 0.000583843, with a std dev of 0.00069961. We'll set the threshold to 0.004334299 (mu + 3*sigma)
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
396 # to be 99% sure it's not a female sample (I know normal dist isn't the correct model here, but it's close enough). -Paul G. 2013-11-01
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
397 my $chrYName = "chrY";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
398 if(not grep {$_ eq "chrY"} $sam->seq_ids){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
399 if(grep {$_ eq "Y"} $sam->seq_ids){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
400 $chrYName = "Y";
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
401 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
402 else{ # not human-ish?
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
403 return 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
404 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
405 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
406 my $low_count_region = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
407 #print STDERR "Checking sex by chrY data...\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
408 $sam->fetch("$chrYName:9200000-9300000", sub {$low_count_region++});
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
409 my $high_count_region = 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
410 $sam->fetch("$chrYName:13800000-13900000", sub {$high_count_region++});
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
411 if($high_count_region){
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
412 #print STDERR "$low_count_region/$high_count_region = ", $low_count_region/$high_count_region, "\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
413 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
414 else{
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
415 #print STDERR "No relevant chrY data\n" unless $quiet;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
416 }
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
417 return $high_count_region ? ($low_count_region/$high_count_region > 0.004334299 ? 1 : 0) : 0;
6b2e640c8c6d initial commit
Yusuf Ali <ali@yusuf.email>
parents:
diff changeset
418 }