Mercurial > repos > jjohnson > crest
comparison SVUtil.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 package SVUtil; | |
2 | |
3 use strict; | |
4 use Carp; | |
5 use English; | |
6 use Bio::SeqIO; | |
7 use Bio::DB::Sam; | |
8 use File::Spec; | |
9 use File::Path qw(remove_tree); | |
10 use File::Temp qw/ tempfile tempdir /; | |
11 use Cwd; | |
12 use SCValidator qw(LEFT_CLIP RIGHT_CLIP); | |
13 require Exporter; | |
14 our @ISA = ('Exporter'); | |
15 our @EXPORT = qw(is_new_pair clip_fa_file prepare_reads parse_range | |
16 get_input_bam get_work_dir is_PCR_dup read_fa_file find_smallest_cover | |
17 get_sclip_reads rev_comp); | |
18 our @EXPORT_OK = qw($use_scratch $spacer_seq $spcer_seq_len $work_dir); | |
19 our $use_scratch = 1; | |
20 our $work_dir; | |
21 | |
22 sub rev_comp { | |
23 my $str = shift; | |
24 $str = reverse $str; | |
25 $str =~ tr/ACTG/TGAC/; | |
26 return $str; | |
27 } | |
28 # something about binormial distribution | |
29 sub choose { | |
30 my ($n,$k) = @_; | |
31 my ($result,$j) = (1,1); | |
32 | |
33 return 0 if $k>$n||$k<0; | |
34 $k = ($n - $k) if ($n - $k) <$k; | |
35 | |
36 while ($j <= $k ) { | |
37 $result *= $n--; | |
38 $result /= $j++; | |
39 } | |
40 return $result; | |
41 } | |
42 | |
43 sub dbinorm { #desity funtion | |
44 my($n, $k, $p) = @_; | |
45 return unless($n > 0 && $k >= 0 && $n >= $k && $p >= 0 && $p <=1); | |
46 my $prob = log(choose($n, $k)) + ($n-$k)*log(1-$p) + $k*log($p); | |
47 return exp($prob); | |
48 } | |
49 | |
50 sub find_smallest_cover { | |
51 my ($c, $p, $crit) = @_; | |
52 my @s_cover; | |
53 for( my $i = 1; $i < $c; $i++) { | |
54 my $n = 1; | |
55 while(1) { | |
56 my $tmp = 0; | |
57 for( my $j = 0; $j <= $i && $j < $n; $j++) { | |
58 $tmp += dbinorm($n, $j, $p); | |
59 } | |
60 if( $tmp < $crit) { | |
61 $s_cover[$i] = $n - 1; | |
62 last; | |
63 } | |
64 $n++; | |
65 } | |
66 } | |
67 return \@s_cover; | |
68 } | |
69 | |
70 | |
71 | |
72 sub read_fa_file { | |
73 my $file = shift; | |
74 my $in = Bio::SeqIO->new(-file => $file, -format => 'Fasta'); | |
75 my %seqs; | |
76 while( my $seq=$in->next_seq()) { | |
77 $seqs{$seq->display_id} = $seq->seq; | |
78 } | |
79 return \%seqs; | |
80 } | |
81 | |
82 # rmdup: remove PCR dumplication related code | |
83 # you can replace it using your own criteria | |
84 sub is_new_pair { | |
85 my ($a, $sclip_len, $pairs) = @_; | |
86 foreach my $p (@{$pairs}) { | |
87 return undef if($a->start == $p->[0] && $a->end == $p->[1] && | |
88 $a->mate_start == $p->[2] && $a->mate_end == $p->[3] && $sclip_len == $p->[4]); | |
89 } | |
90 return 1; | |
91 } | |
92 | |
93 # | |
94 #clipping sequnce related code | |
95 # | |
96 # the spacer sequence, you can replace it with your own one | |
97 my $spacer_seq = 'ACGTACGT' . 'CGTA' x 6 . 'ATGCATGC'; | |
98 my $spacer_seq_len = length($spacer_seq); | |
99 | |
100 # clip the reads that with spacer_seq at one end | |
101 sub clip_fa_file { | |
102 my ($file, $ort) = @_; | |
103 my $out = $file . ".clip.fa"; | |
104 my $in = Bio::SeqIO->new( -file => $file, -format => 'Fasta'); | |
105 open my $OUT, ">$out" or croak "can't open $file.fa:$OS_ERROR"; | |
106 while( my $seq=$in->next_seq()) { | |
107 print $OUT ">", $seq->display_id, "\n"; | |
108 if($ort eq RIGHT_CLIP) { | |
109 print $OUT $seq->subseq($spacer_seq_len + 1, $seq->length), "\n"; | |
110 } | |
111 else { | |
112 print $OUT $seq->subseq(1, $seq->length - $spacer_seq_len), "\n"; | |
113 } | |
114 } | |
115 close($OUT); | |
116 return $out; | |
117 } | |
118 | |
119 # prepare the reads into a fasta file | |
120 # due to most assembler can't handle short read well, we add | |
121 # the spacer_seq to the start/end of each sequence arrording to | |
122 # the softclip postion | |
123 sub prepare_reads { | |
124 my($r_reads, $clip) = @_; | |
125 my ($fh, $fa_name) = tempfile( DIR => $work_dir, SUFFIX => '.fa'); | |
126 my $qual_name = $fa_name . ".qual"; | |
127 my $spacer_qual = ' 50 ' x $spacer_seq_len; | |
128 | |
129 open my $qual_fh, ">$qual_name" or croak "can't open $qual_name:$OS_ERROR"; | |
130 foreach my $r (@{$r_reads}) { | |
131 print $fh ">", $r->{name}, "\n"; | |
132 my $tmp = $r->{seq}; | |
133 $tmp = defined $clip ? | |
134 ($clip == RIGHT_CLIP ? $spacer_seq . $tmp : $tmp . $spacer_seq) : | |
135 $tmp; | |
136 print $fh $tmp, "\n"; | |
137 next unless $r->{qual}; | |
138 print $qual_fh ">", $r->{name}, "\n"; | |
139 $tmp = join(" ", @{$r->{qual}}); | |
140 $tmp = defined $clip ? | |
141 ($clip == RIGHT_CLIP ? $spacer_qual . $tmp : $tmp . $spacer_qual) : | |
142 $tmp; | |
143 print $qual_fh $tmp, "\n"; | |
144 } | |
145 close $qual_fh; | |
146 return $fa_name; | |
147 } | |
148 | |
149 # check PCR duplicate | |
150 sub is_PCR_dup { | |
151 my ($a, $pairs, $sclip_len) = @_; | |
152 my ($s, $e, $ms, $me ) = ($a->start, $a->end, $a->mate_start ? $a->mate_start : 0, | |
153 $a->mate_end ? $a->mate_end : 0); | |
154 foreach my $p (@{$pairs}) { | |
155 return 1 if($s == $p->[0] && $e == $p->[1] && | |
156 $ms == $p->[2] && $me == $p->[3] && $sclip_len == $p->[4]); | |
157 } | |
158 return; | |
159 } | |
160 | |
161 # parse the range of input, format is: chr:start-end | |
162 # start and end is optional | |
163 sub parse_range { | |
164 my $range = shift; | |
165 my ($chr, $start, $end); | |
166 my @field = split /:/, $range; | |
167 $chr = $field[0]; | |
168 # $chr = substr($chr, 3) if($chr =~ /^chr/); | |
169 if(@field > 1) { | |
170 @field = split /-/, $field[1]; | |
171 $start = $field[0]; | |
172 $end = $field[1] if($field[1]); | |
173 if($start !~ m/^\d+$/ or $end !~ m/^\d+$/) { | |
174 croak "wrong range format, need to be: chr:start-end"; | |
175 } | |
176 } | |
177 return ($chr, $start, $end); | |
178 } | |
179 | |
180 # St. Jude only | |
181 # grab the bam file from the bam dir | |
182 my $raw_bam_dir = "/nfs_exports/genomes/1/PCGP/BucketRaw"; | |
183 sub get_input_bam { | |
184 my $sample = shift; | |
185 my ($group) = $sample =~ /(.*?)\d+/; | |
186 my $tmp_bam_dir = File::Spec->catdir($raw_bam_dir, $group); | |
187 opendir(my $dh, $tmp_bam_dir) or croak "can't open directory: $raw_bam_dir: $OS_ERROR"; | |
188 my @files = grep { /^$sample-.*bam$/ } readdir($dh); | |
189 close $dh; | |
190 return File::Spec->catfile($raw_bam_dir, $group, $files[0]); | |
191 } | |
192 | |
193 #scratch related code | |
194 sub get_work_dir { | |
195 my %options = @_; | |
196 my $scratch_dir; | |
197 if($options{-SCRATCH}){ | |
198 $scratch_dir = (exists $options{-DIR} ? $options{-DIR} : | |
199 (-e '/scratch_space' ? '/scratch_space' : undef) ); | |
200 } | |
201 | |
202 my $dir; | |
203 if($scratch_dir) { | |
204 $scratch_dir = File::Spec->rel2abs($scratch_dir); | |
205 if($ENV{'JOB_ID'}) { | |
206 $dir = File::Spec->catdir($scratch_dir, $ENV{'JOB_ID'}) | |
207 } | |
208 else { | |
209 $dir = tempdir(DIR => $scratch_dir); | |
210 } | |
211 } | |
212 else { | |
213 $dir = tempdir(); | |
214 } | |
215 mkdir($dir) if( ! -e $dir); | |
216 $work_dir = $dir; | |
217 return $dir; | |
218 } | |
219 | |
220 | |
221 sub get_sclip_reads { | |
222 my %args = @_; | |
223 my ($sam, $chr, $start, $end, $clip, $min_len, $validator, $paired, $rmdup, $full_seq, $extra_base) = | |
224 ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-CLIP}, | |
225 $args{-MINCLIPLEN}, $args{-VALIDATOR}, $args{-PAIRED}, $args{-RMDUP}, | |
226 $args{-FULLSEQ}, $args{-EXTRA}); | |
227 $extra_base = 0 unless $extra_base; | |
228 my @rtn; | |
229 $min_len = 0 unless $min_len; | |
230 my $max_sclip_len = 0; | |
231 my $range = $chr; | |
232 $range = $chr . ":" . $start . "-" . $end if($start && $end); | |
233 my @pairs; | |
234 my $strand_validator = $paired ? 1 : 0; | |
235 | |
236 $sam->fetch($range, sub { | |
237 my $a = shift; | |
238 my $cigar_str = $a->cigar_str; | |
239 my @cigar_array = @{$a->cigar_array}; | |
240 return if($cigar_str !~ m/S/); | |
241 return unless($a->start && $a->end); | |
242 #return if(!$a->proper_pair && $paired); | |
243 if($paired && !$strand_validator) { | |
244 $validator->add_validator("strand_validator"); | |
245 } | |
246 | |
247 if($paired && !$a->proper_pair) { | |
248 $validator->remove_validator("strand_validator"); | |
249 $strand_validator = 0; | |
250 } | |
251 my ($sclip_len, $pos, $seq, $qual); | |
252 my @tmp = $a->qscore; | |
253 if($cigar_array[0]->[0] eq 'S' && $clip == LEFT_CLIP ) { | |
254 $pos = $a->start; | |
255 return if($pos < $start or $pos > $end); # the softclipping position is not in range | |
256 $sclip_len = $cigar_array[0]->[1]; | |
257 #print $cigar_str, "\t", scalar @pairs, "\n"; | |
258 return unless($validator && $validator->validate($a, LEFT_CLIP)); | |
259 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len); | |
260 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate | |
261 $seq = $full_seq ? $a->query->dna : substr($a->query->dna, 0, $sclip_len + $extra_base); | |
262 @tmp = $full_seq ? @tmp : @tmp[0 .. ($sclip_len -1 + $extra_base)]; | |
263 | |
264 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, | |
265 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup); | |
266 my $qscore = $a->qscore; | |
267 | |
268 push @rtn, { | |
269 name => $a->qname, | |
270 seq => $seq, | |
271 qual => \@tmp, | |
272 full_seq => $a->query->dna, | |
273 full_qual => $qscore, | |
274 pos => $pos, | |
275 }; | |
276 } | |
277 if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP ) { | |
278 $pos = $a->end; | |
279 return if($pos < $start or $pos > $end); # the softclipping position is not in range | |
280 # print $cigar_str, "\t", scalar @pairs, "\n"; | |
281 return unless $validator->validate($a, RIGHT_CLIP); | |
282 $sclip_len = $cigar_array[$#cigar_array]->[1]; | |
283 $max_sclip_len = $sclip_len if($max_sclip_len < $sclip_len); | |
284 return if(@pairs > 0 && $rmdup && is_PCR_dup($a, \@pairs, $sclip_len)); #it's a PCR dumplicate | |
285 $seq = $full_seq ? $a->qseq : substr($a->qseq, $a->l_qseq - $sclip_len - $extra_base ); | |
286 @tmp = $full_seq ? @tmp : @tmp[($a->l_qseq - $sclip_len - $extra_base) .. ($a->l_qseq - 1)]; | |
287 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, | |
288 $a->mate_end ? $a->mate_end : 0, $sclip_len] if($rmdup); | |
289 my $qscore = $a->qscore; | |
290 push @rtn, { | |
291 name => $a->qname, | |
292 seq => $seq, | |
293 qual => \@tmp, | |
294 full_seq => $a->query->dna, | |
295 full_qual => $qscore, | |
296 pos => $pos, | |
297 }; | |
298 } | |
299 } | |
300 ); | |
301 if($max_sclip_len >= $min_len) { | |
302 return @rtn; | |
303 } | |
304 else{ | |
305 undef @rtn; return @rtn; | |
306 } | |
307 } | |
308 | |
309 my $start_dir; | |
310 BEGIN { | |
311 $start_dir = getcwd; | |
312 } | |
313 END { | |
314 chdir($start_dir); | |
315 remove_tree($work_dir) if($use_scratch && $work_dir && $start_dir ne $work_dir); | |
316 } | |
317 | |
318 1; |