Mercurial > repos > jjohnson > crest
comparison StructVar.pm @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 package StructVar; | |
2 | |
3 # $Id: StructVar.pm, v 1.00 2010/09/24 jianmin.wang Exp $ | |
4 | |
5 use strict; | |
6 use Bio::DB::Sam; | |
7 use Carp; | |
8 use Data::Dumper; | |
9 use English; | |
10 use Bio::SearchIO; | |
11 use GeneModel; | |
12 use Cwd; | |
13 use SVExtTools; | |
14 use List::MoreUtils qw/ uniq /; | |
15 use SVUtil qw( prepare_reads read_fa_file get_sclip_reads rev_comp); | |
16 use SCValidator qw(LEFT_CLIP RIGHT_CLIP); | |
17 | |
18 use constant UNK => 0; | |
19 use constant CTX => 1; | |
20 use constant INV => 2; | |
21 use constant INS => 3; | |
22 use constant DEL => 4; | |
23 use constant ITX => 5; | |
24 | |
25 # type of SVs and thier string | |
26 my %type2str = ( | |
27 0 => 'UNKNOWN', # unknow type, not single event | |
28 1 => 'CTX', # cross chromosome translocation | |
29 2 => 'INV', # inversion, requires both breakpoints available | |
30 3 => 'INS', # (tandem) insertion | |
31 4 => 'DEL', # deletion | |
32 5 => 'ITX', # within chromosome translocation | |
33 ); | |
34 # class static variables | |
35 my $sam_d; | |
36 my $sam_g; | |
37 my $gm; | |
38 my $assembler; | |
39 my $aligner; | |
40 my $mapper; | |
41 my @filters = (\&_germline_sclip_filter, \&_type_distance_filter, \&_germline_indel_filter); | |
42 my $RNASeq; | |
43 my $read_len; | |
44 my $genome; | |
45 my $tr_max_indel_size; #tandem repeat mediated indel size | |
46 my $tr_min_size; | |
47 my $tr_max_size; #tandem repeat max_size of single repeat | |
48 my $tr_min_num; #tandem repeat minimum number of occurence | |
49 | |
50 my $max_bp_dist = 15; | |
51 my $germline_seq_width = 100; | |
52 my $germline_search_width = 50; | |
53 my $min_percent_cons_of_read = 0.75; | |
54 | |
55 my %default_filters = ( | |
56 'germline_sclip' => \&_germline_sclip_filter, | |
57 'type_distance' => \&_type_distance_filter, | |
58 'germline_indel' => \&_germline_indel_filter, | |
59 # 'mapping_artifact' => \&_mapping_artifact_filter, | |
60 'mapping_quality' => \&_mapping_quality_filter, | |
61 'tandem_repeat' => \&_tandem_repeat_filter, | |
62 ); | |
63 | |
64 sub new { | |
65 my $class = shift; | |
66 my %param = @_; | |
67 my $self = {}; | |
68 $self->{first_bp} = {}; | |
69 $self->{second_bp} = {}; | |
70 $self->{first_bp} = $param{-FIRST_BP} if($param{-FIRST_BP}); | |
71 $self->{second_bp} = $param{-SECOND_BP} if($param{-SECOND_BP}); | |
72 $self->{consensus} = $param{-CONSENSUS} ? $param{-CONSENSUS} : ""; | |
73 $self->{type} = ""; | |
74 bless $self, ref($class) || $class; | |
75 return $self; | |
76 } | |
77 | |
78 sub add_filter { | |
79 my $self = shift; | |
80 my $name = shift; | |
81 my $f = shift; | |
82 croak "you must specify a filter" if(!$f); | |
83 croak "the filter must be a subroutine" if(ref($f) ne 'CODE'); | |
84 $default_filters{$name} = $f; | |
85 } | |
86 | |
87 sub add_RNASeq_filter { | |
88 $default_filters{'RNASeq_strand'} = \&_RNASeq_strand_filter; | |
89 $default_filters{'RNASeq_INS'} = \&_RNASeq_INS_filter; | |
90 } | |
91 | |
92 sub remove_filter { | |
93 my $self = shift; | |
94 my $name = shift; | |
95 if($default_filters{$name}){ | |
96 delete $default_filters{$name}; | |
97 } | |
98 else { | |
99 print STDERR "Filter $name is not in filter list"; | |
100 } | |
101 } | |
102 | |
103 sub tr_max_indel_size { #tandem repeat mediated indel size | |
104 my $self = shift; | |
105 my $value = shift; | |
106 $tr_max_indel_size = $value if($value); | |
107 return $tr_max_indel_size; | |
108 } | |
109 | |
110 sub min_percent_cons_of_read { | |
111 my $self= shift; | |
112 my $value = shift; | |
113 $min_percent_cons_of_read = $value if($value); | |
114 return $min_percent_cons_of_read; | |
115 } | |
116 | |
117 sub tr_min_size { | |
118 my $self = shift; | |
119 my $value = shift; | |
120 $tr_min_size = $value if($value); | |
121 return $tr_min_size; | |
122 } | |
123 sub germline_seq_width { | |
124 my $self = shift; | |
125 my $value = shift; | |
126 $germline_seq_width = $value if($value); | |
127 return $germline_seq_width; | |
128 } | |
129 | |
130 sub germline_search_width { | |
131 my $self = shift; | |
132 my $value = shift; | |
133 $germline_search_width = $value if($value); | |
134 return $germline_search_width; | |
135 } | |
136 | |
137 sub max_bp_dist { | |
138 my $self = shift; | |
139 my $value = shift; | |
140 $max_bp_dist = $value if($value); | |
141 return $max_bp_dist; | |
142 } | |
143 | |
144 sub tr_max_size { #tandem repeat max_size of single repeat | |
145 my $self = shift; | |
146 my $value = shift; | |
147 $tr_max_size = $value if($value); | |
148 return $tr_max_size; | |
149 } | |
150 | |
151 sub tr_min_num { #tandem repeat minimum number of occurence | |
152 my $self = shift; | |
153 my $value = shift; | |
154 $tr_min_num = $value if($value); | |
155 return $tr_min_num; | |
156 } | |
157 sub genome { | |
158 my $self = shift; | |
159 my $value = shift; | |
160 $genome = $value if($value); | |
161 return $genome; | |
162 } | |
163 sub sam_d { | |
164 my $self = shift; | |
165 my $value = shift; | |
166 $sam_d = $value if($value); | |
167 return $sam_d; | |
168 } | |
169 | |
170 sub sam_g { | |
171 my $self = shift; | |
172 my $value = shift; | |
173 $sam_g = $value if($value); | |
174 return $sam_g; | |
175 } | |
176 | |
177 sub assembler { | |
178 my $self = shift; | |
179 my $value = shift; | |
180 $assembler = $value if($value); | |
181 return $assembler; | |
182 } | |
183 | |
184 sub aligner { | |
185 my $self = shift; | |
186 my $value = shift; | |
187 $aligner = $value if($value); | |
188 return $aligner; | |
189 } | |
190 | |
191 sub RNASeq { | |
192 my $self = shift; | |
193 my $value = shift; | |
194 $RNASeq = 1 if($value); | |
195 return $RNASeq; | |
196 } | |
197 | |
198 sub mapper { | |
199 my $self = shift; | |
200 my $value = shift; | |
201 $mapper = $value if($value); | |
202 return $mapper; | |
203 } | |
204 | |
205 sub gene_model { | |
206 my $self = shift; | |
207 my $value = shift; | |
208 $gm = $value if($value); | |
209 return $gm; | |
210 } | |
211 | |
212 sub read_len { | |
213 my $self = shift; | |
214 my $value = shift; | |
215 $read_len = $value if($value); | |
216 return $read_len; | |
217 | |
218 } | |
219 sub first_bp { | |
220 my $self = shift; | |
221 my %param = @_; | |
222 if(%param) { | |
223 $self->{first_bp}{left_chr} = $param{-LEFT_CHR}; | |
224 $self->{first_bp}{left_pos} = $param{-LEFT_POS}; | |
225 $self->{first_bp}{left_ort} = $param{-LEFT_ORT}; | |
226 $self->{first_bp}{right_chr} = $param{-RIGHT_CHR}; | |
227 $self->{first_bp}{right_pos} = $param{-RIGHT_POS}; | |
228 $self->{first_bp}{right_ort} = $param{-RIGHT_ORT}; | |
229 $self->{first_bp}{sc_reads} = $param{-SC_READS}; | |
230 $self->{first_bp}{pos} = $param{-POS}; | |
231 $self->{first_bp}{cover} = $param{-COVER}; | |
232 $self->{first_bp}{sc_seq} = $param{-SC_SEQ}; | |
233 $self->{first_bp}{chr} = $param{-CHR}; | |
234 } | |
235 return $self->{first_bp}; | |
236 } | |
237 | |
238 sub second_bp { | |
239 my $self = shift; | |
240 my %param = @_; | |
241 if(%param) { | |
242 $self->{second_bp}{left_chr} = $param{-LEFT_CHR}; | |
243 $self->{second_bp}{left_pos} = $param{-LEFT_POS}; | |
244 $self->{second_bp}{left_ort} = $param{-LEFT_ORT}; | |
245 $self->{second_bp}{right_chr} = $param{-RIGHT_CHR}; | |
246 $self->{second_bp}{right_pos} = $param{-RIGHT_POS}; | |
247 $self->{second_bp}{right_ort} = $param{-RIGHT_ORT}; | |
248 $self->{second_bp}{sc_reads} = $param{-SC_READS}; | |
249 $self->{second_bp}{pos} = $param{-POS}; | |
250 $self->{second_bp}{cover} = $param{-COVER}; | |
251 $self->{second_bp}{sc_seq} = $param{-SC_SEQ}; | |
252 $self->{second_bp}{chr} = $param{-CHR}; | |
253 } | |
254 return $self->{second_bp}; | |
255 } | |
256 | |
257 sub type { | |
258 my $self = shift; | |
259 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
260 if($bp1->{left_chr} ne $bp1->{right_chr}) { | |
261 return CTX if($bp1->{left_ort} eq $bp2->{left_ort} && | |
262 $bp1->{right_ort} eq $bp2->{right_ort}); | |
263 | |
264 return UNK; | |
265 } | |
266 if( $bp1->{left_ort} eq $bp1->{right_ort} && | |
267 $bp2->{left_ort} eq $bp2->{right_ort} ) { # Insertion or deletion | |
268 return DEL if( $bp1->{left_pos} <= $bp1->{right_pos} && | |
269 $bp2->{left_pos} <= $bp2->{right_pos}); | |
270 return INS if( $bp1->{left_pos} >= $bp1->{right_pos} && | |
271 $bp2->{left_pos} >= $bp2->{right_pos}); | |
272 return UNK; | |
273 } | |
274 if( $bp1->{left_ort} ne $bp1->{right_ort} && | |
275 $bp2->{left_ort} ne $bp2->{right_ort} ) { | |
276 return INV if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '-') | |
277 || ($bp2->{left_ort} eq '+' && $bp1->{left_ort} eq '-')); | |
278 return ITX if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '+') | |
279 || ($bp1->{left_ort} eq '-' && $bp2->{left_ort} eq '-') ); | |
280 return UNK; | |
281 } | |
282 return UNK; | |
283 } | |
284 | |
285 sub to_string { | |
286 my $self = shift; | |
287 my $type = $self->type; | |
288 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
289 my ($left_len, $right_len) = (0, 0); | |
290 if(exists $bp1->{left_len}) { | |
291 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len}); | |
292 } | |
293 else { | |
294 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq}); | |
295 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq}); | |
296 } | |
297 if($type == INV && $bp1->{pos} > $bp2->{pos}) { | |
298 ($bp1, $bp2) = ($bp2, $bp1); | |
299 ($left_len, $right_len) = ($right_len, $left_len); | |
300 } | |
301 my ($cover1, $cover2) = ($bp1->{cover}, $bp2->{cover}); | |
302 | |
303 my ($chrA, $chrB, $ortA, $ortB) = ($bp1->{left_chr}, $bp1->{right_chr}, | |
304 $bp1->{left_ort}, $bp1->{right_ort}); | |
305 my ($posA, $posB, $countA, $countB) = ($bp1->{pos}, $bp2->{pos}, $bp1->{sc_reads}, $bp2->{sc_reads}); | |
306 if($bp1->{chr} eq $bp1->{right_chr} && $bp1->{pos} == $bp1->{right_pos}) { | |
307 $posA = $bp2->{pos}; | |
308 $posB = $bp1->{pos}; | |
309 $countA = $bp2->{sc_reads}; | |
310 $countB = $bp1->{sc_reads}; | |
311 ($cover1, $cover2) = ($cover2, $cover1); | |
312 ($left_len, $right_len) = ($right_len, $left_len); | |
313 } | |
314 | |
315 my $rtn = join("\t", ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB, | |
316 $type2str{$type}, $cover1, $cover2, $left_len, $right_len, $self->get_statistics(100), | |
317 $self->{c_start}, $self->{start_chr}, $self->{t_start}, | |
318 $self->{c_end}, $self->{end_chr}, $self->{t_end}, $self->{consensus})); | |
319 if($self->{type} == INV) { | |
320 $rtn = join("\t", ($rtn, $self->{c_start2}, $self->{start_chr2}, $self->{t_start2}, | |
321 $self->{c_end2}, $self->{end_chr2}, $self->{t_end}, $self->{consensus2})); | |
322 } | |
323 return $rtn; | |
324 } | |
325 | |
326 sub set_consensus { | |
327 my $self = shift; | |
328 my ($validator, $paired ) = @_; | |
329 if(!$assembler) { | |
330 print STDERR "No assembler set, no consensus generated\n"; | |
331 return; | |
332 } | |
333 my %all_reads; | |
334 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
335 my $N = 0; | |
336 | |
337 my @s_reads1 = $self->_get_reads($sam_d, $bp1); | |
338 my @s_reads2 = $self->_get_reads($sam_d, $bp2); | |
339 | |
340 if($self->{type} == INV) { | |
341 $self->{consensus} = _generate_consensus(\@s_reads1); | |
342 $self->{consensus2} = _generate_consensus(\@s_reads2); | |
343 } | |
344 else { | |
345 push @s_reads1, @s_reads2; | |
346 $self->{consensus} = _generate_consensus(\@s_reads1); | |
347 } | |
348 } | |
349 | |
350 sub _generate_consensus { | |
351 my $s_reads = shift; | |
352 my $N = scalar(@{$s_reads}); | |
353 return $s_reads->[0]{seq} if($N == 1); | |
354 my $fa_name = prepare_reads($s_reads); | |
355 my ($contig_file, $sclip_count) = $assembler->run($fa_name); | |
356 if(!$contig_file || -s $contig_file == 0) { | |
357 system ("rm $fa_name"); system("rm $fa_name*"); | |
358 return; | |
359 } | |
360 my $n = 0; | |
361 my $contig_name; | |
362 foreach my $c (keys %{$sclip_count}) { | |
363 if($sclip_count->{$c} > $n) { | |
364 $n = $sclip_count->{$c}; | |
365 $contig_name = $c; | |
366 } | |
367 } | |
368 return if($N * 8 > $n * 10); | |
369 my $contig_seqs = read_fa_file($contig_file); | |
370 return $contig_seqs->{$contig_name}; | |
371 } | |
372 | |
373 sub _get_reads{ | |
374 my $self = shift; | |
375 my ($sam, $bp) = @_; | |
376 | |
377 my ($chr, $pos) = ($bp->{chr}, $bp->{pos}); | |
378 my ($start, $end) = ($pos, $pos); | |
379 if($bp->{all_pos}) { | |
380 my @tmp = @{$bp->{all_pos}}; | |
381 @tmp = sort {$a <=> $b} @tmp; | |
382 ($start, $end) = ($tmp[0], $tmp[$#tmp]); | |
383 } | |
384 my %reads; | |
385 return if(scalar @{$bp->{reads}} == 0); | |
386 foreach my $r (@{$bp->{reads}}){ | |
387 $reads{$r} = 1; | |
388 } | |
389 | |
390 #my ($sam, $chr, $start, $end, $reads) = | |
391 # ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-READS}) ; | |
392 | |
393 my @rtn; | |
394 my $range = $chr; | |
395 $range = $chr . ":" . $start . "-" . $end; | |
396 my($s, $e) = ($start, $end); | |
397 $sam->fetch($range, sub { | |
398 my $a = shift; | |
399 return unless (exists $reads{$a->qname}); | |
400 return unless ($a->start && $a->end); | |
401 return unless (($a->start >= $start && $a->start <= $end) | |
402 || ($a->end >= $start && $a->end <= $end)); | |
403 $s = $a->start if($s > $a->start); | |
404 $e = $a->end if($e < $a->end); | |
405 my $qscore = $a->qscore; | |
406 delete $reads{$a->qname}; | |
407 push @rtn, { | |
408 name => $a->qname, | |
409 seq => $a->query->dna, | |
410 qual => $qscore, | |
411 }; | |
412 } ); | |
413 # $bp->{range} = [$s, $e]; | |
414 if($self->{type} == INV) { | |
415 if($start == $bp->{left_pos} || $end == $bp->{left_pos}) { | |
416 $bp->{left_range} = [$s, $e]; | |
417 } | |
418 else { | |
419 $bp->{right_range} = [$s, $e]; | |
420 } | |
421 return @rtn; | |
422 } | |
423 if($chr eq $self->{first_bp}{left_chr} && ($start == $self->{first_bp}{left_pos} || $end == $self->{first_bp}{left_pos})){ | |
424 $self->{first_bp}{left_range} = [$s, $e]; | |
425 } | |
426 if($chr eq $self->{first_bp}{right_chr} && ($start == $self->{first_bp}{right_pos} || $end == $self->{first_bp}{right_pos})){ | |
427 $self->{first_bp}{right_range} = [$s, $e]; | |
428 } | |
429 | |
430 return @rtn; | |
431 } | |
432 | |
433 my $start_dir; | |
434 BEGIN { | |
435 $start_dir = getcwd; | |
436 } | |
437 | |
438 sub _cat_genes { | |
439 my ($ort, @genes) = @_; | |
440 my $rtn; | |
441 my @names; | |
442 foreach my $g (@genes) { | |
443 push @names, $g->val->name; | |
444 } | |
445 @names = uniq @names; | |
446 $rtn = join(";", @names); | |
447 $rtn = $rtn . "($ort)" if($rtn); | |
448 return $rtn; | |
449 } | |
450 | |
451 sub get_genes { | |
452 my $self = shift; | |
453 return ['NA', 'NA', 'NA'] unless $gm; | |
454 | |
455 my ($gene1, $gene2); | |
456 my $dist; | |
457 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
458 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort}); | |
459 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr}); | |
460 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos}); | |
461 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1); | |
462 my $range1 = $ort1 eq "+" ? [$pos1 - 5, $pos1] : [$pos1, $pos1 + 5]; | |
463 my $range2 = $ort2 eq "+" ? [$pos2, $pos2 + 5] : [$pos2 - 5, $pos2]; | |
464 | |
465 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1, | |
466 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2); | |
467 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+")); | |
468 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+")); | |
469 | |
470 my ($g_ort1, $g_ort2); | |
471 my @genes; | |
472 @genes = $r_tree1->intersect($range1) if($r_tree1); | |
473 $gene1 = _cat_genes("-", @genes) if(scalar @genes > 0 ); | |
474 undef @genes; | |
475 @genes = $f_tree1->intersect($range1) if($f_tree1); | |
476 if(scalar @genes > 0) { | |
477 my $tmp = _cat_genes("+", @genes); | |
478 if($gene1) { | |
479 $gene1 .= ";" . $tmp; | |
480 } | |
481 else { | |
482 $gene1 = $tmp; | |
483 } | |
484 } | |
485 undef @genes; | |
486 @genes = $r_tree2->intersect($range2) if($r_tree2); | |
487 $gene2 = _cat_genes("-", @genes) if(scalar @genes > 0 ); | |
488 undef @genes; | |
489 @genes = $f_tree2->intersect($range2) if($f_tree2);; | |
490 if(scalar @genes > 0) { | |
491 my $tmp = _cat_genes("+", @genes); | |
492 if($gene2) { | |
493 $gene2 .= ";" . $tmp; | |
494 } | |
495 else { | |
496 $gene2 = $tmp; | |
497 } | |
498 } | |
499 | |
500 $gene1 = 'NA' unless $gene1; | |
501 $gene2 = 'NA' unless $gene2; | |
502 my $type = $self->{type}; | |
503 if( $type == INS or $type == DEL ) { | |
504 ($pos1, $pos2) = ($pos2, $pos1) if($pos1 > $pos2); | |
505 @genes = $r_tree1->intersect([$pos1 - 5, $pos2 + 5]); | |
506 if(scalar @genes >= 2) { | |
507 $dist = scalar(@genes) - 2; | |
508 $dist .= "(-)"; | |
509 } | |
510 @genes = $f_tree1->intersect([$pos1 - 5, $pos2 + 5]); | |
511 if(scalar @genes >= 2) { | |
512 if($dist) { | |
513 $dist .= ":" . (scalar (@genes) - 2); | |
514 } | |
515 else { | |
516 $dist = scalar (@genes) - 2; | |
517 } | |
518 $dist .= "(+)"; | |
519 } | |
520 } | |
521 $dist = 'NA' unless $dist; | |
522 return [$gene1, $gene2, $dist]; | |
523 } | |
524 | |
525 sub to_full_string { | |
526 my $self = shift; | |
527 my $type = $self->{type}; | |
528 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
529 return join("\t", ( $bp1->{left_chr}, $bp1->{left_pos}, $bp1->{left_ort}, | |
530 $bp1->{right_chr}, $bp1->{right_pos}, $bp1->{right_ort}, $bp1->{sc_reads}, | |
531 $bp2->{left_chr}, $bp2->{left_pos}, $bp2->{left_ort}, | |
532 $bp2->{right_chr}, $bp2->{right_pos}, $bp2->{right_ort}, $bp2->{sc_reads}, | |
533 $type2str{$type})); | |
534 } | |
535 | |
536 sub filter { | |
537 my $self = shift; | |
538 print STDERR "SV filter starting....\n"; | |
539 $self->{type} = $self->type; | |
540 $self->set_consensus; | |
541 | |
542 if(!$self->{consensus} || length $self->{consensus} < $read_len * $min_percent_cons_of_read) { | |
543 $self->{error} = "No Consensus or consensus too short"; | |
544 print STDERR "FAILED\n"; | |
545 return; | |
546 } | |
547 if($self->{type} == INV && (!$self->{consensus2} || length $self->{consensus2} < $read_len * $min_percent_cons_of_read)) { | |
548 $self->{error} = "No Consensus or consensus too short"; | |
549 print STDERR "FAILED\n"; | |
550 return; | |
551 } | |
552 | |
553 foreach my $f (values(%default_filters) ){ | |
554 if(! $f->($self)){ | |
555 print STDERR "FAILED\n"; | |
556 return; | |
557 } | |
558 } | |
559 print STDERR "PASSED\n"; | |
560 return 1; | |
561 } | |
562 | |
563 sub _mapping_quality_filter { | |
564 my $self = shift; | |
565 if($self->{type} == INV) { | |
566 my $tmp1 = $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1); | |
567 my $tmp2 = $self->_mapping_quality($self->{second_bp}, $self->{consensus2}, 2); | |
568 return $tmp1 && $tmp2; | |
569 } | |
570 return $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1); | |
571 } | |
572 | |
573 sub _mapping_quality { | |
574 my $self = shift; | |
575 my $bp = shift; | |
576 my $con = shift; | |
577 my $which = shift; | |
578 $which = "" unless $which == 2; | |
579 my $l = length($con); | |
580 print STDERR "Mapping quality filter ... "; | |
581 | |
582 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
583 print $QUERY ">query\n", $con, "\n"; | |
584 close $QUERY; | |
585 | |
586 my ($range1, $range2) = $self->_find_bp_ref_range($bp, $con); | |
587 my ($s1, $e1, $t_s1, $t_e1) = $self->_map_consensus($range1); | |
588 return unless $e1; | |
589 my ($chr, $s, $e) = split /[:|-]/, $range2; | |
590 my $offset = 0; | |
591 if($s < $t_e1 && $e > $t_s1) { #overlap | |
592 my $tmp = substr $con, 0, $s; | |
593 if($s1 < $l - $e1) { | |
594 $offset = $e1; | |
595 $tmp = substr $con, $e1 + 1; | |
596 } | |
597 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
598 print $QUERY ">query\n", $tmp, "\n"; | |
599 close $QUERY; | |
600 } | |
601 my ($s2, $e2, $t_s2, $t_e2) = $self->_map_consensus($range2); | |
602 return unless $e2; | |
603 $s2 += $offset; | |
604 $e2 += $offset; | |
605 if($s1 < $s2) { | |
606 if($e1 < $s2) { | |
607 $self->{"insert" . $which} = substr($con, $e1, $s2 - $e1); | |
608 } | |
609 $self->{"c_start" . $which} = $s1; | |
610 $self->{"t_start" . $which} = $t_s1; | |
611 $self->{"start_chr" . $which} = $bp->{left_chr}; | |
612 $self->{"c_end" . $which} = $e2; | |
613 $self->{"t_end" . $which} = $t_e2; | |
614 $self->{"end_chr". $which} = $bp->{right_chr}; | |
615 $s1 = 1; | |
616 ($e1, $s2) = ($s2, $e1); | |
617 $e2 = $l; | |
618 } | |
619 else { | |
620 if($e2 < $s1) { | |
621 $self->{"insert" . $which} = substr($con, $e2, $s1 - $e2); | |
622 } | |
623 $self->{"c_start" . $which} = $s2; | |
624 $self->{"t_start" . $which} = $t_s2; | |
625 $self->{"start_chr" . $which} = $bp->{right_chr}; | |
626 $self->{"c_end" . $which} = $e1; | |
627 $self->{"t_end" . $which} = $t_e1; | |
628 $self->{"end_chr" . $which } = $bp->{left_chr}; | |
629 $s2 = 1; | |
630 ($e2, $s1) = ($s1, $e2); | |
631 $e1 = $l; | |
632 } | |
633 | |
634 my $n = 1; | |
635 foreach my $r( ($range1, $range2) ) { | |
636 my $r_a = $r eq $range1 ? $range2 : $range1; | |
637 my($chr, $s, $e) = split /[:|-]/, $r; | |
638 my($chr_a, $s_a, $e_a) = split /[:|-]/, $r_a; | |
639 my($s_c, $e_c) = $n == 1 ? ($s1, $e1) : ($s2, $e2); | |
640 $n++; | |
641 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
642 print $QUERY ">query\n", substr($self->{consensus}, $s_c, $e_c - $s_c + 1), "\n"; | |
643 close $QUERY; | |
644 my $hit = $aligner->run(-TARGET => $genome . ":" . $r, -QUERY => "query.fa"); | |
645 my $h = $hit->{query}; | |
646 return unless $h; | |
647 my $hsp = $h->next_hsp; | |
648 my @matches = $hsp->matches; | |
649 $hit = $mapper->run(-QUERY => "query.fa"); | |
650 foreach $h (@{$hit->{query}}) { | |
651 next if($h->{tchr} eq $chr && $h->{tstart} >= $s - $l && $h->{tend} <= $e + $l); # the same one | |
652 return if($h->{matches} >= $matches[0]); #find a better or as good as match | |
653 return if($h->{tchr} eq $chr_a && $h->{matches} - $matches[0] >= -5 && $self->{type} == CTX); | |
654 } | |
655 } | |
656 print STDERR "PASSED\n"; | |
657 return 1; | |
658 } | |
659 | |
660 sub _map_consensus { | |
661 my $self = shift; | |
662 my $range = shift; | |
663 my ($s_c, $e_c, $s_t, $e_t); | |
664 | |
665 my $hits = $aligner->run(-TARGET => $genome . ":" . $range, -QUERY => "query.fa"); | |
666 my ($chr, $s, $e) = split /[:|-]/, $range; | |
667 my $h = $hits->{query}; | |
668 return unless $h; # can't find a good enough match | |
669 my $hsp = $h->next_hsp; | |
670 my @matches = $hsp->matches; | |
671 ($s_c, $e_c, $s_t, $e_t) = ($hsp->start('query'), $hsp->end('query'), | |
672 $hsp->start('hit'), $hsp->end('hit')); | |
673 return if( ($e_c - $s_c + 1) * 0.97 > $matches[0]); # the alignment is not good | |
674 ($s_t, $e_t) = $hsp->strand == 1 ? ($s_t + $s, $e_t + $s) : ($e_t + $s, $s_t + $s); | |
675 return ($s_c, $e_c, $s_t, $e_t); | |
676 } | |
677 | |
678 sub _find_bp_ref_range { | |
679 my $self = shift; | |
680 my $bp = shift; | |
681 my $con = shift; | |
682 | |
683 my $l = int(length($con)/2); | |
684 my $ext = $l * 2; | |
685 $ext = 100000 if($RNASeq); | |
686 if($self->{type} == DEL && abs($bp->{left_pos} - $bp->{right_pos}) < $l ){ | |
687 $l = abs($bp->{left_pos} - $bp->{right_pos}) - 1; | |
688 } | |
689 | |
690 my($range1, $range2); | |
691 my ($chr, $p) = ($bp->{left_chr}, $bp->{left_pos}); | |
692 my $ort = $bp->{left_ort}; | |
693 my ($s, $e); | |
694 if($bp->{left_range} ) { | |
695 $s = $bp->{left_range}->[0] - $l; | |
696 $e = $bp->{left_range}->[1] + $l; | |
697 } | |
698 else { | |
699 $s = $e = $p; | |
700 $s = $ort eq "+" ? $p - $ext : $p - $l; | |
701 $e = $ort eq "+" ? $p + $l : $p + $ext; | |
702 } | |
703 $s = 1 if($s < 1); | |
704 $e = $bp->{left_pos} + $l if($self->{type} == DEL); | |
705 $range1 = $chr . ":" . $s . "-" . $e; | |
706 | |
707 ($chr, $p) = ($bp->{right_chr}, $bp->{right_pos}); | |
708 $ort = $bp->{left_ort}; | |
709 if($bp->{right_range} ) { | |
710 $s = $bp->{right_range}->[0] - $l; | |
711 $e = $bp->{right_range}->[1] + $l; | |
712 } | |
713 else { | |
714 $s = $e = $p; | |
715 $s = $ort eq "-" ? $p - $ext : $p - $l; | |
716 $e = $ort eq "-" ? $p + $l : $p + $ext; | |
717 } | |
718 $s = $bp->{right_pos} - $l if($self->{type} == DEL); | |
719 $s = 1 if($s < 1); | |
720 $range2 = $chr . ":" . $s . "-" . $e; | |
721 | |
722 return ($range1, $range2); | |
723 } | |
724 | |
725 sub _find_ref_range { | |
726 my $self = shift; | |
727 my $l = int(length($self->{consensus})/2); | |
728 my ($range1, $range2); | |
729 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
730 | |
731 my ($chr, $p) = ($bp1->{chr}, $bp1->{pos}); | |
732 my ($s, $e); | |
733 my $ext = $l*2 ; | |
734 $ext = 100000 if($RNASeq); | |
735 if($self->{type} == DEL && abs($bp1->{pos} - $bp2->{pos}) < $l ){ | |
736 $l = abs($bp1->{pos} - $bp2->{pos}) - 1; | |
737 } | |
738 | |
739 my $ort = $bp1->{pos} == $bp1->{left_pos} ? | |
740 ($bp1->{left_ort} eq "+" ? "-" : "+") : $bp1->{right_ort}; | |
741 if($bp1->{range}) { | |
742 $s = $bp1->{range}->[0] - $l; | |
743 $s = 1 if($s < 1); | |
744 $e = $bp1->{range}->[1] + $l; | |
745 } | |
746 else { | |
747 $s = $e = $p; | |
748 $s = $ort eq "+" ? $p - $l : $p - $ext; | |
749 $e = $ort eq "+" ? $p + $ext : $p + $l; | |
750 } | |
751 $s = 1 if($s < 1); | |
752 $e = $bp1->{pos} + $l if($self->{type} == DEL); | |
753 | |
754 $range1 = $chr . ":" . $s . "-" . $e; | |
755 | |
756 ($chr, $p) = ($bp2->{chr}, $bp2->{pos}); | |
757 $ort = $bp2->{pos} == $bp2->{left_pos} ? | |
758 ($bp2->{left_ort} eq '+' ? '-' : '+') : $bp2->{right_ort}; | |
759 if($self->{type} == INV) { | |
760 $ort = $bp1->{pos} == $bp1->{left_pos} ? | |
761 $bp1->{right_ort} : ($bp1->{left_ort} eq '+' ? '-' : '+'); | |
762 } | |
763 | |
764 if($bp2->{range} && $self->{type} != INV) { | |
765 $s = $bp2->{range}->[0] - $l; | |
766 $e = $bp2->{range}->[1] + $l; | |
767 } | |
768 else { | |
769 $s = $e = $p; | |
770 $s = $ort eq "+" ? $p - $l : $p - $ext; | |
771 $e = $ort eq "+" ? $p + $ext : $p + $l; | |
772 } | |
773 $s = $bp2->{pos} - $l if($self->{type} == DEL); | |
774 $s = 1 if($s < 1); | |
775 | |
776 $range2 = $chr . ":" . $s . "-" . $e; | |
777 return($range1, $range2); | |
778 } | |
779 | |
780 my $GAP = -5; | |
781 my $MATCH = 2; | |
782 my $MISMATCH = -5; | |
783 sub _refine_bp { | |
784 my $self = shift; | |
785 my $h = shift; | |
786 my $ref_seq = shift; | |
787 my $con = $self->{consensus}; | |
788 my $hsp = $h->next_hsp; | |
789 my ($i, $j) = ($hsp->start("hit"), $hsp->start("query")); | |
790 if($hsp->strand == -1) { | |
791 $con = rev_comp($con); | |
792 $j = length($con) - $hsp->end("query"); | |
793 } | |
794 | |
795 my ($score, $g, $max_score, $s) = (0, 0, 0); | |
796 my ($current_s_r, $current_s_c) = ($i, $j); | |
797 my ($s_r, $s_c, $e_r, $e_c); | |
798 my $p; | |
799 my ($n_matches, $n_max_matches) = (0, 0); | |
800 my @bl_r = @{$hsp->gap_blocks('hit')}; | |
801 my @bl_c = @{$hsp->gap_blocks('query')}; | |
802 for( my $n = 0; $n < scalar @bl_r; $n++) { | |
803 my $m = $bl_r[$n]->[1]; | |
804 if($p) { #gaps | |
805 if (!$RNASeq || $bl_r[$n]->[0] - $p < 25) { | |
806 $score += $GAP * ($bl_r[$n]->[0] - $p); | |
807 $score = 0 if($score < 0 ) | |
808 } | |
809 $i += $m; | |
810 $j += $bl_c[$n]->[1]; | |
811 } | |
812 | |
813 while($m) { #matches / mismatches | |
814 ($current_s_r, $current_s_c, $n_matches) = ($i, $j, 0) | |
815 if($score == 0); #reset | |
816 | |
817 if(substr($ref_seq, $i, 1) eq substr($con, $j, 1)) { | |
818 $score += $MATCH; | |
819 $n_matches++; | |
820 } | |
821 else { | |
822 $score += $MISMATCH; | |
823 } | |
824 if($score > $max_score) { | |
825 $n_max_matches = $n_matches; | |
826 $max_score = $score; | |
827 ($s_r, $s_c) = ($current_s_r, $current_s_c); | |
828 ($e_r, $e_c) = ($i, $j); | |
829 } | |
830 $n_matches = $score = 0 if($score < 0 ); | |
831 $m--; $i++; $j++; | |
832 } | |
833 $p = $bl_r[$n]->[0] + $bl_r[$n]->[1]; | |
834 } | |
835 return($n_max_matches, $s_r, $e_r, $s_c, $e_c); | |
836 } | |
837 | |
838 sub _mapping_artifact_filter { | |
839 my $self = shift; | |
840 if($self->{type} == INV) { | |
841 return ($self->_mapping_artifact($self->{consensus}) | |
842 || $self->_mapping_artifact($self->{consensus2}) ); | |
843 } | |
844 return $self->_mapping_artifact($self->{consensus}); | |
845 } | |
846 | |
847 sub _mapping_artifact { | |
848 my $self = shift; | |
849 my $con = shift; | |
850 my $l = length $con; | |
851 print STDERR "Mapping artifact filter ... "; | |
852 $self->{error} = "Mapping Artifact"; | |
853 $l = $l - 5; | |
854 my $options = " minIdentity=97 -out=psl -nohead"; | |
855 $options = $options . " -maxIntron=5 " unless $RNASeq; | |
856 my $exp = $l; | |
857 if($self->{type} == DEL && $RNASeq) { | |
858 $exp = $self->{second_bp}{pos} - $self->{first_bp}{pos}; | |
859 $options = $options . " -maxIntron=" . ($exp + 100) if($exp > 75000); | |
860 } | |
861 | |
862 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
863 print $QUERY ">query\n", $con, "\n"; | |
864 close $QUERY; | |
865 my $tmp_mapping = $mapper->run(-QUERY => "query.fa", -OPTIONS => $options); | |
866 system("rm query.fa"); system("rm query.fa*"); | |
867 | |
868 $l = $l + 5; | |
869 if($RNASeq) { | |
870 #can't find a decent mapping for DEL event | |
871 return if($self->{type} == DEL && !$tmp_mapping->{query}); | |
872 foreach my $h(@{$tmp_mapping->{query}}) { | |
873 #next if($h->{tend} - $h->{tstart} < $exp - 10); | |
874 return if ($self->{type} != DEL && $h->{qend} - $h->{qstart} > $l - 10); | |
875 my $chr = $h->{tchr}; | |
876 $chr = "chr" . $chr unless $chr =~ m/^chr/; | |
877 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+")); | |
878 return unless ($f_tree && $r_tree); | |
879 my @f_genes = $f_tree->intersect([$h->{tstart}, $h->{tend}]); | |
880 my @r_genes = $r_tree->intersect([$h->{tstart}, $h->{tend}]); | |
881 return if(scalar @f_genes <= 1 && scalar @r_genes <= 1); | |
882 my @f_tmp_genes; | |
883 my @r_tmp_genes; | |
884 | |
885 foreach my $g (@f_genes) { | |
886 foreach my $block (@{$h->{blocks}}) { | |
887 if($g->val->overlap($block)){ | |
888 push @f_tmp_genes, $g; | |
889 last; | |
890 } | |
891 } | |
892 } | |
893 foreach my $g (@r_genes) { | |
894 foreach my $block (@{$h->{blocks}}) { | |
895 if($g->val->overlap($block)){ | |
896 push @r_tmp_genes, $g; | |
897 last; | |
898 } | |
899 } | |
900 } | |
901 return if(scalar @f_tmp_genes < 1 && scalar @r_tmp_genes < 1); | |
902 } | |
903 } | |
904 else { | |
905 foreach my $h (@{$tmp_mapping->{query}}){ | |
906 return if($h->{tend} - $h->{tstart} < $l + 10 && $h->{qend} - $h->{qstart} > $l - 5); | |
907 } | |
908 } | |
909 $self->{error} = undef; | |
910 return 1; | |
911 } | |
912 | |
913 sub _germline_sclip_filter { | |
914 my $self = shift; | |
915 print STDERR "Germline sclip filter\n"; | |
916 return 1 if(!$sam_g); | |
917 $self->{error} = "Germline Event"; | |
918 my $rtn = _compare_seq($self->{first_bp}) && | |
919 _compare_seq($self->{second_bp}); | |
920 $self->{error} = undef if($rtn); | |
921 return $rtn; | |
922 | |
923 } | |
924 | |
925 sub _compare_seq { | |
926 my $bp = shift; | |
927 my $seq; | |
928 if($bp->{pos} == $bp->{left_pos}) { | |
929 my $seg = $sam_d->segment($bp->{right_chr}, $bp->{right_pos} - | |
930 $germline_seq_width, $bp->{right_pos} + $germline_seq_width); | |
931 $seq = $seg->dna; | |
932 } | |
933 else { | |
934 my $seg = $sam_d->segment($bp->{left_chr}, $bp->{left_pos} - $germline_seq_width, | |
935 $bp->{left_pos} + $germline_seq_width); | |
936 $seq = $seg->dna; | |
937 } | |
938 my $sclip_seq = _get_sclip_seqs($sam_g, $bp->{chr}, $bp->{pos} - $germline_search_width, | |
939 $bp->{pos} + $germline_search_width); | |
940 | |
941 | |
942 open my $TARGET, ">target.fa" or croak "can't open target.fa : $OS_ERROR"; | |
943 print $TARGET ">target\n", $seq, "\n"; | |
944 close $TARGET; | |
945 | |
946 if(keys(%{$sclip_seq})) { | |
947 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
948 foreach my $s (keys(%{$sclip_seq})) { | |
949 print $QUERY ">", $s, "\n", $sclip_seq->{$s}, "\n"; | |
950 } | |
951 close($QUERY); | |
952 my $hits = $aligner->run(-TARGET => "target.fa", -QUERY => 'query.fa'); | |
953 return if(keys(%{$hits})); | |
954 } | |
955 return 1; | |
956 } | |
957 | |
958 sub _type_distance_filter { | |
959 my $self = shift; | |
960 my $type = $self->{type}; | |
961 $self->{error} = "Type Distance"; | |
962 print STDERR "Type distance filter\n"; | |
963 if($type == UNK) { #those events are not sure, always ignore them | |
964 print STDERR $self->to_full_string, "\n"; | |
965 return; | |
966 } | |
967 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
968 return 1 if($bp2->{sc_reads} == 0); | |
969 my ($d1, $d2); | |
970 if($type == INS || $type == DEL) { | |
971 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos}, | |
972 $bp1->{right_pos} - $bp2->{right_pos}); | |
973 } | |
974 if( $type == INV ){ | |
975 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos}, | |
976 $bp2->{right_pos} - $bp1->{right_pos}); | |
977 } | |
978 if( $type == ITX ) { | |
979 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos}, | |
980 $bp2->{left_pos} - $bp1->{right_pos}); | |
981 } | |
982 if($type == CTX ) { | |
983 if($bp1->{left_ort} eq $bp1->{right_ort} || | |
984 $bp1->{sc_reads} == 0 || $bp2->{sc_reads} == 0) { | |
985 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos}, $bp1->{right_pos} - $bp2->{right_pos}) ; | |
986 } | |
987 else { | |
988 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos}, $bp2->{left_pos} - $bp1->{right_pos}); | |
989 } | |
990 } | |
991 print STDERR $self->to_full_string, "\t", $d1, "\t", $d2, "\n"; | |
992 if($d1 <= 1 && $d2 <= 1 || abs($d1 - $d2) <= 5) { | |
993 $self->{error} = undef; | |
994 return 1; | |
995 } | |
996 if(abs($d1) > $max_bp_dist || abs($d2) > $max_bp_dist){ | |
997 print STDERR "Distance fail\n"; | |
998 return; | |
999 } | |
1000 $self->{error} = undef; | |
1001 return 1; # the distance is small enough | |
1002 } | |
1003 | |
1004 sub _germline_indel_filter { | |
1005 my $self = shift; | |
1006 my $type = $self->{type}; | |
1007 print STDERR "Germline INDEL FILTER test\n"; | |
1008 return if(abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) < 40); | |
1009 return 1 if(!$sam_g); | |
1010 return 1 if( $type != INS && $type != DEL); | |
1011 return 1 if( abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) > 50 ); | |
1012 $self->{error} = "Germline INDEL"; | |
1013 my ($start, $end ) = ($self->{first_bp}{left_pos}, $self->{first_bp}{right_pos}); | |
1014 my $indel_len = abs($end - $start); | |
1015 ($start, $end) = ($end, $start) if($start > $end); | |
1016 | |
1017 my $itr = $sam_g->features(-iterator => 1, -seq_id => $self->{first_bp}{chr}, | |
1018 -start => $start, -end => $end); | |
1019 while( my $a = $itr->next_seq ) { | |
1020 next if($type == DEL && $a->cigar_str !~ m/D/); | |
1021 next if($type == INS && $a->cigar_str !~ m/I/); | |
1022 my $cigar_array = $a->cigar_array; | |
1023 my $pos = $a->pos; | |
1024 foreach my $ca (@{$cigar_array}) { | |
1025 if($ca->[0] eq 'M') { | |
1026 $pos += $ca->[1]; | |
1027 next; | |
1028 } | |
1029 if($ca->[0] eq 'I' && $type == INS ){ | |
1030 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20); | |
1031 next; | |
1032 } | |
1033 if($ca->[0] eq 'D' && $type == DEL) { | |
1034 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20); | |
1035 $pos += $ca->[1]; | |
1036 next; | |
1037 } | |
1038 } | |
1039 } | |
1040 $self->{error} = undef; | |
1041 return 1; | |
1042 } | |
1043 | |
1044 sub _tandem_repeat_filter { | |
1045 my $self = shift; | |
1046 my $con = $self->{consensus}; | |
1047 print STDERR "low compexity filter\n"; | |
1048 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR"; | |
1049 print $QUERY ">query\n", $self->{consensus}, "\n"; | |
1050 close $QUERY; | |
1051 | |
1052 system("ptrfinder -seq query.fa -repsize $tr_min_size,$tr_max_size -minrep $tr_min_num > query.fa.rep"); | |
1053 if(-e "query.fa.rep") { | |
1054 open my $REP, "<query.fa.rep" or croak "can't open query.fa.rep:$OS_ERROR"; | |
1055 while( my $line = <$REP>) { | |
1056 chomp $line; | |
1057 my($pattern, $len, $times, $s, $e) = | |
1058 $line =~ m/PATTERN\s(\w+)\sLENGTH\s(\d+)\sTIMES\s(\d+)\sSTART\s(\d+)\sSTOP\s(\d+)\sID/; | |
1059 if($self->{type} == DEL || $self->{type} == INS){ | |
1060 if(abs($self->{first_bp}{left_pos} - $self->{first_bp}{right_pos}) < $tr_max_indel_size) { | |
1061 print STDERR "Tandem repeat mediated INDEL!\n"; | |
1062 close $REP; | |
1063 return; | |
1064 } | |
1065 } | |
1066 else{ | |
1067 if(($s < 5 || length($self->{consensus}) - $e < 5) && $len * $times > 30 ) { | |
1068 print STDERR "Tandem repeat mediated events!\n"; | |
1069 close $REP; | |
1070 return; | |
1071 } | |
1072 } | |
1073 } | |
1074 } | |
1075 return 1; | |
1076 } | |
1077 | |
1078 sub _get_sclip_seqs { | |
1079 my ($sam, $chr, $start, $end) = @_; | |
1080 my %rtn; | |
1081 my $range = $chr . ":" . $start . "-" . $end; | |
1082 | |
1083 $sam->fetch($range, sub { | |
1084 my $a = shift; | |
1085 my $cigar_str = $a->cigar_str; | |
1086 return if($cigar_str !~ /S/); | |
1087 my ($sclip_len, $pos, $seq, $qual); | |
1088 my @cigar_array = @{$a->cigar_array}; | |
1089 if($cigar_array[0]->[0] eq 'S' ) { | |
1090 $sclip_len = $cigar_array[0]->[1]; $pos = $a->start; | |
1091 return if($pos < $start or $pos > $end); # the softclipping position is not in range | |
1092 $seq = substr($a->query->dna, 0, $sclip_len ); | |
1093 $rtn{$a->qname} = $seq if($sclip_len >= 10); | |
1094 } | |
1095 #if($cigar_str =~ m/S(\d+)$/) { | |
1096 if($cigar_array[$#cigar_array]->[0] eq 'S') { | |
1097 $sclip_len = $cigar_array[$#cigar_array]->[1]; | |
1098 $pos = $a->end; | |
1099 return if($pos < $start or $pos > $end); # the softclipping position is not in range | |
1100 $seq = substr($a->qseq, $a->l_qseq - $sclip_len ); | |
1101 $rtn{$a->qname} = $seq if($sclip_len >= 10); | |
1102 } | |
1103 } | |
1104 ); | |
1105 return \%rtn; | |
1106 } | |
1107 | |
1108 sub _RNASeq_strand_filter { | |
1109 my $self = shift; | |
1110 my $type = $self->type; | |
1111 print STDERR "RNASeq strand filter\n"; | |
1112 return 1 unless $gm; | |
1113 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
1114 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort}); | |
1115 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr}); | |
1116 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos}); | |
1117 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1); | |
1118 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1, | |
1119 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2); | |
1120 $tmp1 = "chrM" if($tmp1 eq "chrMT"); | |
1121 $tmp2 = "chrM" if($tmp2 eq "chrMT"); | |
1122 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+")); | |
1123 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+")); | |
1124 my ($g_ort1, $g_ort2); | |
1125 return 1 unless($r_tree1 && $r_tree2 && $f_tree1 && $f_tree2); | |
1126 $g_ort1 = "-" if(scalar($r_tree1->intersect([$pos1 - 5, $pos1])) > 0); | |
1127 $g_ort1 = "+" if(scalar($f_tree1->intersect([$pos1 - 5, $pos1])) > 0); | |
1128 $g_ort2 = "-" if(scalar($r_tree2->intersect([$pos2, $pos2 + 5])) > 0); | |
1129 $g_ort2 = "+" if(scalar($f_tree2->intersect([$pos2, $pos2 + 5])) > 0); | |
1130 return 1 unless($g_ort1 && $g_ort2); | |
1131 return 1 if($g_ort1 eq $ort1 && $g_ort2 eq $ort2); | |
1132 return 1 if($g_ort1 ne $ort1 && $g_ort2 ne $ort2); | |
1133 return; | |
1134 } | |
1135 | |
1136 # INS filter check to make sure that the INS part overlap with any gene | |
1137 # if the INS part only in intron will return as a false positive | |
1138 sub _RNASeq_INS_filter { | |
1139 my $self = shift; | |
1140 my $type = $self->{type}; | |
1141 return 1 if($type != INS); | |
1142 print STDERR "RNAseq INS filter\n"; | |
1143 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
1144 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos}); | |
1145 ($pos1, $pos2) = ($pos2, $pos1) if($pos2 < $pos1); | |
1146 my $chr = $bp1->{chr}; | |
1147 $chr = "chr" . $chr unless $chr =~ m/chr/; | |
1148 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+")); | |
1149 foreach my $tree( ($r_tree, $f_tree) ) { | |
1150 my @tmp; | |
1151 @tmp = $f_tree->intersect([$pos1, $pos2]); | |
1152 foreach my $g (@tmp) { | |
1153 return 1 if($g->val->overlap([$pos1, $pos2])); | |
1154 } | |
1155 } | |
1156 return; | |
1157 } | |
1158 | |
1159 sub _RNASeq_DEL_filter { | |
1160 my $self = shift; | |
1161 my $type = $self->type; | |
1162 return 1 if($type != DEL); | |
1163 print STDERR "RNAseq DEL filter\n"; | |
1164 | |
1165 my ($gene1, $gene2, $dist) = @{$self->get_genes}; | |
1166 return if($gene1 eq $gene2); | |
1167 my ($d_plus, $d_minus) = (0, 0); | |
1168 $d_plus = $1 if($dist =~ m/(\d+)\(\+\)/); | |
1169 $d_minus = $1 if($dist =~ m/(\d+)\(\-\)/); | |
1170 return if($d_plus == 0 && $d_minus == 0); | |
1171 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp}); | |
1172 return if($bp1->{cover} == 0 || $bp2->{cover} == 0); | |
1173 my($left_len, $right_len); | |
1174 if(exists $bp1->{left_len}) { | |
1175 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len}); | |
1176 } | |
1177 else { | |
1178 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq}); | |
1179 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq}); | |
1180 } | |
1181 return if($left_len < 30 || $right_len < 30); | |
1182 return if($bp1->{sc_reads} < 10 || $bp2->{sc_reads} < 10); | |
1183 return unless $bp1->{left_gene}->overlap([$bp1->{pos} - $left_len, $bp1->{pos}]); | |
1184 return unless $bp1->{right_gene}->overlap([$bp2->{pos} + $right_len, $bp2->{pos}]); | |
1185 return 1; | |
1186 } | |
1187 | |
1188 sub get_statistics { | |
1189 my $self = shift; | |
1190 my $half_width = shift; | |
1191 my $sam = $self->sam_d; | |
1192 my @rtn; | |
1193 | |
1194 foreach my $bp ( ($self->{first_bp}, $self->{second_bp})) { | |
1195 my ($chr, $pos ) = ($bp->{chr}, $bp->{pos}); | |
1196 my $range = $chr . ":" . ($pos - $half_width) . "-" . ($pos + $half_width); | |
1197 my ($n_seq, $n_rep, $total_pid) = (0, 0, 0); | |
1198 | |
1199 $sam->fetch($range, sub { | |
1200 my $a = shift; | |
1201 return unless ($a->start && $a->end); | |
1202 return unless ($a->start >= $pos - $half_width && $a->end <= $pos + $half_width); | |
1203 $n_seq++; | |
1204 $total_pid += _cal_pid($a); | |
1205 if($a->has_tag("XT")) { | |
1206 $n_rep++ if($a->aux_get("XT") ne "U"); | |
1207 } | |
1208 } | |
1209 ); | |
1210 if($n_seq == 0) { | |
1211 push @rtn, (0, 0); | |
1212 } | |
1213 else { | |
1214 push @rtn, ($total_pid/$n_seq, $n_rep/$n_seq); | |
1215 } | |
1216 } | |
1217 return @rtn; | |
1218 } | |
1219 | |
1220 sub _cal_pid { | |
1221 my $a = shift; | |
1222 my ($ref, $matches, $query) = $a->padded_alignment; | |
1223 my ($n_match, $n) = (0, 0); | |
1224 for( my $i = 0; $i < length($matches); $i++) { | |
1225 my $c = substr $matches, $i, 1; | |
1226 $n_match++ if($c eq "|"); | |
1227 $n++; | |
1228 } | |
1229 return 0 if($n == 0); | |
1230 return $n_match/$n; | |
1231 } | |
1232 | |
1233 | |
1234 1; | |
1235 __END__; | |
1236 | |
1237 =pod | |
1238 | |
1239 =head1 NAME | |
1240 |