0
|
1 package StructVar;
|
|
2
|
|
3 # $Id: StructVar.pm, v 1.00 2010/09/24 jianmin.wang Exp $
|
|
4
|
|
5 use strict;
|
|
6 use Bio::DB::Sam;
|
|
7 use Carp;
|
|
8 use Data::Dumper;
|
|
9 use English;
|
|
10 use Bio::SearchIO;
|
|
11 use GeneModel;
|
|
12 use Cwd;
|
|
13 use SVExtTools;
|
|
14 use List::MoreUtils qw/ uniq /;
|
|
15 use SVUtil qw( prepare_reads read_fa_file get_sclip_reads rev_comp);
|
|
16 use SCValidator qw(LEFT_CLIP RIGHT_CLIP);
|
|
17
|
|
18 use constant UNK => 0;
|
|
19 use constant CTX => 1;
|
|
20 use constant INV => 2;
|
|
21 use constant INS => 3;
|
|
22 use constant DEL => 4;
|
|
23 use constant ITX => 5;
|
|
24
|
|
25 # type of SVs and thier string
|
|
26 my %type2str = (
|
|
27 0 => 'UNKNOWN', # unknow type, not single event
|
|
28 1 => 'CTX', # cross chromosome translocation
|
|
29 2 => 'INV', # inversion, requires both breakpoints available
|
|
30 3 => 'INS', # (tandem) insertion
|
|
31 4 => 'DEL', # deletion
|
|
32 5 => 'ITX', # within chromosome translocation
|
|
33 );
|
|
34 # class static variables
|
|
35 my $sam_d;
|
|
36 my $sam_g;
|
|
37 my $gm;
|
|
38 my $assembler;
|
|
39 my $aligner;
|
|
40 my $mapper;
|
|
41 my @filters = (\&_germline_sclip_filter, \&_type_distance_filter, \&_germline_indel_filter);
|
|
42 my $RNASeq;
|
|
43 my $read_len;
|
|
44 my $genome;
|
|
45 my $tr_max_indel_size; #tandem repeat mediated indel size
|
|
46 my $tr_min_size;
|
|
47 my $tr_max_size; #tandem repeat max_size of single repeat
|
|
48 my $tr_min_num; #tandem repeat minimum number of occurence
|
|
49
|
|
50 my $max_bp_dist = 15;
|
|
51 my $germline_seq_width = 100;
|
|
52 my $germline_search_width = 50;
|
|
53 my $min_percent_cons_of_read = 0.75;
|
|
54
|
|
55 my %default_filters = (
|
|
56 'germline_sclip' => \&_germline_sclip_filter,
|
|
57 'type_distance' => \&_type_distance_filter,
|
|
58 'germline_indel' => \&_germline_indel_filter,
|
|
59 # 'mapping_artifact' => \&_mapping_artifact_filter,
|
|
60 'mapping_quality' => \&_mapping_quality_filter,
|
|
61 'tandem_repeat' => \&_tandem_repeat_filter,
|
|
62 );
|
|
63
|
|
64 sub new {
|
|
65 my $class = shift;
|
|
66 my %param = @_;
|
|
67 my $self = {};
|
|
68 $self->{first_bp} = {};
|
|
69 $self->{second_bp} = {};
|
|
70 $self->{first_bp} = $param{-FIRST_BP} if($param{-FIRST_BP});
|
|
71 $self->{second_bp} = $param{-SECOND_BP} if($param{-SECOND_BP});
|
|
72 $self->{consensus} = $param{-CONSENSUS} ? $param{-CONSENSUS} : "";
|
|
73 $self->{type} = "";
|
|
74 bless $self, ref($class) || $class;
|
|
75 return $self;
|
|
76 }
|
|
77
|
|
78 sub add_filter {
|
|
79 my $self = shift;
|
|
80 my $name = shift;
|
|
81 my $f = shift;
|
|
82 croak "you must specify a filter" if(!$f);
|
|
83 croak "the filter must be a subroutine" if(ref($f) ne 'CODE');
|
|
84 $default_filters{$name} = $f;
|
|
85 }
|
|
86
|
|
87 sub add_RNASeq_filter {
|
|
88 $default_filters{'RNASeq_strand'} = \&_RNASeq_strand_filter;
|
|
89 $default_filters{'RNASeq_INS'} = \&_RNASeq_INS_filter;
|
|
90 }
|
|
91
|
|
92 sub remove_filter {
|
|
93 my $self = shift;
|
|
94 my $name = shift;
|
|
95 if($default_filters{$name}){
|
|
96 delete $default_filters{$name};
|
|
97 }
|
|
98 else {
|
|
99 print STDERR "Filter $name is not in filter list";
|
|
100 }
|
|
101 }
|
|
102
|
|
103 sub tr_max_indel_size { #tandem repeat mediated indel size
|
|
104 my $self = shift;
|
|
105 my $value = shift;
|
|
106 $tr_max_indel_size = $value if($value);
|
|
107 return $tr_max_indel_size;
|
|
108 }
|
|
109
|
|
110 sub min_percent_cons_of_read {
|
|
111 my $self= shift;
|
|
112 my $value = shift;
|
|
113 $min_percent_cons_of_read = $value if($value);
|
|
114 return $min_percent_cons_of_read;
|
|
115 }
|
|
116
|
|
117 sub tr_min_size {
|
|
118 my $self = shift;
|
|
119 my $value = shift;
|
|
120 $tr_min_size = $value if($value);
|
|
121 return $tr_min_size;
|
|
122 }
|
|
123 sub germline_seq_width {
|
|
124 my $self = shift;
|
|
125 my $value = shift;
|
|
126 $germline_seq_width = $value if($value);
|
|
127 return $germline_seq_width;
|
|
128 }
|
|
129
|
|
130 sub germline_search_width {
|
|
131 my $self = shift;
|
|
132 my $value = shift;
|
|
133 $germline_search_width = $value if($value);
|
|
134 return $germline_search_width;
|
|
135 }
|
|
136
|
|
137 sub max_bp_dist {
|
|
138 my $self = shift;
|
|
139 my $value = shift;
|
|
140 $max_bp_dist = $value if($value);
|
|
141 return $max_bp_dist;
|
|
142 }
|
|
143
|
|
144 sub tr_max_size { #tandem repeat max_size of single repeat
|
|
145 my $self = shift;
|
|
146 my $value = shift;
|
|
147 $tr_max_size = $value if($value);
|
|
148 return $tr_max_size;
|
|
149 }
|
|
150
|
|
151 sub tr_min_num { #tandem repeat minimum number of occurence
|
|
152 my $self = shift;
|
|
153 my $value = shift;
|
|
154 $tr_min_num = $value if($value);
|
|
155 return $tr_min_num;
|
|
156 }
|
|
157 sub genome {
|
|
158 my $self = shift;
|
|
159 my $value = shift;
|
|
160 $genome = $value if($value);
|
|
161 return $genome;
|
|
162 }
|
|
163 sub sam_d {
|
|
164 my $self = shift;
|
|
165 my $value = shift;
|
|
166 $sam_d = $value if($value);
|
|
167 return $sam_d;
|
|
168 }
|
|
169
|
|
170 sub sam_g {
|
|
171 my $self = shift;
|
|
172 my $value = shift;
|
|
173 $sam_g = $value if($value);
|
|
174 return $sam_g;
|
|
175 }
|
|
176
|
|
177 sub assembler {
|
|
178 my $self = shift;
|
|
179 my $value = shift;
|
|
180 $assembler = $value if($value);
|
|
181 return $assembler;
|
|
182 }
|
|
183
|
|
184 sub aligner {
|
|
185 my $self = shift;
|
|
186 my $value = shift;
|
|
187 $aligner = $value if($value);
|
|
188 return $aligner;
|
|
189 }
|
|
190
|
|
191 sub RNASeq {
|
|
192 my $self = shift;
|
|
193 my $value = shift;
|
|
194 $RNASeq = 1 if($value);
|
|
195 return $RNASeq;
|
|
196 }
|
|
197
|
|
198 sub mapper {
|
|
199 my $self = shift;
|
|
200 my $value = shift;
|
|
201 $mapper = $value if($value);
|
|
202 return $mapper;
|
|
203 }
|
|
204
|
|
205 sub gene_model {
|
|
206 my $self = shift;
|
|
207 my $value = shift;
|
|
208 $gm = $value if($value);
|
|
209 return $gm;
|
|
210 }
|
|
211
|
|
212 sub read_len {
|
|
213 my $self = shift;
|
|
214 my $value = shift;
|
|
215 $read_len = $value if($value);
|
|
216 return $read_len;
|
|
217
|
|
218 }
|
|
219 sub first_bp {
|
|
220 my $self = shift;
|
|
221 my %param = @_;
|
|
222 if(%param) {
|
|
223 $self->{first_bp}{left_chr} = $param{-LEFT_CHR};
|
|
224 $self->{first_bp}{left_pos} = $param{-LEFT_POS};
|
|
225 $self->{first_bp}{left_ort} = $param{-LEFT_ORT};
|
|
226 $self->{first_bp}{right_chr} = $param{-RIGHT_CHR};
|
|
227 $self->{first_bp}{right_pos} = $param{-RIGHT_POS};
|
|
228 $self->{first_bp}{right_ort} = $param{-RIGHT_ORT};
|
|
229 $self->{first_bp}{sc_reads} = $param{-SC_READS};
|
|
230 $self->{first_bp}{pos} = $param{-POS};
|
|
231 $self->{first_bp}{cover} = $param{-COVER};
|
|
232 $self->{first_bp}{sc_seq} = $param{-SC_SEQ};
|
|
233 $self->{first_bp}{chr} = $param{-CHR};
|
|
234 }
|
|
235 return $self->{first_bp};
|
|
236 }
|
|
237
|
|
238 sub second_bp {
|
|
239 my $self = shift;
|
|
240 my %param = @_;
|
|
241 if(%param) {
|
|
242 $self->{second_bp}{left_chr} = $param{-LEFT_CHR};
|
|
243 $self->{second_bp}{left_pos} = $param{-LEFT_POS};
|
|
244 $self->{second_bp}{left_ort} = $param{-LEFT_ORT};
|
|
245 $self->{second_bp}{right_chr} = $param{-RIGHT_CHR};
|
|
246 $self->{second_bp}{right_pos} = $param{-RIGHT_POS};
|
|
247 $self->{second_bp}{right_ort} = $param{-RIGHT_ORT};
|
|
248 $self->{second_bp}{sc_reads} = $param{-SC_READS};
|
|
249 $self->{second_bp}{pos} = $param{-POS};
|
|
250 $self->{second_bp}{cover} = $param{-COVER};
|
|
251 $self->{second_bp}{sc_seq} = $param{-SC_SEQ};
|
|
252 $self->{second_bp}{chr} = $param{-CHR};
|
|
253 }
|
|
254 return $self->{second_bp};
|
|
255 }
|
|
256
|
|
257 sub type {
|
|
258 my $self = shift;
|
|
259 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
260 if($bp1->{left_chr} ne $bp1->{right_chr}) {
|
|
261 return CTX if($bp1->{left_ort} eq $bp2->{left_ort} &&
|
|
262 $bp1->{right_ort} eq $bp2->{right_ort});
|
|
263
|
|
264 return UNK;
|
|
265 }
|
|
266 if( $bp1->{left_ort} eq $bp1->{right_ort} &&
|
|
267 $bp2->{left_ort} eq $bp2->{right_ort} ) { # Insertion or deletion
|
|
268 return DEL if( $bp1->{left_pos} <= $bp1->{right_pos} &&
|
|
269 $bp2->{left_pos} <= $bp2->{right_pos});
|
|
270 return INS if( $bp1->{left_pos} >= $bp1->{right_pos} &&
|
|
271 $bp2->{left_pos} >= $bp2->{right_pos});
|
|
272 return UNK;
|
|
273 }
|
|
274 if( $bp1->{left_ort} ne $bp1->{right_ort} &&
|
|
275 $bp2->{left_ort} ne $bp2->{right_ort} ) {
|
|
276 return INV if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '-')
|
|
277 || ($bp2->{left_ort} eq '+' && $bp1->{left_ort} eq '-'));
|
|
278 return ITX if( ($bp1->{left_ort} eq '+' && $bp2->{left_ort} eq '+')
|
|
279 || ($bp1->{left_ort} eq '-' && $bp2->{left_ort} eq '-') );
|
|
280 return UNK;
|
|
281 }
|
|
282 return UNK;
|
|
283 }
|
|
284
|
|
285 sub to_string {
|
|
286 my $self = shift;
|
|
287 my $type = $self->type;
|
|
288 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
289 my ($left_len, $right_len) = (0, 0);
|
|
290 if(exists $bp1->{left_len}) {
|
|
291 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len});
|
|
292 }
|
|
293 else {
|
|
294 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq});
|
|
295 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq});
|
|
296 }
|
|
297 if($type == INV && $bp1->{pos} > $bp2->{pos}) {
|
|
298 ($bp1, $bp2) = ($bp2, $bp1);
|
|
299 ($left_len, $right_len) = ($right_len, $left_len);
|
|
300 }
|
|
301 my ($cover1, $cover2) = ($bp1->{cover}, $bp2->{cover});
|
|
302
|
|
303 my ($chrA, $chrB, $ortA, $ortB) = ($bp1->{left_chr}, $bp1->{right_chr},
|
|
304 $bp1->{left_ort}, $bp1->{right_ort});
|
|
305 my ($posA, $posB, $countA, $countB) = ($bp1->{pos}, $bp2->{pos}, $bp1->{sc_reads}, $bp2->{sc_reads});
|
|
306 if($bp1->{chr} eq $bp1->{right_chr} && $bp1->{pos} == $bp1->{right_pos}) {
|
|
307 $posA = $bp2->{pos};
|
|
308 $posB = $bp1->{pos};
|
|
309 $countA = $bp2->{sc_reads};
|
|
310 $countB = $bp1->{sc_reads};
|
|
311 ($cover1, $cover2) = ($cover2, $cover1);
|
|
312 ($left_len, $right_len) = ($right_len, $left_len);
|
|
313 }
|
|
314
|
|
315 my $rtn = join("\t", ($chrA, $posA, $ortA, $countA, $chrB, $posB, $ortB, $countB,
|
|
316 $type2str{$type}, $cover1, $cover2, $left_len, $right_len, $self->get_statistics(100),
|
|
317 $self->{c_start}, $self->{start_chr}, $self->{t_start},
|
|
318 $self->{c_end}, $self->{end_chr}, $self->{t_end}, $self->{consensus}));
|
|
319 if($self->{type} == INV) {
|
|
320 $rtn = join("\t", ($rtn, $self->{c_start2}, $self->{start_chr2}, $self->{t_start2},
|
|
321 $self->{c_end2}, $self->{end_chr2}, $self->{t_end}, $self->{consensus2}));
|
|
322 }
|
|
323 return $rtn;
|
|
324 }
|
|
325
|
|
326 sub set_consensus {
|
|
327 my $self = shift;
|
|
328 my ($validator, $paired ) = @_;
|
|
329 if(!$assembler) {
|
|
330 print STDERR "No assembler set, no consensus generated\n";
|
|
331 return;
|
|
332 }
|
|
333 my %all_reads;
|
|
334 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
335 my $N = 0;
|
|
336
|
|
337 my @s_reads1 = $self->_get_reads($sam_d, $bp1);
|
|
338 my @s_reads2 = $self->_get_reads($sam_d, $bp2);
|
|
339
|
|
340 if($self->{type} == INV) {
|
|
341 $self->{consensus} = _generate_consensus(\@s_reads1);
|
|
342 $self->{consensus2} = _generate_consensus(\@s_reads2);
|
|
343 }
|
|
344 else {
|
|
345 push @s_reads1, @s_reads2;
|
|
346 $self->{consensus} = _generate_consensus(\@s_reads1);
|
|
347 }
|
|
348 }
|
|
349
|
|
350 sub _generate_consensus {
|
|
351 my $s_reads = shift;
|
|
352 my $N = scalar(@{$s_reads});
|
|
353 return $s_reads->[0]{seq} if($N == 1);
|
|
354 my $fa_name = prepare_reads($s_reads);
|
|
355 my ($contig_file, $sclip_count) = $assembler->run($fa_name);
|
|
356 if(!$contig_file || -s $contig_file == 0) {
|
|
357 system ("rm $fa_name"); system("rm $fa_name*");
|
|
358 return;
|
|
359 }
|
|
360 my $n = 0;
|
|
361 my $contig_name;
|
|
362 foreach my $c (keys %{$sclip_count}) {
|
|
363 if($sclip_count->{$c} > $n) {
|
|
364 $n = $sclip_count->{$c};
|
|
365 $contig_name = $c;
|
|
366 }
|
|
367 }
|
|
368 return if($N * 8 > $n * 10);
|
|
369 my $contig_seqs = read_fa_file($contig_file);
|
|
370 return $contig_seqs->{$contig_name};
|
|
371 }
|
|
372
|
|
373 sub _get_reads{
|
|
374 my $self = shift;
|
|
375 my ($sam, $bp) = @_;
|
|
376
|
|
377 my ($chr, $pos) = ($bp->{chr}, $bp->{pos});
|
|
378 my ($start, $end) = ($pos, $pos);
|
|
379 if($bp->{all_pos}) {
|
|
380 my @tmp = @{$bp->{all_pos}};
|
|
381 @tmp = sort {$a <=> $b} @tmp;
|
|
382 ($start, $end) = ($tmp[0], $tmp[$#tmp]);
|
|
383 }
|
|
384 my %reads;
|
|
385 return if(scalar @{$bp->{reads}} == 0);
|
|
386 foreach my $r (@{$bp->{reads}}){
|
|
387 $reads{$r} = 1;
|
|
388 }
|
|
389
|
|
390 #my ($sam, $chr, $start, $end, $reads) =
|
|
391 # ($args{-SAM}, $args{-CHR}, $args{-START}, $args{-END}, $args{-READS}) ;
|
|
392
|
|
393 my @rtn;
|
|
394 my $range = $chr;
|
|
395 $range = $chr . ":" . $start . "-" . $end;
|
|
396 my($s, $e) = ($start, $end);
|
|
397 $sam->fetch($range, sub {
|
|
398 my $a = shift;
|
|
399 return unless (exists $reads{$a->qname});
|
|
400 return unless ($a->start && $a->end);
|
|
401 return unless (($a->start >= $start && $a->start <= $end)
|
|
402 || ($a->end >= $start && $a->end <= $end));
|
|
403 $s = $a->start if($s > $a->start);
|
|
404 $e = $a->end if($e < $a->end);
|
|
405 my $qscore = $a->qscore;
|
|
406 delete $reads{$a->qname};
|
|
407 push @rtn, {
|
|
408 name => $a->qname,
|
|
409 seq => $a->query->dna,
|
|
410 qual => $qscore,
|
|
411 };
|
|
412 } );
|
|
413 # $bp->{range} = [$s, $e];
|
|
414 if($self->{type} == INV) {
|
|
415 if($start == $bp->{left_pos} || $end == $bp->{left_pos}) {
|
|
416 $bp->{left_range} = [$s, $e];
|
|
417 }
|
|
418 else {
|
|
419 $bp->{right_range} = [$s, $e];
|
|
420 }
|
|
421 return @rtn;
|
|
422 }
|
|
423 if($chr eq $self->{first_bp}{left_chr} && ($start == $self->{first_bp}{left_pos} || $end == $self->{first_bp}{left_pos})){
|
|
424 $self->{first_bp}{left_range} = [$s, $e];
|
|
425 }
|
|
426 if($chr eq $self->{first_bp}{right_chr} && ($start == $self->{first_bp}{right_pos} || $end == $self->{first_bp}{right_pos})){
|
|
427 $self->{first_bp}{right_range} = [$s, $e];
|
|
428 }
|
|
429
|
|
430 return @rtn;
|
|
431 }
|
|
432
|
|
433 my $start_dir;
|
|
434 BEGIN {
|
|
435 $start_dir = getcwd;
|
|
436 }
|
|
437
|
|
438 sub _cat_genes {
|
|
439 my ($ort, @genes) = @_;
|
|
440 my $rtn;
|
|
441 my @names;
|
|
442 foreach my $g (@genes) {
|
|
443 push @names, $g->val->name;
|
|
444 }
|
|
445 @names = uniq @names;
|
|
446 $rtn = join(";", @names);
|
|
447 $rtn = $rtn . "($ort)" if($rtn);
|
|
448 return $rtn;
|
|
449 }
|
|
450
|
|
451 sub get_genes {
|
|
452 my $self = shift;
|
|
453 return ['NA', 'NA', 'NA'] unless $gm;
|
|
454
|
|
455 my ($gene1, $gene2);
|
|
456 my $dist;
|
|
457 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
458 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort});
|
|
459 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr});
|
|
460 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
|
|
461 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1);
|
|
462 my $range1 = $ort1 eq "+" ? [$pos1 - 5, $pos1] : [$pos1, $pos1 + 5];
|
|
463 my $range2 = $ort2 eq "+" ? [$pos2, $pos2 + 5] : [$pos2 - 5, $pos2];
|
|
464
|
|
465 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1,
|
|
466 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2);
|
|
467 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+"));
|
|
468 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+"));
|
|
469
|
|
470 my ($g_ort1, $g_ort2);
|
|
471 my @genes;
|
|
472 @genes = $r_tree1->intersect($range1) if($r_tree1);
|
|
473 $gene1 = _cat_genes("-", @genes) if(scalar @genes > 0 );
|
|
474 undef @genes;
|
|
475 @genes = $f_tree1->intersect($range1) if($f_tree1);
|
|
476 if(scalar @genes > 0) {
|
|
477 my $tmp = _cat_genes("+", @genes);
|
|
478 if($gene1) {
|
|
479 $gene1 .= ";" . $tmp;
|
|
480 }
|
|
481 else {
|
|
482 $gene1 = $tmp;
|
|
483 }
|
|
484 }
|
|
485 undef @genes;
|
|
486 @genes = $r_tree2->intersect($range2) if($r_tree2);
|
|
487 $gene2 = _cat_genes("-", @genes) if(scalar @genes > 0 );
|
|
488 undef @genes;
|
|
489 @genes = $f_tree2->intersect($range2) if($f_tree2);;
|
|
490 if(scalar @genes > 0) {
|
|
491 my $tmp = _cat_genes("+", @genes);
|
|
492 if($gene2) {
|
|
493 $gene2 .= ";" . $tmp;
|
|
494 }
|
|
495 else {
|
|
496 $gene2 = $tmp;
|
|
497 }
|
|
498 }
|
|
499
|
|
500 $gene1 = 'NA' unless $gene1;
|
|
501 $gene2 = 'NA' unless $gene2;
|
|
502 my $type = $self->{type};
|
|
503 if( $type == INS or $type == DEL ) {
|
|
504 ($pos1, $pos2) = ($pos2, $pos1) if($pos1 > $pos2);
|
|
505 @genes = $r_tree1->intersect([$pos1 - 5, $pos2 + 5]);
|
|
506 if(scalar @genes >= 2) {
|
|
507 $dist = scalar(@genes) - 2;
|
|
508 $dist .= "(-)";
|
|
509 }
|
|
510 @genes = $f_tree1->intersect([$pos1 - 5, $pos2 + 5]);
|
|
511 if(scalar @genes >= 2) {
|
|
512 if($dist) {
|
|
513 $dist .= ":" . (scalar (@genes) - 2);
|
|
514 }
|
|
515 else {
|
|
516 $dist = scalar (@genes) - 2;
|
|
517 }
|
|
518 $dist .= "(+)";
|
|
519 }
|
|
520 }
|
|
521 $dist = 'NA' unless $dist;
|
|
522 return [$gene1, $gene2, $dist];
|
|
523 }
|
|
524
|
|
525 sub to_full_string {
|
|
526 my $self = shift;
|
|
527 my $type = $self->{type};
|
|
528 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
529 return join("\t", ( $bp1->{left_chr}, $bp1->{left_pos}, $bp1->{left_ort},
|
|
530 $bp1->{right_chr}, $bp1->{right_pos}, $bp1->{right_ort}, $bp1->{sc_reads},
|
|
531 $bp2->{left_chr}, $bp2->{left_pos}, $bp2->{left_ort},
|
|
532 $bp2->{right_chr}, $bp2->{right_pos}, $bp2->{right_ort}, $bp2->{sc_reads},
|
|
533 $type2str{$type}));
|
|
534 }
|
|
535
|
|
536 sub filter {
|
|
537 my $self = shift;
|
|
538 print STDERR "SV filter starting....\n";
|
|
539 $self->{type} = $self->type;
|
|
540 $self->set_consensus;
|
|
541
|
|
542 if(!$self->{consensus} || length $self->{consensus} < $read_len * $min_percent_cons_of_read) {
|
|
543 $self->{error} = "No Consensus or consensus too short";
|
|
544 print STDERR "FAILED\n";
|
|
545 return;
|
|
546 }
|
|
547 if($self->{type} == INV && (!$self->{consensus2} || length $self->{consensus2} < $read_len * $min_percent_cons_of_read)) {
|
|
548 $self->{error} = "No Consensus or consensus too short";
|
|
549 print STDERR "FAILED\n";
|
|
550 return;
|
|
551 }
|
|
552
|
|
553 foreach my $f (values(%default_filters) ){
|
|
554 if(! $f->($self)){
|
|
555 print STDERR "FAILED\n";
|
|
556 return;
|
|
557 }
|
|
558 }
|
|
559 print STDERR "PASSED\n";
|
|
560 return 1;
|
|
561 }
|
|
562
|
|
563 sub _mapping_quality_filter {
|
|
564 my $self = shift;
|
|
565 if($self->{type} == INV) {
|
|
566 my $tmp1 = $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1);
|
|
567 my $tmp2 = $self->_mapping_quality($self->{second_bp}, $self->{consensus2}, 2);
|
|
568 return $tmp1 && $tmp2;
|
|
569 }
|
|
570 return $self->_mapping_quality($self->{first_bp}, $self->{consensus}, 1);
|
|
571 }
|
|
572
|
|
573 sub _mapping_quality {
|
|
574 my $self = shift;
|
|
575 my $bp = shift;
|
|
576 my $con = shift;
|
|
577 my $which = shift;
|
|
578 $which = "" unless $which == 2;
|
|
579 my $l = length($con);
|
|
580 print STDERR "Mapping quality filter ... ";
|
|
581
|
|
582 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
583 print $QUERY ">query\n", $con, "\n";
|
|
584 close $QUERY;
|
|
585
|
|
586 my ($range1, $range2) = $self->_find_bp_ref_range($bp, $con);
|
|
587 my ($s1, $e1, $t_s1, $t_e1) = $self->_map_consensus($range1);
|
|
588 return unless $e1;
|
|
589 my ($chr, $s, $e) = split /[:|-]/, $range2;
|
|
590 my $offset = 0;
|
|
591 if($s < $t_e1 && $e > $t_s1) { #overlap
|
|
592 my $tmp = substr $con, 0, $s;
|
|
593 if($s1 < $l - $e1) {
|
|
594 $offset = $e1;
|
|
595 $tmp = substr $con, $e1 + 1;
|
|
596 }
|
|
597 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
598 print $QUERY ">query\n", $tmp, "\n";
|
|
599 close $QUERY;
|
|
600 }
|
|
601 my ($s2, $e2, $t_s2, $t_e2) = $self->_map_consensus($range2);
|
|
602 return unless $e2;
|
|
603 $s2 += $offset;
|
|
604 $e2 += $offset;
|
|
605 if($s1 < $s2) {
|
|
606 if($e1 < $s2) {
|
|
607 $self->{"insert" . $which} = substr($con, $e1, $s2 - $e1);
|
|
608 }
|
|
609 $self->{"c_start" . $which} = $s1;
|
|
610 $self->{"t_start" . $which} = $t_s1;
|
|
611 $self->{"start_chr" . $which} = $bp->{left_chr};
|
|
612 $self->{"c_end" . $which} = $e2;
|
|
613 $self->{"t_end" . $which} = $t_e2;
|
|
614 $self->{"end_chr". $which} = $bp->{right_chr};
|
|
615 $s1 = 1;
|
|
616 ($e1, $s2) = ($s2, $e1);
|
|
617 $e2 = $l;
|
|
618 }
|
|
619 else {
|
|
620 if($e2 < $s1) {
|
|
621 $self->{"insert" . $which} = substr($con, $e2, $s1 - $e2);
|
|
622 }
|
|
623 $self->{"c_start" . $which} = $s2;
|
|
624 $self->{"t_start" . $which} = $t_s2;
|
|
625 $self->{"start_chr" . $which} = $bp->{right_chr};
|
|
626 $self->{"c_end" . $which} = $e1;
|
|
627 $self->{"t_end" . $which} = $t_e1;
|
|
628 $self->{"end_chr" . $which } = $bp->{left_chr};
|
|
629 $s2 = 1;
|
|
630 ($e2, $s1) = ($s1, $e2);
|
|
631 $e1 = $l;
|
|
632 }
|
|
633
|
|
634 my $n = 1;
|
|
635 foreach my $r( ($range1, $range2) ) {
|
|
636 my $r_a = $r eq $range1 ? $range2 : $range1;
|
|
637 my($chr, $s, $e) = split /[:|-]/, $r;
|
|
638 my($chr_a, $s_a, $e_a) = split /[:|-]/, $r_a;
|
|
639 my($s_c, $e_c) = $n == 1 ? ($s1, $e1) : ($s2, $e2);
|
|
640 $n++;
|
|
641 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
642 print $QUERY ">query\n", substr($self->{consensus}, $s_c, $e_c - $s_c + 1), "\n";
|
|
643 close $QUERY;
|
|
644 my $hit = $aligner->run(-TARGET => $genome . ":" . $r, -QUERY => "query.fa");
|
|
645 my $h = $hit->{query};
|
|
646 return unless $h;
|
|
647 my $hsp = $h->next_hsp;
|
|
648 my @matches = $hsp->matches;
|
|
649 $hit = $mapper->run(-QUERY => "query.fa");
|
|
650 foreach $h (@{$hit->{query}}) {
|
|
651 next if($h->{tchr} eq $chr && $h->{tstart} >= $s - $l && $h->{tend} <= $e + $l); # the same one
|
|
652 return if($h->{matches} >= $matches[0]); #find a better or as good as match
|
|
653 return if($h->{tchr} eq $chr_a && $h->{matches} - $matches[0] >= -5 && $self->{type} == CTX);
|
|
654 }
|
|
655 }
|
|
656 print STDERR "PASSED\n";
|
|
657 return 1;
|
|
658 }
|
|
659
|
|
660 sub _map_consensus {
|
|
661 my $self = shift;
|
|
662 my $range = shift;
|
|
663 my ($s_c, $e_c, $s_t, $e_t);
|
|
664
|
|
665 my $hits = $aligner->run(-TARGET => $genome . ":" . $range, -QUERY => "query.fa");
|
|
666 my ($chr, $s, $e) = split /[:|-]/, $range;
|
|
667 my $h = $hits->{query};
|
|
668 return unless $h; # can't find a good enough match
|
|
669 my $hsp = $h->next_hsp;
|
|
670 my @matches = $hsp->matches;
|
|
671 ($s_c, $e_c, $s_t, $e_t) = ($hsp->start('query'), $hsp->end('query'),
|
|
672 $hsp->start('hit'), $hsp->end('hit'));
|
|
673 return if( ($e_c - $s_c + 1) * 0.97 > $matches[0]); # the alignment is not good
|
|
674 ($s_t, $e_t) = $hsp->strand == 1 ? ($s_t + $s, $e_t + $s) : ($e_t + $s, $s_t + $s);
|
|
675 return ($s_c, $e_c, $s_t, $e_t);
|
|
676 }
|
|
677
|
|
678 sub _find_bp_ref_range {
|
|
679 my $self = shift;
|
|
680 my $bp = shift;
|
|
681 my $con = shift;
|
|
682
|
|
683 my $l = int(length($con)/2);
|
|
684 my $ext = $l * 2;
|
|
685 $ext = 100000 if($RNASeq);
|
|
686 if($self->{type} == DEL && abs($bp->{left_pos} - $bp->{right_pos}) < $l ){
|
|
687 $l = abs($bp->{left_pos} - $bp->{right_pos}) - 1;
|
|
688 }
|
|
689
|
|
690 my($range1, $range2);
|
|
691 my ($chr, $p) = ($bp->{left_chr}, $bp->{left_pos});
|
|
692 my $ort = $bp->{left_ort};
|
|
693 my ($s, $e);
|
|
694 if($bp->{left_range} ) {
|
|
695 $s = $bp->{left_range}->[0] - $l;
|
|
696 $e = $bp->{left_range}->[1] + $l;
|
|
697 }
|
|
698 else {
|
|
699 $s = $e = $p;
|
|
700 $s = $ort eq "+" ? $p - $ext : $p - $l;
|
|
701 $e = $ort eq "+" ? $p + $l : $p + $ext;
|
|
702 }
|
|
703 $s = 1 if($s < 1);
|
|
704 $e = $bp->{left_pos} + $l if($self->{type} == DEL);
|
|
705 $range1 = $chr . ":" . $s . "-" . $e;
|
|
706
|
|
707 ($chr, $p) = ($bp->{right_chr}, $bp->{right_pos});
|
|
708 $ort = $bp->{left_ort};
|
|
709 if($bp->{right_range} ) {
|
|
710 $s = $bp->{right_range}->[0] - $l;
|
|
711 $e = $bp->{right_range}->[1] + $l;
|
|
712 }
|
|
713 else {
|
|
714 $s = $e = $p;
|
|
715 $s = $ort eq "-" ? $p - $ext : $p - $l;
|
|
716 $e = $ort eq "-" ? $p + $l : $p + $ext;
|
|
717 }
|
|
718 $s = $bp->{right_pos} - $l if($self->{type} == DEL);
|
|
719 $s = 1 if($s < 1);
|
|
720 $range2 = $chr . ":" . $s . "-" . $e;
|
|
721
|
|
722 return ($range1, $range2);
|
|
723 }
|
|
724
|
|
725 sub _find_ref_range {
|
|
726 my $self = shift;
|
|
727 my $l = int(length($self->{consensus})/2);
|
|
728 my ($range1, $range2);
|
|
729 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
730
|
|
731 my ($chr, $p) = ($bp1->{chr}, $bp1->{pos});
|
|
732 my ($s, $e);
|
|
733 my $ext = $l*2 ;
|
|
734 $ext = 100000 if($RNASeq);
|
|
735 if($self->{type} == DEL && abs($bp1->{pos} - $bp2->{pos}) < $l ){
|
|
736 $l = abs($bp1->{pos} - $bp2->{pos}) - 1;
|
|
737 }
|
|
738
|
|
739 my $ort = $bp1->{pos} == $bp1->{left_pos} ?
|
|
740 ($bp1->{left_ort} eq "+" ? "-" : "+") : $bp1->{right_ort};
|
|
741 if($bp1->{range}) {
|
|
742 $s = $bp1->{range}->[0] - $l;
|
|
743 $s = 1 if($s < 1);
|
|
744 $e = $bp1->{range}->[1] + $l;
|
|
745 }
|
|
746 else {
|
|
747 $s = $e = $p;
|
|
748 $s = $ort eq "+" ? $p - $l : $p - $ext;
|
|
749 $e = $ort eq "+" ? $p + $ext : $p + $l;
|
|
750 }
|
|
751 $s = 1 if($s < 1);
|
|
752 $e = $bp1->{pos} + $l if($self->{type} == DEL);
|
|
753
|
|
754 $range1 = $chr . ":" . $s . "-" . $e;
|
|
755
|
|
756 ($chr, $p) = ($bp2->{chr}, $bp2->{pos});
|
|
757 $ort = $bp2->{pos} == $bp2->{left_pos} ?
|
|
758 ($bp2->{left_ort} eq '+' ? '-' : '+') : $bp2->{right_ort};
|
|
759 if($self->{type} == INV) {
|
|
760 $ort = $bp1->{pos} == $bp1->{left_pos} ?
|
|
761 $bp1->{right_ort} : ($bp1->{left_ort} eq '+' ? '-' : '+');
|
|
762 }
|
|
763
|
|
764 if($bp2->{range} && $self->{type} != INV) {
|
|
765 $s = $bp2->{range}->[0] - $l;
|
|
766 $e = $bp2->{range}->[1] + $l;
|
|
767 }
|
|
768 else {
|
|
769 $s = $e = $p;
|
|
770 $s = $ort eq "+" ? $p - $l : $p - $ext;
|
|
771 $e = $ort eq "+" ? $p + $ext : $p + $l;
|
|
772 }
|
|
773 $s = $bp2->{pos} - $l if($self->{type} == DEL);
|
|
774 $s = 1 if($s < 1);
|
|
775
|
|
776 $range2 = $chr . ":" . $s . "-" . $e;
|
|
777 return($range1, $range2);
|
|
778 }
|
|
779
|
|
780 my $GAP = -5;
|
|
781 my $MATCH = 2;
|
|
782 my $MISMATCH = -5;
|
|
783 sub _refine_bp {
|
|
784 my $self = shift;
|
|
785 my $h = shift;
|
|
786 my $ref_seq = shift;
|
|
787 my $con = $self->{consensus};
|
|
788 my $hsp = $h->next_hsp;
|
|
789 my ($i, $j) = ($hsp->start("hit"), $hsp->start("query"));
|
|
790 if($hsp->strand == -1) {
|
|
791 $con = rev_comp($con);
|
|
792 $j = length($con) - $hsp->end("query");
|
|
793 }
|
|
794
|
|
795 my ($score, $g, $max_score, $s) = (0, 0, 0);
|
|
796 my ($current_s_r, $current_s_c) = ($i, $j);
|
|
797 my ($s_r, $s_c, $e_r, $e_c);
|
|
798 my $p;
|
|
799 my ($n_matches, $n_max_matches) = (0, 0);
|
|
800 my @bl_r = @{$hsp->gap_blocks('hit')};
|
|
801 my @bl_c = @{$hsp->gap_blocks('query')};
|
|
802 for( my $n = 0; $n < scalar @bl_r; $n++) {
|
|
803 my $m = $bl_r[$n]->[1];
|
|
804 if($p) { #gaps
|
|
805 if (!$RNASeq || $bl_r[$n]->[0] - $p < 25) {
|
|
806 $score += $GAP * ($bl_r[$n]->[0] - $p);
|
|
807 $score = 0 if($score < 0 )
|
|
808 }
|
|
809 $i += $m;
|
|
810 $j += $bl_c[$n]->[1];
|
|
811 }
|
|
812
|
|
813 while($m) { #matches / mismatches
|
|
814 ($current_s_r, $current_s_c, $n_matches) = ($i, $j, 0)
|
|
815 if($score == 0); #reset
|
|
816
|
|
817 if(substr($ref_seq, $i, 1) eq substr($con, $j, 1)) {
|
|
818 $score += $MATCH;
|
|
819 $n_matches++;
|
|
820 }
|
|
821 else {
|
|
822 $score += $MISMATCH;
|
|
823 }
|
|
824 if($score > $max_score) {
|
|
825 $n_max_matches = $n_matches;
|
|
826 $max_score = $score;
|
|
827 ($s_r, $s_c) = ($current_s_r, $current_s_c);
|
|
828 ($e_r, $e_c) = ($i, $j);
|
|
829 }
|
|
830 $n_matches = $score = 0 if($score < 0 );
|
|
831 $m--; $i++; $j++;
|
|
832 }
|
|
833 $p = $bl_r[$n]->[0] + $bl_r[$n]->[1];
|
|
834 }
|
|
835 return($n_max_matches, $s_r, $e_r, $s_c, $e_c);
|
|
836 }
|
|
837
|
|
838 sub _mapping_artifact_filter {
|
|
839 my $self = shift;
|
|
840 if($self->{type} == INV) {
|
|
841 return ($self->_mapping_artifact($self->{consensus})
|
|
842 || $self->_mapping_artifact($self->{consensus2}) );
|
|
843 }
|
|
844 return $self->_mapping_artifact($self->{consensus});
|
|
845 }
|
|
846
|
|
847 sub _mapping_artifact {
|
|
848 my $self = shift;
|
|
849 my $con = shift;
|
|
850 my $l = length $con;
|
|
851 print STDERR "Mapping artifact filter ... ";
|
|
852 $self->{error} = "Mapping Artifact";
|
|
853 $l = $l - 5;
|
|
854 my $options = " minIdentity=97 -out=psl -nohead";
|
|
855 $options = $options . " -maxIntron=5 " unless $RNASeq;
|
|
856 my $exp = $l;
|
|
857 if($self->{type} == DEL && $RNASeq) {
|
|
858 $exp = $self->{second_bp}{pos} - $self->{first_bp}{pos};
|
|
859 $options = $options . " -maxIntron=" . ($exp + 100) if($exp > 75000);
|
|
860 }
|
|
861
|
|
862 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
863 print $QUERY ">query\n", $con, "\n";
|
|
864 close $QUERY;
|
|
865 my $tmp_mapping = $mapper->run(-QUERY => "query.fa", -OPTIONS => $options);
|
|
866 system("rm query.fa"); system("rm query.fa*");
|
|
867
|
|
868 $l = $l + 5;
|
|
869 if($RNASeq) {
|
|
870 #can't find a decent mapping for DEL event
|
|
871 return if($self->{type} == DEL && !$tmp_mapping->{query});
|
|
872 foreach my $h(@{$tmp_mapping->{query}}) {
|
|
873 #next if($h->{tend} - $h->{tstart} < $exp - 10);
|
|
874 return if ($self->{type} != DEL && $h->{qend} - $h->{qstart} > $l - 10);
|
|
875 my $chr = $h->{tchr};
|
|
876 $chr = "chr" . $chr unless $chr =~ m/^chr/;
|
|
877 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+"));
|
|
878 return unless ($f_tree && $r_tree);
|
|
879 my @f_genes = $f_tree->intersect([$h->{tstart}, $h->{tend}]);
|
|
880 my @r_genes = $r_tree->intersect([$h->{tstart}, $h->{tend}]);
|
|
881 return if(scalar @f_genes <= 1 && scalar @r_genes <= 1);
|
|
882 my @f_tmp_genes;
|
|
883 my @r_tmp_genes;
|
|
884
|
|
885 foreach my $g (@f_genes) {
|
|
886 foreach my $block (@{$h->{blocks}}) {
|
|
887 if($g->val->overlap($block)){
|
|
888 push @f_tmp_genes, $g;
|
|
889 last;
|
|
890 }
|
|
891 }
|
|
892 }
|
|
893 foreach my $g (@r_genes) {
|
|
894 foreach my $block (@{$h->{blocks}}) {
|
|
895 if($g->val->overlap($block)){
|
|
896 push @r_tmp_genes, $g;
|
|
897 last;
|
|
898 }
|
|
899 }
|
|
900 }
|
|
901 return if(scalar @f_tmp_genes < 1 && scalar @r_tmp_genes < 1);
|
|
902 }
|
|
903 }
|
|
904 else {
|
|
905 foreach my $h (@{$tmp_mapping->{query}}){
|
|
906 return if($h->{tend} - $h->{tstart} < $l + 10 && $h->{qend} - $h->{qstart} > $l - 5);
|
|
907 }
|
|
908 }
|
|
909 $self->{error} = undef;
|
|
910 return 1;
|
|
911 }
|
|
912
|
|
913 sub _germline_sclip_filter {
|
|
914 my $self = shift;
|
|
915 print STDERR "Germline sclip filter\n";
|
|
916 return 1 if(!$sam_g);
|
|
917 $self->{error} = "Germline Event";
|
|
918 my $rtn = _compare_seq($self->{first_bp}) &&
|
|
919 _compare_seq($self->{second_bp});
|
|
920 $self->{error} = undef if($rtn);
|
|
921 return $rtn;
|
|
922
|
|
923 }
|
|
924
|
|
925 sub _compare_seq {
|
|
926 my $bp = shift;
|
|
927 my $seq;
|
|
928 if($bp->{pos} == $bp->{left_pos}) {
|
|
929 my $seg = $sam_d->segment($bp->{right_chr}, $bp->{right_pos} -
|
|
930 $germline_seq_width, $bp->{right_pos} + $germline_seq_width);
|
|
931 $seq = $seg->dna;
|
|
932 }
|
|
933 else {
|
|
934 my $seg = $sam_d->segment($bp->{left_chr}, $bp->{left_pos} - $germline_seq_width,
|
|
935 $bp->{left_pos} + $germline_seq_width);
|
|
936 $seq = $seg->dna;
|
|
937 }
|
|
938 my $sclip_seq = _get_sclip_seqs($sam_g, $bp->{chr}, $bp->{pos} - $germline_search_width,
|
|
939 $bp->{pos} + $germline_search_width);
|
|
940
|
|
941
|
|
942 open my $TARGET, ">target.fa" or croak "can't open target.fa : $OS_ERROR";
|
|
943 print $TARGET ">target\n", $seq, "\n";
|
|
944 close $TARGET;
|
|
945
|
|
946 if(keys(%{$sclip_seq})) {
|
|
947 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
948 foreach my $s (keys(%{$sclip_seq})) {
|
|
949 print $QUERY ">", $s, "\n", $sclip_seq->{$s}, "\n";
|
|
950 }
|
|
951 close($QUERY);
|
|
952 my $hits = $aligner->run(-TARGET => "target.fa", -QUERY => 'query.fa');
|
|
953 return if(keys(%{$hits}));
|
|
954 }
|
|
955 return 1;
|
|
956 }
|
|
957
|
|
958 sub _type_distance_filter {
|
|
959 my $self = shift;
|
|
960 my $type = $self->{type};
|
|
961 $self->{error} = "Type Distance";
|
|
962 print STDERR "Type distance filter\n";
|
|
963 if($type == UNK) { #those events are not sure, always ignore them
|
|
964 print STDERR $self->to_full_string, "\n";
|
|
965 return;
|
|
966 }
|
|
967 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
968 return 1 if($bp2->{sc_reads} == 0);
|
|
969 my ($d1, $d2);
|
|
970 if($type == INS || $type == DEL) {
|
|
971 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos},
|
|
972 $bp1->{right_pos} - $bp2->{right_pos});
|
|
973 }
|
|
974 if( $type == INV ){
|
|
975 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos},
|
|
976 $bp2->{right_pos} - $bp1->{right_pos});
|
|
977 }
|
|
978 if( $type == ITX ) {
|
|
979 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos},
|
|
980 $bp2->{left_pos} - $bp1->{right_pos});
|
|
981 }
|
|
982 if($type == CTX ) {
|
|
983 if($bp1->{left_ort} eq $bp1->{right_ort} ||
|
|
984 $bp1->{sc_reads} == 0 || $bp2->{sc_reads} == 0) {
|
|
985 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{left_pos}, $bp1->{right_pos} - $bp2->{right_pos}) ;
|
|
986 }
|
|
987 else {
|
|
988 ($d1, $d2) = ($bp1->{left_pos} - $bp2->{right_pos}, $bp2->{left_pos} - $bp1->{right_pos});
|
|
989 }
|
|
990 }
|
|
991 print STDERR $self->to_full_string, "\t", $d1, "\t", $d2, "\n";
|
|
992 if($d1 <= 1 && $d2 <= 1 || abs($d1 - $d2) <= 5) {
|
|
993 $self->{error} = undef;
|
|
994 return 1;
|
|
995 }
|
|
996 if(abs($d1) > $max_bp_dist || abs($d2) > $max_bp_dist){
|
|
997 print STDERR "Distance fail\n";
|
|
998 return;
|
|
999 }
|
|
1000 $self->{error} = undef;
|
|
1001 return 1; # the distance is small enough
|
|
1002 }
|
|
1003
|
|
1004 sub _germline_indel_filter {
|
|
1005 my $self = shift;
|
|
1006 my $type = $self->{type};
|
|
1007 print STDERR "Germline INDEL FILTER test\n";
|
|
1008 return if(abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) < 40);
|
|
1009 return 1 if(!$sam_g);
|
|
1010 return 1 if( $type != INS && $type != DEL);
|
|
1011 return 1 if( abs($self->{first_bp}{pos} - $self->{second_bp}{pos}) > 50 );
|
|
1012 $self->{error} = "Germline INDEL";
|
|
1013 my ($start, $end ) = ($self->{first_bp}{left_pos}, $self->{first_bp}{right_pos});
|
|
1014 my $indel_len = abs($end - $start);
|
|
1015 ($start, $end) = ($end, $start) if($start > $end);
|
|
1016
|
|
1017 my $itr = $sam_g->features(-iterator => 1, -seq_id => $self->{first_bp}{chr},
|
|
1018 -start => $start, -end => $end);
|
|
1019 while( my $a = $itr->next_seq ) {
|
|
1020 next if($type == DEL && $a->cigar_str !~ m/D/);
|
|
1021 next if($type == INS && $a->cigar_str !~ m/I/);
|
|
1022 my $cigar_array = $a->cigar_array;
|
|
1023 my $pos = $a->pos;
|
|
1024 foreach my $ca (@{$cigar_array}) {
|
|
1025 if($ca->[0] eq 'M') {
|
|
1026 $pos += $ca->[1];
|
|
1027 next;
|
|
1028 }
|
|
1029 if($ca->[0] eq 'I' && $type == INS ){
|
|
1030 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20);
|
|
1031 next;
|
|
1032 }
|
|
1033 if($ca->[0] eq 'D' && $type == DEL) {
|
|
1034 return if(abs($ca->[1] - $indel_len) <= 20 && abs($pos - $start) <= 20);
|
|
1035 $pos += $ca->[1];
|
|
1036 next;
|
|
1037 }
|
|
1038 }
|
|
1039 }
|
|
1040 $self->{error} = undef;
|
|
1041 return 1;
|
|
1042 }
|
|
1043
|
|
1044 sub _tandem_repeat_filter {
|
|
1045 my $self = shift;
|
|
1046 my $con = $self->{consensus};
|
|
1047 print STDERR "low compexity filter\n";
|
|
1048 open my $QUERY, ">query.fa" or croak "can't open query.fa : $OS_ERROR";
|
|
1049 print $QUERY ">query\n", $self->{consensus}, "\n";
|
|
1050 close $QUERY;
|
|
1051
|
|
1052 system("ptrfinder -seq query.fa -repsize $tr_min_size,$tr_max_size -minrep $tr_min_num > query.fa.rep");
|
|
1053 if(-e "query.fa.rep") {
|
|
1054 open my $REP, "<query.fa.rep" or croak "can't open query.fa.rep:$OS_ERROR";
|
|
1055 while( my $line = <$REP>) {
|
|
1056 chomp $line;
|
|
1057 my($pattern, $len, $times, $s, $e) =
|
|
1058 $line =~ m/PATTERN\s(\w+)\sLENGTH\s(\d+)\sTIMES\s(\d+)\sSTART\s(\d+)\sSTOP\s(\d+)\sID/;
|
|
1059 if($self->{type} == DEL || $self->{type} == INS){
|
|
1060 if(abs($self->{first_bp}{left_pos} - $self->{first_bp}{right_pos}) < $tr_max_indel_size) {
|
|
1061 print STDERR "Tandem repeat mediated INDEL!\n";
|
|
1062 close $REP;
|
|
1063 return;
|
|
1064 }
|
|
1065 }
|
|
1066 else{
|
|
1067 if(($s < 5 || length($self->{consensus}) - $e < 5) && $len * $times > 30 ) {
|
|
1068 print STDERR "Tandem repeat mediated events!\n";
|
|
1069 close $REP;
|
|
1070 return;
|
|
1071 }
|
|
1072 }
|
|
1073 }
|
|
1074 }
|
|
1075 return 1;
|
|
1076 }
|
|
1077
|
|
1078 sub _get_sclip_seqs {
|
|
1079 my ($sam, $chr, $start, $end) = @_;
|
|
1080 my %rtn;
|
|
1081 my $range = $chr . ":" . $start . "-" . $end;
|
|
1082
|
|
1083 $sam->fetch($range, sub {
|
|
1084 my $a = shift;
|
|
1085 my $cigar_str = $a->cigar_str;
|
|
1086 return if($cigar_str !~ /S/);
|
|
1087 my ($sclip_len, $pos, $seq, $qual);
|
|
1088 my @cigar_array = @{$a->cigar_array};
|
|
1089 if($cigar_array[0]->[0] eq 'S' ) {
|
|
1090 $sclip_len = $cigar_array[0]->[1]; $pos = $a->start;
|
|
1091 return if($pos < $start or $pos > $end); # the softclipping position is not in range
|
|
1092 $seq = substr($a->query->dna, 0, $sclip_len );
|
|
1093 $rtn{$a->qname} = $seq if($sclip_len >= 10);
|
|
1094 }
|
|
1095 #if($cigar_str =~ m/S(\d+)$/) {
|
|
1096 if($cigar_array[$#cigar_array]->[0] eq 'S') {
|
|
1097 $sclip_len = $cigar_array[$#cigar_array]->[1];
|
|
1098 $pos = $a->end;
|
|
1099 return if($pos < $start or $pos > $end); # the softclipping position is not in range
|
|
1100 $seq = substr($a->qseq, $a->l_qseq - $sclip_len );
|
|
1101 $rtn{$a->qname} = $seq if($sclip_len >= 10);
|
|
1102 }
|
|
1103 }
|
|
1104 );
|
|
1105 return \%rtn;
|
|
1106 }
|
|
1107
|
|
1108 sub _RNASeq_strand_filter {
|
|
1109 my $self = shift;
|
|
1110 my $type = $self->type;
|
|
1111 print STDERR "RNASeq strand filter\n";
|
|
1112 return 1 unless $gm;
|
|
1113 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
1114 my ($ort1, $ort2) = ($bp1->{left_ort}, $bp1->{right_ort});
|
|
1115 my ($chr1, $chr2) = ($bp1->{left_chr}, $bp1->{right_chr});
|
|
1116 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
|
|
1117 ($pos1, $pos2) = ($pos2, $pos1) if($bp1->{right_pos} == $pos1);
|
|
1118 my ($tmp1, $tmp2) = ($chr1 =~ m/chr/ ? $chr1 : "chr" . $chr1,
|
|
1119 $chr2 =~ m/chr/ ? $chr2 : "chr" . $chr2);
|
|
1120 $tmp1 = "chrM" if($tmp1 eq "chrMT");
|
|
1121 $tmp2 = "chrM" if($tmp2 eq "chrMT");
|
|
1122 my ($r_tree1, $f_tree1) = ($gm->sub_model($tmp1, "-"), $gm->sub_model($tmp1, "+"));
|
|
1123 my ($r_tree2, $f_tree2) = ($gm->sub_model($tmp2, "-"), $gm->sub_model($tmp2, "+"));
|
|
1124 my ($g_ort1, $g_ort2);
|
|
1125 return 1 unless($r_tree1 && $r_tree2 && $f_tree1 && $f_tree2);
|
|
1126 $g_ort1 = "-" if(scalar($r_tree1->intersect([$pos1 - 5, $pos1])) > 0);
|
|
1127 $g_ort1 = "+" if(scalar($f_tree1->intersect([$pos1 - 5, $pos1])) > 0);
|
|
1128 $g_ort2 = "-" if(scalar($r_tree2->intersect([$pos2, $pos2 + 5])) > 0);
|
|
1129 $g_ort2 = "+" if(scalar($f_tree2->intersect([$pos2, $pos2 + 5])) > 0);
|
|
1130 return 1 unless($g_ort1 && $g_ort2);
|
|
1131 return 1 if($g_ort1 eq $ort1 && $g_ort2 eq $ort2);
|
|
1132 return 1 if($g_ort1 ne $ort1 && $g_ort2 ne $ort2);
|
|
1133 return;
|
|
1134 }
|
|
1135
|
|
1136 # INS filter check to make sure that the INS part overlap with any gene
|
|
1137 # if the INS part only in intron will return as a false positive
|
|
1138 sub _RNASeq_INS_filter {
|
|
1139 my $self = shift;
|
|
1140 my $type = $self->{type};
|
|
1141 return 1 if($type != INS);
|
|
1142 print STDERR "RNAseq INS filter\n";
|
|
1143 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
1144 my ($pos1, $pos2) = ($bp1->{pos}, $bp2->{pos});
|
|
1145 ($pos1, $pos2) = ($pos2, $pos1) if($pos2 < $pos1);
|
|
1146 my $chr = $bp1->{chr};
|
|
1147 $chr = "chr" . $chr unless $chr =~ m/chr/;
|
|
1148 my ($r_tree, $f_tree) = ($gm->sub_model($chr, "-"), $gm->sub_model($chr, "+"));
|
|
1149 foreach my $tree( ($r_tree, $f_tree) ) {
|
|
1150 my @tmp;
|
|
1151 @tmp = $f_tree->intersect([$pos1, $pos2]);
|
|
1152 foreach my $g (@tmp) {
|
|
1153 return 1 if($g->val->overlap([$pos1, $pos2]));
|
|
1154 }
|
|
1155 }
|
|
1156 return;
|
|
1157 }
|
|
1158
|
|
1159 sub _RNASeq_DEL_filter {
|
|
1160 my $self = shift;
|
|
1161 my $type = $self->type;
|
|
1162 return 1 if($type != DEL);
|
|
1163 print STDERR "RNAseq DEL filter\n";
|
|
1164
|
|
1165 my ($gene1, $gene2, $dist) = @{$self->get_genes};
|
|
1166 return if($gene1 eq $gene2);
|
|
1167 my ($d_plus, $d_minus) = (0, 0);
|
|
1168 $d_plus = $1 if($dist =~ m/(\d+)\(\+\)/);
|
|
1169 $d_minus = $1 if($dist =~ m/(\d+)\(\-\)/);
|
|
1170 return if($d_plus == 0 && $d_minus == 0);
|
|
1171 my ($bp1, $bp2) = ($self->{first_bp}, $self->{second_bp});
|
|
1172 return if($bp1->{cover} == 0 || $bp2->{cover} == 0);
|
|
1173 my($left_len, $right_len);
|
|
1174 if(exists $bp1->{left_len}) {
|
|
1175 ($left_len, $right_len ) = ($bp1->{left_len}, $bp1->{right_len});
|
|
1176 }
|
|
1177 else {
|
|
1178 $left_len = length($bp1->{sc_seq}) if($bp1->{sc_seq});
|
|
1179 $right_len = length($bp2->{sc_seq}) if($bp2->{sc_seq});
|
|
1180 }
|
|
1181 return if($left_len < 30 || $right_len < 30);
|
|
1182 return if($bp1->{sc_reads} < 10 || $bp2->{sc_reads} < 10);
|
|
1183 return unless $bp1->{left_gene}->overlap([$bp1->{pos} - $left_len, $bp1->{pos}]);
|
|
1184 return unless $bp1->{right_gene}->overlap([$bp2->{pos} + $right_len, $bp2->{pos}]);
|
|
1185 return 1;
|
|
1186 }
|
|
1187
|
|
1188 sub get_statistics {
|
|
1189 my $self = shift;
|
|
1190 my $half_width = shift;
|
|
1191 my $sam = $self->sam_d;
|
|
1192 my @rtn;
|
|
1193
|
|
1194 foreach my $bp ( ($self->{first_bp}, $self->{second_bp})) {
|
|
1195 my ($chr, $pos ) = ($bp->{chr}, $bp->{pos});
|
|
1196 my $range = $chr . ":" . ($pos - $half_width) . "-" . ($pos + $half_width);
|
|
1197 my ($n_seq, $n_rep, $total_pid) = (0, 0, 0);
|
|
1198
|
|
1199 $sam->fetch($range, sub {
|
|
1200 my $a = shift;
|
|
1201 return unless ($a->start && $a->end);
|
|
1202 return unless ($a->start >= $pos - $half_width && $a->end <= $pos + $half_width);
|
|
1203 $n_seq++;
|
|
1204 $total_pid += _cal_pid($a);
|
|
1205 if($a->has_tag("XT")) {
|
|
1206 $n_rep++ if($a->aux_get("XT") ne "U");
|
|
1207 }
|
|
1208 }
|
|
1209 );
|
|
1210 if($n_seq == 0) {
|
|
1211 push @rtn, (0, 0);
|
|
1212 }
|
|
1213 else {
|
|
1214 push @rtn, ($total_pid/$n_seq, $n_rep/$n_seq);
|
|
1215 }
|
|
1216 }
|
|
1217 return @rtn;
|
|
1218 }
|
|
1219
|
|
1220 sub _cal_pid {
|
|
1221 my $a = shift;
|
|
1222 my ($ref, $matches, $query) = $a->padded_alignment;
|
|
1223 my ($n_match, $n) = (0, 0);
|
|
1224 for( my $i = 0; $i < length($matches); $i++) {
|
|
1225 my $c = substr $matches, $i, 1;
|
|
1226 $n_match++ if($c eq "|");
|
|
1227 $n++;
|
|
1228 }
|
|
1229 return 0 if($n == 0);
|
|
1230 return $n_match/$n;
|
|
1231 }
|
|
1232
|
|
1233
|
|
1234 1;
|
|
1235 __END__;
|
|
1236
|
|
1237 =pod
|
|
1238
|
|
1239 =head1 NAME
|
|
1240
|