Mercurial > repos > jjohnson > crest
comparison CREST.pl @ 0:acc8d8bfeb9a
Uploaded
author | jjohnson |
---|---|
date | Wed, 08 Feb 2012 16:59:24 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc8d8bfeb9a |
---|---|
1 #!/usr/bin/perl -w | |
2 #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev'; | |
3 use strict; | |
4 use Carp; | |
5 use Getopt::Long; | |
6 use English; | |
7 use Pod::Usage; | |
8 use Data::Dumper; | |
9 use Bio::DB::Sam; | |
10 use Bio::DB::Sam::Constants; | |
11 use Bio::SearchIO; | |
12 use Bio::SeqIO; | |
13 use File::Temp qw/ tempfile tempdir /; | |
14 use File::Spec; | |
15 use File::Path; | |
16 use File::Copy; | |
17 use File::Basename; | |
18 use List::MoreUtils qw/ uniq /; | |
19 use Cwd; | |
20 use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP); | |
21 use SVUtil qw($use_scratch $work_dir clip_fa_file prepare_reads parse_range get_input_bam | |
22 find_smallest_cover get_work_dir is_PCR_dup read_fa_file get_sclip_reads); | |
23 use StructVar; | |
24 require SVExtTools; | |
25 | |
26 use Transcript; | |
27 use Gene; | |
28 use GeneModel; | |
29 | |
30 my $debug = 1; | |
31 $min_percent_id = 90; | |
32 # input/output | |
33 my ( $out_dir, $out_prefix, $range, $bam_d, $bam_g, $sclip_file); | |
34 my $out_suffix = "predSV.txt"; | |
35 my $read_len = 100; | |
36 my $ref_genome ; | |
37 | |
38 my $hetero_factor = 0.4; | |
39 my $triger_p_value = 0.05; | |
40 | |
41 #RNASeq support | |
42 my $RNASeq = 0; | |
43 my $gene_model_file; | |
44 my $gm_format = "REFFLAT"; | |
45 my $gm; | |
46 my $max_sclip_reads = 50; # we will consider the position if we have enough sclipped reads | |
47 my $min_one_side_reads = 5; | |
48 my $min_pct_sclip_reads = 5; # we require at least 5% reads have softclipping | |
49 | |
50 # external programs related variable | |
51 # 1. cap3 related variables | |
52 my $cap3 = "cap3"; | |
53 my $cap3_options=" -h 70 -y 10 > /dev/null"; | |
54 | |
55 # 2 blat related variables, using blat server and blat standalone | |
56 my $blat_client_exe = "gfClient"; | |
57 my $blat_client_options = ' -out=psl -nohead -minIdentity=95 -maxIntron=5'; | |
58 my $target_genome = "hg18.2bit"; | |
59 my $dir_2bit = "/"; | |
60 my $blat_server = "sjblat"; | |
61 my $blat_port = 50000; | |
62 my $blat_exe = "blat"; | |
63 my $blat_options = " -tileSize=7 -stepSize=1 -out=psl -minScore=15 -noHead -maxIntron=1 "; | |
64 | |
65 $use_scratch = 0; | |
66 my $paired = 1; | |
67 | |
68 my $rescue = 1; | |
69 # other options | |
70 my ($min_sclip_reads, $max_repetitive_cover, $min_sclip_len ); | |
71 my $min_hit_reads; | |
72 my $rmdup; | |
73 my $min_clip_len; | |
74 my ($max_score_diff, $max_num_hits); | |
75 my ($min_dist_diff, $min_hit_len) = (15, 15); | |
76 | |
77 #SV filter related parameters | |
78 my $max_bp_dist = 15; | |
79 my $min_percent_cons_of_read = 0.75; | |
80 my $germline_seq_width = 100; | |
81 my $germline_search_width = 50; | |
82 my $rm_tandem_repeat = 1; #tandem repeat mediated events | |
83 my $tr_max_indel_size = 100; #tandem repeat mediated indel size | |
84 my $tr_min_size = 2; | |
85 my $tr_max_size = 8; #tandem repeat max_size of single repeat | |
86 my $tr_min_num = 4; #tandem repeat minimum number of occurence | |
87 | |
88 # verification pupuse | |
89 my $verify_file; | |
90 my $sensitive; | |
91 my $cluster_size; | |
92 | |
93 # common help options | |
94 my ( $help, $man, $version, $usage ); | |
95 my $optionOK = GetOptions( | |
96 # SV validation related parameters | |
97 'max_bp_dist=i' => \$max_bp_dist, | |
98 'min_percent_cons_of_read=f' => \$min_percent_cons_of_read, | |
99 'germline_seq_width=i' => \$germline_seq_width, | |
100 'germline_search_width=i' => \$germline_search_width, | |
101 # tandem repeat related parameters | |
102 'rm_tandem_repeat!' => \$rm_tandem_repeat, | |
103 'tr_max_indel_size=i' => \$tr_max_indel_size, | |
104 'tr_min_size=i' => \$tr_min_size, | |
105 'tr_max_size=i' => \$tr_max_size, | |
106 'tr_min_num=i' => \$tr_min_num, | |
107 'hetero_factor=f' => \$hetero_factor, | |
108 'triger_p_value=f' => \$triger_p_value, | |
109 | |
110 # input/output | |
111 #'s|sample=s' => \$sample, | |
112 'd|input_d=s' => \$bam_d, | |
113 'g|inpug_g=s' => \$bam_g, | |
114 'f|sclipfile=s' => \$sclip_file, | |
115 'p|prefix=s' => \$out_prefix, | |
116 'o|out_dir=s' => \$out_dir, | |
117 'l|read_len=i' => \$read_len, | |
118 'ref_genome=s' => \$ref_genome, | |
119 'v|verify_file=s' => \$verify_file, | |
120 'sensitive' => \$sensitive, | |
121 'paired!' => \$paired, | |
122 'rescue!' => \$rescue, | |
123 'cluster_size=i' => \$cluster_size, | |
124 # external programs location and options | |
125 'scratch!' => \$use_scratch, | |
126 'cap3=s' => \$cap3, | |
127 'cap3opt=s' => \$cap3_options, | |
128 'blatclient=s' => \$blat_client_exe, | |
129 'blatclientopt=s' => \$blat_client_options, | |
130 'blat=s' => \$blat_exe, | |
131 't|target_genome=s' => \$target_genome, | |
132 'blatopt=s' => \$blat_options, | |
133 'blatserver=s' => \$blat_server, | |
134 'blatport=i' => \$blat_port, | |
135 '2bitdir=s' => \$dir_2bit, | |
136 #RNAseq support | |
137 'RNASeq' => \$RNASeq, | |
138 'genemodel=s' => \$gene_model_file, | |
139 'gmformat=s' => \$gm_format, | |
140 #other related parameters | |
141 'r|range=s' => \$range, | |
142 'max_score_diff=i' => \$max_score_diff, | |
143 'm|min_sclip_reads=i' => \$min_sclip_reads, | |
144 'min_one_side_reads=i' => \$min_one_side_reads, | |
145 'max_rep_cover=i' => \$max_repetitive_cover, | |
146 'min_sclip_len=i' => \$min_sclip_len, | |
147 'min_hit_len=i' => \$min_hit_len, | |
148 'min_dist_diff=i' => \$min_dist_diff, | |
149 'min_hit_reads=i' => \$min_hit_reads, | |
150 'rmdup!' => \$rmdup, | |
151 | |
152 # soft_clipping related parameters (from SVDector package) | |
153 'min_percent_id=i' => \$min_percent_id, | |
154 'min_percent_hq=i' => \$SCValidator::min_percent_hq, | |
155 'lowqual_cutoff=i' => \$lowqual_cutoff, | |
156 | |
157 # common help parameters | |
158 'h|help|?' => \$help, | |
159 'man' => \$man, | |
160 'usage' => \$usage, | |
161 'version' => \$version, | |
162 ); | |
163 | |
164 pod2usage(-verbose=>2) if($man or $usage); | |
165 pod2usage(1) if($help or $version ); | |
166 | |
167 my $start_dir = getcwd; | |
168 | |
169 # figure out input file | |
170 if(!$bam_d) { | |
171 pod2usage(1); | |
172 croak "You need specify input bam file(s)"; | |
173 } | |
174 | |
175 if(!$sclip_file) { | |
176 pod2usage(1); | |
177 croak "You need to specify the softclipping file"; | |
178 } | |
179 $sclip_file = File::Spec->rel2abs($sclip_file); | |
180 | |
181 if(!$ref_genome) { | |
182 pod2usage(1); | |
183 croak "You need to specify the reference genome used by bam file"; | |
184 } | |
185 | |
186 croak "The file you sepcified does not exist" unless ( | |
187 -e $sclip_file && -e $bam_d && -e $ref_genome && -e $target_genome); | |
188 my $input_base; | |
189 $bam_d = File::Spec->rel2abs($bam_d); | |
190 $input_base = fileparse($bam_d); | |
191 $bam_g = File::Spec->rel2abs($bam_g) if($bam_g); | |
192 | |
193 #RNASeq support | |
194 if($RNASeq) { | |
195 croak "You need specify the input gene model file" unless ($gene_model_file); | |
196 StructVar->add_RNASeq_filter(); | |
197 $min_sclip_reads = 10 unless $min_sclip_reads; | |
198 $max_repetitive_cover = 5000 unless $max_repetitive_cover; | |
199 $min_sclip_len = 20 unless $min_clip_len; | |
200 $max_score_diff = 5 unless $max_score_diff; | |
201 $max_num_hits = 3 unless $max_num_hits; | |
202 $min_hit_reads = 5 unless $min_hit_reads; | |
203 $cluster_size = 5 unless $cluster_size; | |
204 } | |
205 else { | |
206 $min_sclip_reads = 3 unless $min_sclip_reads; | |
207 $max_repetitive_cover = 500 unless $max_repetitive_cover; | |
208 $min_sclip_len = 20 unless $min_clip_len; | |
209 $max_score_diff = 5 unless $max_score_diff; | |
210 $max_num_hits = 10 unless $max_num_hits; | |
211 $min_hit_reads = 1 unless $min_hit_reads; | |
212 $cluster_size = 1 unless $cluster_size; | |
213 } | |
214 | |
215 if($gene_model_file) { | |
216 $gm = GeneModel->new if($gene_model_file); | |
217 $gm->from_file($gene_model_file, $gm_format); | |
218 StructVar->gene_model($gm); | |
219 } | |
220 | |
221 # set up the external programs and validators | |
222 # Those variable will be global | |
223 my $assembler = Assembler->new( | |
224 -PRG => $cap3, | |
225 -OPTIONS => $cap3_options | |
226 ); | |
227 | |
228 my $mapper = Mapper->new( | |
229 -PRG => join(' ', ($blat_client_exe, $blat_server, $blat_port)), | |
230 -OPTIONS => $blat_client_options, | |
231 -BIT2_DIR => $dir_2bit, | |
232 -MAX_SCORE_DIFF => $max_score_diff, | |
233 -MAX_NUM_HITS => $max_num_hits, | |
234 ); | |
235 | |
236 my $aligner = Aligner->new( | |
237 -PRG => $blat_exe, | |
238 -OPTIONS => $blat_options, | |
239 ); | |
240 | |
241 StructVar->assembler($assembler); | |
242 StructVar->read_len($read_len); | |
243 StructVar->aligner($aligner); | |
244 StructVar->mapper($mapper); | |
245 StructVar->RNASeq(1) if($RNASeq); | |
246 StructVar->genome($target_genome); | |
247 StructVar->remove_filter("tandem_repeat") unless($rm_tandem_repeat); | |
248 StructVar->tr_max_indel_size($tr_max_indel_size); | |
249 StructVar->tr_min_size($tr_min_size); | |
250 StructVar->tr_max_size($tr_max_size); | |
251 StructVar->tr_min_num($tr_min_num); | |
252 StructVar->max_bp_dist($max_bp_dist) if($max_bp_dist); | |
253 StructVar->germline_seq_width($germline_seq_width) if($germline_seq_width); | |
254 StructVar->germline_search_width($germline_search_width) if($germline_search_width); | |
255 | |
256 | |
257 my $validator = SCValidator->new(); | |
258 $validator->remove_validator('strand_validator') if(!$paired); | |
259 | |
260 #setup output and working directory | |
261 $out_dir = getcwd if(!$out_dir); | |
262 mkdir $out_dir if(!-e $out_dir || ! -d $out_dir); | |
263 $work_dir = get_work_dir(-SCRATCH => $use_scratch); | |
264 $work_dir = $out_dir if(!$work_dir); | |
265 $use_scratch = undef if($work_dir eq $out_dir); # don't erase the out_dir | |
266 chdir($work_dir); | |
267 | |
268 # figure out output prefix | |
269 if(!$out_prefix) { | |
270 $out_prefix = $input_base; | |
271 } | |
272 | |
273 my $sam_d = Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome); | |
274 StructVar->sam_d($sam_d); | |
275 my $sam_g; | |
276 if($bam_g) { | |
277 $sam_g = Bio::DB::Sam->new( -bam => $bam_g); | |
278 StructVar->sam_g($sam_g); | |
279 } | |
280 | |
281 #my $output_file = File::Spec->catfile($outdir, $out_prefix . $out_suffix); | |
282 my $output_file; | |
283 $output_file = join('.', $out_prefix, $out_suffix); | |
284 $output_file = File::Spec->catfile($out_dir, $output_file); | |
285 | |
286 # the softclip file is sorted, so no need to re-sort it | |
287 open my $SCLIP, "<$sclip_file" or croak "can't open $sclip_file:$OS_ERROR"; | |
288 my %sclip; | |
289 my $pre_p; | |
290 while( my $line = <$SCLIP> ) { | |
291 chomp $line; | |
292 my ($chr, $pos, $ort, $cover, $C) = split /\t/, $line; | |
293 if( ! exists($sclip{$chr})) { | |
294 $sclip{$chr} = []; | |
295 $pre_p = $pos; | |
296 } | |
297 $C = 30 unless $C; | |
298 if($pos < $pre_p) { | |
299 print STDERR "The input file is not sorted!"; | |
300 exit(1); | |
301 } | |
302 push @{$sclip{$chr}}, [$pos, $ort, $cover, $C]; | |
303 } | |
304 close $SCLIP; | |
305 | |
306 my @final_SV; | |
307 my @s_cover = @{find_smallest_cover($min_sclip_reads, $hetero_factor, $triger_p_value)}; | |
308 if($range) { | |
309 @final_SV = detect_range_SVs($sam_d, $range, \%sclip, \@s_cover); | |
310 } | |
311 else { | |
312 foreach my $chr (keys %sclip) { | |
313 my @tmpSV = detect_range_SVs($sam_d, $chr, \%sclip, \@s_cover); | |
314 @final_SV = @tmpSV unless (@final_SV); | |
315 foreach my $sv (@tmpSV) { | |
316 push @final_SV, $sv if($sv && ! is_dup_SV(\@final_SV, $sv)); | |
317 } | |
318 } | |
319 } | |
320 undef %sclip; | |
321 open my $OUT, ">$output_file" or croak "can't open $output_file:$OS_ERROR"; | |
322 foreach my $SV (@final_SV) { | |
323 if($SV->filter) { | |
324 print $OUT $SV->to_string ; | |
325 if($RNASeq) { | |
326 print $OUT "\t", join("\t", @{$SV->get_genes}); | |
327 } | |
328 print $OUT "\n"; | |
329 } | |
330 } | |
331 chdir $start_dir; | |
332 exit(0); | |
333 | |
334 sub detect_range_SVs { | |
335 my ($sam_d, $range, $r_sclips, $r_scover) = @_; | |
336 my ($chr, $start, $end) = parse_range($range); | |
337 my ($tid) = $sam_d->header->parse_region($chr); | |
338 my $chr_len = $sam_d->header->target_len->[$tid]; | |
339 my $r_range_sclips = search_sclip($r_sclips, $range); | |
340 return unless $r_range_sclips; | |
341 my @scover = @{$r_scover}; | |
342 my @rtn; | |
343 | |
344 my ($f_tree, $r_tree); | |
345 if($RNASeq) { | |
346 my $full_chr = $chr; | |
347 my ($gm_chr) = $gm->get_all_chr; | |
348 $full_chr = "chr" . $chr if($chr !~ m/chr/ && $gm_chr =~ /chr/); | |
349 ($f_tree, $r_tree) = ($gm->sub_model($full_chr, "+"), $gm->sub_model($full_chr, "-")) if($RNASeq); | |
350 } | |
351 my $n = scalar @{$r_range_sclips}; | |
352 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '+', 1, 50]; | |
353 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '-', 1, 50]; | |
354 $n = $n + 2; | |
355 | |
356 # try to save the 1 base off problem of soft-clipping | |
357 my (@ss1, $p1, $c1); | |
358 my (@ss2, $p2, $c2); | |
359 my ($n1, $n2, $cover1, $cover2) = (0, 0, 0, 0); | |
360 for( my $i = 0; $i < $n; $i++) { | |
361 my $s = $r_range_sclips->[$i]; | |
362 # print join("\t", @{$s}), "\n" if($debug); | |
363 my $clip = $s->[1] eq '+' ? RIGHT_CLIP : LEFT_CLIP; | |
364 next if($s->[0] >= $chr_len ); | |
365 if($clip == RIGHT_CLIP) { | |
366 $p1 = $c1 = $s->[0] unless $p1; | |
367 if($s->[0] < $c1) { | |
368 print STDERR "The soft-clipping file has problem! It's either not sorted or | |
369 the file has mutliple parts for a chromsome!"; | |
370 last; | |
371 } | |
372 if($s->[0] - $c1 < $cluster_size) { | |
373 $n1 += $s->[2]; | |
374 $cover1 = $s->[3] if($cover1 < $s->[3]); | |
375 push @ss1, $s; | |
376 if($s->[0] < $c1) { | |
377 $p1 += $s->[0]; | |
378 $c1 = $p1 / scalar @ss1; | |
379 } | |
380 next; | |
381 } | |
382 } | |
383 else { | |
384 $p2 = $c2 = $s->[0] unless $p2; | |
385 if($s->[0] < $c2) { | |
386 print STDERR "The soft-clipping file has problem! It's either not sorted or | |
387 the file has mutliple parts for a chromsome!"; | |
388 last; | |
389 } | |
390 if($s->[0] - $p2 < $cluster_size) { | |
391 $n2 += $s->[2]; | |
392 $cover2 = $s->[3] if($cover2 < $s->[3]); | |
393 push @ss2, $s; | |
394 if($s->[0] < $c2) { | |
395 $p2 += $s->[0]; | |
396 $c2 = $p2 / scalar @ss2; | |
397 } | |
398 next; | |
399 } | |
400 } | |
401 | |
402 my @cc = $clip == LEFT_CLIP ? @ss2 : @ss1; | |
403 my $pos = $clip == LEFT_CLIP ? $ss2[$#ss2]->[0] : $ss1[0]->[0]; | |
404 my ($n_s, $cover_s) = $clip == LEFT_CLIP ? ($n2, $cover2) : ($n1, $cover1); | |
405 my $tmp_range = $clip == LEFT_CLIP ? [$ss2[0]->[0], $ss2[$#ss2]->[0] + $cluster_size] : | |
406 [$ss1[0]->[0] - $cluster_size, $ss1[$#ss1]->[0]]; | |
407 | |
408 if($clip == LEFT_CLIP) { | |
409 @ss2 = (); $p2 = $c2 = $s->[0]; | |
410 $n2 = $s->[2]; | |
411 $cover2 = $s->[3]; | |
412 push @ss2, $s; | |
413 } | |
414 else { | |
415 @ss1 = (); $p1 = $c1 = $s->[0]; | |
416 $n1 = $s->[2]; | |
417 $cover1 = $s->[3]; | |
418 push @ss1, $s; | |
419 } | |
420 | |
421 my @s_reads; | |
422 if($RNASeq) { | |
423 # print $n_s, "\n"; | |
424 next if($n_s < $min_sclip_reads && $cover_s > $s_cover[$n_s] ) ; | |
425 next if($n_s < $max_sclip_reads && $n_s * 100 <= $cover_s * $min_pct_sclip_reads); | |
426 my @f_genes = $f_tree->intersect($tmp_range); | |
427 my @r_genes = $r_tree->intersect($tmp_range); | |
428 next if(scalar @f_genes == 0 && scalar @r_genes == 0); | |
429 } | |
430 else { | |
431 if($n_s < $min_sclip_reads) { #too few covered | |
432 if($cover_s > $s_cover[$n_s]) { | |
433 next if(!$sensitive); | |
434 foreach my $c (@cc) { | |
435 next if($c->[0] >= $chr_len); | |
436 push @s_reads, get_sclip_reads(-SAM => $sam_d, | |
437 -CHR =>$chr, | |
438 -START => $c->[0], | |
439 -END => $c->[0], | |
440 -CLIP => $clip, | |
441 -MINCLIPLEN => 0, | |
442 -VALIDATOR => $validator, | |
443 -PAIRED => $paired, | |
444 -RMDUP => $rmdup, | |
445 -EXTRA => abs($c->[0] - $pos) ); | |
446 } | |
447 } | |
448 } | |
449 } | |
450 if(! @s_reads) { | |
451 foreach my $c (@cc) { | |
452 next if($c->[0] >= $chr_len); | |
453 push @s_reads, get_sclip_reads(-SAM => $sam_d, | |
454 -CHR =>$chr, | |
455 -START => $c->[0], | |
456 -END => $c->[0], | |
457 -CLIP => $clip, | |
458 -MINCLIPLEN => 0, | |
459 -VALIDATOR => $validator, | |
460 -PAIRED => $paired, | |
461 -RMDUP => $rmdup, | |
462 -EXTRA => abs($c->[0] - $pos) ); | |
463 } | |
464 } | |
465 my $l = 0; | |
466 foreach my $r (@s_reads) { | |
467 my $len = length $r->{seq}; | |
468 $l = $len if($l < $len); | |
469 } | |
470 next if ($l < $min_sclip_len); | |
471 print STDERR join("\t", ($chr, $pos, $clip == LEFT_CLIP ? "-" : "+", scalar @s_reads)), "\n"; | |
472 my @SV = detect_SV( | |
473 -SAM => $sam_d, | |
474 -CHR => $chr, | |
475 -POS => $pos, | |
476 -ORT => $clip == LEFT_CLIP ? "-" : "+", | |
477 -SCLIP => \%sclip, | |
478 -COVER => $cover_s, | |
479 -SREADS => \@s_reads); | |
480 foreach my $sv (@SV) { | |
481 $sv->{type} = $sv->type; | |
482 push @rtn, $sv if($sv && ! is_dup_SV(\@rtn, $sv)); | |
483 } | |
484 } | |
485 | |
486 if($RNASeq) { | |
487 push @rtn, find_del_SVs($sam_d, $chr, $start, $end); | |
488 } | |
489 return @rtn; | |
490 } | |
491 | |
492 sub is_dup_SV { | |
493 my($r_SVs, $sv) = @_; | |
494 foreach my $s (@{$r_SVs}) { | |
495 return 1 | |
496 if( $s->first_bp->{pos} == $sv->second_bp->{pos} && | |
497 $s->first_bp->{chr} eq $sv->second_bp->{chr} && | |
498 $s->second_bp->{pos} == $sv->first_bp->{pos} && | |
499 $s->second_bp->{chr} eq $sv->first_bp->{chr} && | |
500 $s->{type} == $sv->{type} ); | |
501 return 1 | |
502 if( $s->first_bp->{pos} == $sv->first_bp->{pos} && | |
503 $s->first_bp->{chr} eq $sv->first_bp->{chr} && | |
504 $s->second_bp->{pos} == $sv->second_bp->{pos} && | |
505 $s->second_bp->{chr} eq $sv->second_bp->{chr} && | |
506 $s->{type} == $sv->{type} ); | |
507 | |
508 } | |
509 return; | |
510 } | |
511 | |
512 sub search_sclip { | |
513 my ($r_sclip, $range) = @_; | |
514 my ($chr, $start, $end) = parse_range($range); | |
515 return $r_sclip->{$chr} if(!$start); | |
516 my $start_i = bin_search($r_sclip->{$chr}, $start); | |
517 my $end_i = bin_search($r_sclip->{$chr}, $end); | |
518 my @tmp = @{$r_sclip->{$chr}}; | |
519 if($start_i <= $end_i) { | |
520 @tmp = @tmp[$start_i .. $end_i]; | |
521 if($start_i == $end_i) { | |
522 return undef if($tmp[0]->[0] < $start || $tmp[0]->[0] > $end); | |
523 } | |
524 return \@tmp; | |
525 } | |
526 else { | |
527 return undef; | |
528 } | |
529 } | |
530 | |
531 sub bin_search { | |
532 my ($a, $p) = @_; | |
533 my ($s, $e) = (0, scalar(@{$a})-1); | |
534 my $m = int( ($s + $e)/2); | |
535 while(1) { | |
536 return $s if($a->[$s][0] >= $p); | |
537 return $e if($a->[$e][0] <= $p); | |
538 return $m if($a->[$m][0] == $p || ($a->[$m-1][0] < $p && $a->[$m+1][0] > $p)); | |
539 if($a->[$m][0] > $p) { | |
540 $e = $m; | |
541 } | |
542 else { | |
543 $s = $m; | |
544 } | |
545 $m = int( ($s+$e)/2 ); | |
546 } | |
547 } | |
548 | |
549 sub count_coverage { | |
550 my ($sam, $chr, $pos, $clip) = @_; | |
551 if($rmdup && !$RNASeq) { | |
552 my @pairs; | |
553 my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos); | |
554 my $n = 0; | |
555 return 0 unless $seg; | |
556 my $itr = $seg->features(-iterator => 1); | |
557 while( my $a = $itr->next_seq) { | |
558 next unless($a->start && $a->end); #why unmapped reads here? | |
559 my $sclip_len = 0; | |
560 if($clip) { | |
561 my @cigar_array = @{$a->cigar_array}; | |
562 #$sclip_len = $1 if($a->cigar_str =~ m/S(\d+)$/ && $clip == RIGHT_CLIP); | |
563 $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP); | |
564 #$sclip_len = $1 if($a->cigar_str =~ m/^S(\d+)/ && $clip == LEFT_CLIP); | |
565 $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP); | |
566 } | |
567 next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len)); | |
568 $n++; | |
569 return $n if( $n > $max_repetitive_cover); | |
570 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0, | |
571 $a->mate_end ? $a->mate_end : 0, $sclip_len]; | |
572 } | |
573 return $n; | |
574 } | |
575 else{ | |
576 my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr, | |
577 -start => $pos, -end => $pos); | |
578 return 0 unless $c; | |
579 my @c_d = $c->coverage; | |
580 return $c_d[0]; | |
581 } | |
582 } | |
583 | |
584 sub detect_SV { | |
585 my %param = @_; | |
586 my @s_reads = @{$param{-SREADS}}; | |
587 my($sam, $chr, $ort, $r_sclip, $coverage) = ( $param{-SAM}, | |
588 $param{-CHR}, $param{-ORT}, $param{-SCLIP}, $param{-COVER}); | |
589 my $clip = $ort eq '+' ? RIGHT_CLIP : LEFT_CLIP; | |
590 return if($coverage > $max_repetitive_cover); | |
591 my $fa_name = prepare_reads(\@s_reads, $clip); | |
592 my($contig_file, $sclip_count, $contig_reads) = $assembler->run($fa_name); | |
593 return if(!$contig_file or !(-e $contig_file)); | |
594 | |
595 $contig_file = clip_fa_file($contig_file, $clip); | |
596 my $contig_seqs = read_fa_file($contig_file); | |
597 if(scalar @s_reads == 1) { | |
598 $contig_file = clip_fa_file($fa_name, $clip); | |
599 $contig_seqs = read_fa_file($contig_file); | |
600 my @seq_names = keys %{$contig_seqs}; | |
601 $sclip_count->{$seq_names[0]} = 1; | |
602 } | |
603 | |
604 my @SV; | |
605 my $where = ($clip == LEFT_CLIP ? "right" : "left"); | |
606 my $mapping = $mapper->run( -QUERY => $contig_file ); | |
607 my (%reads, %quals, %s_lens, %pos); | |
608 foreach my $r (@s_reads) { | |
609 $reads{$r->{name}} = $r->{full_seq}; | |
610 $quals{$r->{name}} = $r->{full_qual}; | |
611 $s_lens{$r->{name}} = length($r->{seq}); | |
612 $pos{$r->{name}} = $r->{pos}; | |
613 } | |
614 | |
615 foreach my $s (keys(%{$mapping})) { | |
616 next if($sclip_count->{$s} < $min_sclip_reads); | |
617 | |
618 # try to find a better mapping | |
619 my @tmp_reads; | |
620 my $selected_read; | |
621 my @tmp_pos; | |
622 foreach my $n (@{$contig_reads->{$s}}) { | |
623 push @tmp_pos, $pos{$n}; | |
624 } | |
625 @tmp_pos = uniq @tmp_pos; | |
626 my $real_pos = $clip == LEFT_CLIP ? $tmp_pos[$#tmp_pos] : $tmp_pos[0]; | |
627 | |
628 my $n_new_SV = 0; | |
629 my $tmp_bp; | |
630 foreach my $t (@{$mapping->{$s}}) { | |
631 my $qort = $t->{qstrand}; | |
632 my $t_pos = $t->{tstart}; | |
633 $t_pos = $t->{tend} if( ($qort eq "+" && $clip == LEFT_CLIP) || | |
634 ($qort eq '-' && $clip == RIGHT_CLIP)); | |
635 if($t->{tchr} eq $chr && (abs($t_pos - $real_pos) < $min_dist_diff)) { # the soft clipping is incorrect | |
636 @SV = @SV[0 .. ($#SV - $n_new_SV + 1)] if($n_new_SV > 0); | |
637 last; | |
638 } | |
639 my $first_bp = {}; | |
640 $first_bp->{sc_reads} = $sclip_count->{$s}; | |
641 $first_bp->{$where . "_chr"} = $chr; | |
642 $first_bp->{$where . "_pos"} = $real_pos; | |
643 $first_bp->{all_pos} = \@tmp_pos; | |
644 $first_bp->{$where . "_ort"} = "+"; | |
645 $first_bp->{pos} = $real_pos; | |
646 $first_bp->{cover} = $coverage; | |
647 $first_bp->{chr} = $chr; | |
648 my $new_where = ($where eq "right" ? "left" : "right"); | |
649 | |
650 $first_bp->{$new_where . "_chr"} = $t->{tchr}; | |
651 $first_bp->{$new_where . "_pos"} = $t_pos; | |
652 $first_bp->{$new_where . "_ort"} = $qort; | |
653 $first_bp->{sc_seq} = $contig_seqs->{$s}; | |
654 $first_bp->{reads} = $contig_reads->{$s}; | |
655 | |
656 $tmp_bp = $first_bp unless $tmp_bp; | |
657 my $second_bp = check_sclip(-SAM => $sam, -TARGET => $t, -CHR => $chr, | |
658 -POS => $real_pos, -SCLIP => $r_sclip, -CLIP => $clip); | |
659 if(@{$second_bp}) { | |
660 foreach my $tmp_bp (@{$second_bp}) { | |
661 push @SV, StructVar->new(-FIRST_BP => $first_bp, -SECOND_BP => $tmp_bp); | |
662 $n_new_SV++; | |
663 } | |
664 } | |
665 } | |
666 # save some one side good soft-clipping SV | |
667 if( $rescue == 1 && | |
668 $n_new_SV == 0 && | |
669 $sclip_count->{$s} >= $min_one_side_reads && | |
670 (scalar(@{$mapping->{$s}}) == 1 || ($mapping->{$s}[0]{perfect} == 1 && $mapping->{$s}->[1]{perfect} == 0)) && | |
671 $tmp_bp->{chr} && | |
672 length($contig_seqs->{$s}) * 0.95 < $mapping->{$s}[0]{matches} ) { | |
673 | |
674 my %second_bp = %{$tmp_bp}; | |
675 $second_bp{sc_reads} = 0; | |
676 $second_bp{sc_seq} = ''; | |
677 ($second_bp{chr}, $second_bp{pos}) = $second_bp{pos} == $second_bp{left_pos} ? | |
678 ($second_bp{right_chr}, $second_bp{right_pos}) : ($second_bp{left_chr}, $second_bp{left_pos}); | |
679 $second_bp{cover} = count_coverage($sam, $second_bp{chr}, $second_bp{pos}); | |
680 $second_bp{reads} = []; | |
681 push @SV, StructVar->new(-FIRST_BP => $tmp_bp, -SECOND_BP => \%second_bp); | |
682 } | |
683 | |
684 } | |
685 system("rm $fa_name"); system("rm $fa_name*"); | |
686 return @SV; | |
687 } | |
688 | |
689 sub get_gene_range { | |
690 my ($chr, $pos, $clip) = @_; | |
691 my $r = $clip == LEFT_CLIP ? [$pos, $pos+5] : [$pos - 5, $pos]; | |
692 my @genes; | |
693 my $tmp = $chr; | |
694 $tmp = "chr" . $tmp if($tmp !~ m/chr/); | |
695 my ($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); | |
696 push @genes, $f_tree->intersect($r); | |
697 push @genes, $r_tree->intersect($r); | |
698 my ($s, $e); | |
699 if(scalar @genes == 0) { #no gene here | |
700 return [$pos - $read_len, $pos + $read_len]; | |
701 } | |
702 foreach my $g (@genes) { | |
703 my $start = $g->val->get_start($pos, $read_len); | |
704 my $end = $g->val->get_end($pos, $read_len); | |
705 $s = $start if(!$s || $s > $start); | |
706 $e = $end if(!$e || $e < $end); | |
707 } | |
708 return [$s, $e]; | |
709 } | |
710 | |
711 sub check_sclip { | |
712 my %arg = @_; | |
713 my ($sam, $chr, $pos, $target, $r_sclip, $clip) = | |
714 ( $arg{-SAM}, $arg{-CHR}, $arg{-POS}, $arg{-TARGET}, $arg{-SCLIP}, $arg{-CLIP} ); | |
715 | |
716 my @bp; | |
717 | |
718 # identify the searching region for partner soft cliping | |
719 my $extension; | |
720 if($RNASeq) { | |
721 $extension = 50; | |
722 } | |
723 else { | |
724 $extension = $read_len - ($target->{qend} - $target->{qstart}); | |
725 } | |
726 my ($tpos, $ort) = ($target->{tend}, $target->{qstrand}); | |
727 $tpos = $target->{tstart} if( ($clip == LEFT_CLIP && $ort eq '-') | |
728 || ($clip == RIGHT_CLIP && $ort eq '+')); | |
729 $extension = abs($tpos - $pos) - 1 | |
730 if($chr eq $target->{tchr} && abs($tpos - $pos) <= $extension); # don't consider itself | |
731 my $range =$target->{tchr} . ":"; | |
732 if($tpos < $extension) { | |
733 $range .= '1'; | |
734 } | |
735 else{ | |
736 $range .= $tpos - $extension; | |
737 } | |
738 | |
739 $range .= "-"; $range .= $tpos + $extension; | |
740 | |
741 # the orginal genome sequence where we find the soft_clipping | |
742 my $orig; | |
743 my $tmp = $chr; | |
744 #$tmp = 'chr' . $tmp if($tmp !~ m/^chr/); | |
745 $orig = $target_genome . ":" . $tmp . ":"; | |
746 my $base_pos; | |
747 if($RNASeq) { #let's do a wild guess! | |
748 my $r = get_gene_range($tmp, $pos, $clip); | |
749 return \@bp unless $r; | |
750 $base_pos = $r->[0]; | |
751 $orig = join("", ($orig, $r->[0], "-", $r->[1])); | |
752 } | |
753 else { | |
754 $base_pos = $pos - $read_len; | |
755 $orig = join("", ($orig, $pos - $read_len, "-", $pos + $read_len)); | |
756 } | |
757 | |
758 return \@bp if(!exists( $r_sclip->{$target->{tchr}})); #mapped to chrY or other minor chrs | |
759 # the soft clipping happens in highly repetitive region | |
760 # return \@bp if($coverage > $max_repetitive_cover); | |
761 | |
762 my $r_pp = search_sclip($r_sclip, $range); | |
763 return \@bp if(!$r_pp); | |
764 | |
765 my $real_pos; | |
766 my $found = 0; | |
767 my @r_reads; | |
768 my @l_reads; | |
769 foreach my $s (@{$r_pp}) { | |
770 #next if($s->[2] < $min_hit_reads); | |
771 next if($chr ne $target->{tchr} && ( | |
772 ($clip == LEFT_CLIP && $ort ne $s->[1] ) || | |
773 ($clip == RIGHT_CLIP && $ort eq $s->[1] )) ); | |
774 | |
775 next if($chr eq $target->{tchr} && ($s->[0] < $tpos - $extension | |
776 || $s->[0] > $tpos + $extension)); | |
777 my $tort = $s->[1]; | |
778 next if($s->[3] > $max_repetitive_cover); | |
779 my $tclip = $tort eq '+' ? RIGHT_CLIP : LEFT_CLIP; | |
780 my @reads = get_sclip_reads( | |
781 -SAM => $sam, | |
782 -CHR => $target->{tchr}, | |
783 -START => $s->[0], | |
784 -END => $s->[0], | |
785 -CLIP => $tclip, | |
786 -MINCLIPLEN => $min_hit_len, | |
787 -VALIDATOR => $validator, | |
788 -PAIRED => $paired, | |
789 -RMDUP => $rmdup); | |
790 next if(!@reads || scalar(@reads) == 0); | |
791 # push @r_reads, @reads if($tclip == RIGHT_CLIP); | |
792 # push @l_reads, @reads if($tclip == LEFT_CLIP); | |
793 # } | |
794 | |
795 # foreach my $tclip ( (RIGHT_CLIP, LEFT_CLIP)) { | |
796 # my $s_reads = $tclip == RIGHT_CLIP ? \@r_reads : \@l_reads; | |
797 # next if(scalar @{$s_reads} == 0); | |
798 my %count; | |
799 # my $fa_name = prepare_reads($s_reads, $tclip); | |
800 my $fa_name = prepare_reads(\@reads, $tclip); | |
801 | |
802 my($contig_file, $sclip_count, $contig_reads, $singlet_file) = $assembler->run($fa_name); | |
803 my (%reads, %quals, %s_lens, %pos); | |
804 # foreach my $r (@{$s_reads}) { | |
805 foreach my $r (@reads) { | |
806 $reads{$r->{name}} = $r->{full_seq}; | |
807 $quals{$r->{name}} = $r->{full_qual}; | |
808 $s_lens{$r->{name}} = length($r->{seq}); | |
809 $pos{$r->{name}} = $r->{pos}; | |
810 } | |
811 | |
812 $contig_file = clip_fa_file($contig_file, $tclip); | |
813 if( -s $singlet_file ) { | |
814 $singlet_file = clip_fa_file($singlet_file, $tclip); | |
815 system("cat $singlet_file >> $contig_file"); | |
816 system("rm $singlet_file"); | |
817 } | |
818 my $seqs = read_fa_file($contig_file); | |
819 my $hits = $aligner->run( -TARGET => $orig, -QUERY => $contig_file); | |
820 foreach my $t (keys %{$hits}) { | |
821 my $h = $hits->{$t}; | |
822 my $tmp_bp; | |
823 my @all_pos; | |
824 my $all_reads_name; | |
825 if(exists $contig_reads->{$t}) { | |
826 foreach my $n (@{$contig_reads->{$t}}) { | |
827 push @all_pos, $pos{$n}; | |
828 } | |
829 $all_reads_name = $contig_reads->{$t}; | |
830 @all_pos = uniq @all_pos; | |
831 @all_pos = sort {$a <=> $b} @all_pos; | |
832 | |
833 } | |
834 else { | |
835 push @all_pos, $pos{$t}; | |
836 $all_reads_name = [$t]; | |
837 } | |
838 | |
839 if(is_good_hit($h, $clip, $tclip)) { | |
840 my $hit_ort = ($h->strand('query') == 1 ? "+" : "-"); | |
841 my $real_pos = $tclip == RIGHT_CLIP ? $all_pos[0] : $all_pos[$#all_pos]; | |
842 if($tclip == RIGHT_CLIP ) { | |
843 $tmp_bp = { | |
844 left_ort => '+', | |
845 left_pos => $real_pos, | |
846 left_chr => $target->{tchr}, | |
847 right_ort => $hit_ort, | |
848 right_pos => ($hit_ort eq "+" ? $h->start("hit") : $h->end("hit")) + $base_pos, | |
849 right_chr => $chr, | |
850 } | |
851 } | |
852 else { | |
853 $tmp_bp = { | |
854 left_ort => $hit_ort, | |
855 left_chr => $chr, | |
856 left_pos => ($hit_ort eq "+" ? $h->end("hit") : $h->start("hit")) + $base_pos, | |
857 right_ort => "+", | |
858 right_chr => $target->{tchr}, | |
859 right_pos => $real_pos, | |
860 } | |
861 } | |
862 $tmp_bp->{chr} = $target->{tchr}; | |
863 $tmp_bp->{pos} = $real_pos; | |
864 $tmp_bp->{all_pos} = \@all_pos; | |
865 $tmp_bp->{sc_seq} = $seqs->{$t}; | |
866 $tmp_bp->{cover} = count_coverage($sam, $target->{tchr}, $real_pos); | |
867 $tmp_bp->{sc_reads} = exists $sclip_count->{$t} ? $sclip_count->{$t} : 1; | |
868 $tmp_bp->{reads} = exists $contig_reads->{$t} ? $contig_reads->{$t} : [$t]; | |
869 | |
870 push @bp, $tmp_bp; | |
871 } | |
872 } | |
873 system("rm $fa_name"); system("rm $fa_name*"); | |
874 } | |
875 return \@bp; | |
876 } | |
877 | |
878 # many more filter can be added here | |
879 sub is_good_hit { | |
880 my ($hit, $clip, $tclip) = @_; | |
881 | |
882 return if($hit->length_aln('query') < $min_hit_len); | |
883 return if($hit->frac_identical * 100 < $min_percent_id ); | |
884 return 1; | |
885 | |
886 # do we want to do the check? | |
887 my $ort = ($hit->start('query') < $hit->end('query') ? "+" : "-"); | |
888 my ($dist, $t_dist, $qstart, $qend); | |
889 $qstart = $ort eq '+' ? $hit->start('query') : $hit->end('query') ; | |
890 $qend = $ort eq '+' ? $hit->end('query') : $hit->start('query'); | |
891 $t_dist = $qstart; | |
892 $t_dist = $hit->query_length - $qend if($tclip == LEFT_CLIP); | |
893 | |
894 } | |
895 | |
896 # RNASeq support to identify deletions | |
897 | |
898 sub cigar2hit { | |
899 my ($s, @cigar_array) = @_; | |
900 my @pos; | |
901 my $p = $s; | |
902 foreach my $c (@cigar_array){ | |
903 my($op, $len) = @{$c}; | |
904 if($op eq 'N') { | |
905 push @pos, [$s, $p]; | |
906 $s = $p + $len; | |
907 $p = $s; | |
908 next; | |
909 } | |
910 $p += $len if($op eq 'M' || $op eq 'D'); | |
911 } | |
912 push @pos, [$s, $p]; | |
913 return \@pos; | |
914 } | |
915 | |
916 sub find_del_SVs { | |
917 my ($sam, $chr, $start, $end) = @_; | |
918 my $max_sclip_len = 0; | |
919 my $range = $chr; | |
920 if($start && $end) { | |
921 $range = $chr . ":" . $start . "-" . $end; | |
922 } | |
923 my %SV; | |
924 my $tmp = $chr; | |
925 $tmp = "chr" . $chr if($chr !~ m/chr/); | |
926 | |
927 my($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-")); | |
928 | |
929 $sam->fetch($range, sub { | |
930 my $a = shift; | |
931 my $cigar_str = $a->cigar_str; | |
932 # print $a->qname, "\t", $cigar_str, "\n"; | |
933 return unless $cigar_str =~ m/N/; # the read cross two genes must have N | |
934 # return unless $a->score >= 97; | |
935 my $sv = identify_del_SV($f_tree, $a); | |
936 if($sv){ | |
937 my $tmp1 = $sv->[0] . "+" . $sv->[1]; | |
938 if(!exists($SV{$tmp1})) { | |
939 $SV{$tmp1} = {}; | |
940 $SV{$tmp1}->{num} = 0; | |
941 $SV{$tmp1}->{left} = 0; | |
942 $SV{$tmp1}->{right} = 0; | |
943 } | |
944 $SV{$tmp1}->{num}++; | |
945 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); | |
946 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); | |
947 $SV{$tmp1}->{gene_ort} = "+"; | |
948 $SV{$tmp1}->{left_gene} = $sv->[3]; | |
949 $SV{$tmp1}->{right_gene} = $sv->[4]; | |
950 push @{$SV{$tmp1}->{reads}}, $a->qname; | |
951 | |
952 } | |
953 $sv = identify_del_SV($r_tree, $a); | |
954 if($sv){ | |
955 my $tmp1 = $sv->[0] . "-" . $sv->[1]; | |
956 if(!exists($SV{$tmp1})) { | |
957 $SV{$tmp1} = {}; | |
958 $SV{$tmp1}->{num} = 0; | |
959 $SV{$tmp1}->{left} = 0; | |
960 $SV{$tmp1}->{right} = 0; | |
961 $SV{$tmp1}->{reads} = []; | |
962 } | |
963 $SV{$tmp1}->{num}++; | |
964 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]); | |
965 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]); | |
966 $SV{$tmp1}->{gene_ort} = "-"; | |
967 $SV{$tmp1}->{left_gene} = $sv->[3]; | |
968 $SV{$tmp1}->{right_gene} = $sv->[4]; | |
969 push @{$SV{$tmp1}->{reads}}, $a->qname; | |
970 } | |
971 } | |
972 ); | |
973 | |
974 my @rtn; | |
975 foreach my $sv (keys %SV) { | |
976 my( $pos1, $pos2 ) = $sv =~ m/(\d+)[+|-](\d+)/; | |
977 print $pos1, "\t", $pos2, "\n"; | |
978 my( $cover1, $cover2) = (count_coverage($sam, $chr, $pos1 - 1), count_coverage($sam, $chr, $pos2)); | |
979 next if($SV{$sv}->{num} < $min_sclip_reads); | |
980 my $cover = $cover1 < $cover2 ? $cover1 : $cover2; | |
981 next if($SV{$sv}->{num} * 100 < $cover * $min_pct_sclip_reads && $SV{$sv} < 50); | |
982 push @rtn, StructVar->new ( | |
983 -FIRST_BP => { left_ort => '+', | |
984 left_pos => $pos1 - 1, | |
985 left_chr => $chr, | |
986 right_ort => '+', | |
987 right_pos => $pos2, | |
988 right_chr => $chr, | |
989 chr => $chr, | |
990 cover => count_coverage($sam, $chr, $pos1 - 1), | |
991 pos => $pos1 - 1, | |
992 sc_reads => $SV{$sv}->{num}, | |
993 left_len => $SV{$sv}->{left}, | |
994 right_len => $SV{$sv}->{right}, | |
995 gene_ort => $SV{$sv}->{gene_ort}, | |
996 left_gene => $SV{$sv}->{left_gene}, | |
997 right_gene => $SV{$sv}->{right_gene}, | |
998 reads => $SV{$sv}->{reads}, | |
999 }, | |
1000 -SECOND_BP => { left_ort => '+', | |
1001 left_pos => $pos1 - 1, | |
1002 left_chr => $chr, | |
1003 right_ort => '+', | |
1004 right_pos => $pos2, | |
1005 right_chr => $chr, | |
1006 chr => $chr, | |
1007 pos => $pos2, | |
1008 cover => count_coverage($sam, $chr, $pos2), | |
1009 sc_reads => $SV{$sv}->{num}, | |
1010 reads => [], | |
1011 } | |
1012 ); | |
1013 } | |
1014 return @rtn; | |
1015 } | |
1016 | |
1017 sub identify_del_SV { | |
1018 my ($tree, $a) = @_; | |
1019 my @cigar_array = @{$a->cigar_array}; | |
1020 return unless($a->start && $a->end); | |
1021 return if(scalar $tree->intersect([$a->start, $a->end]) <= 1); | |
1022 | |
1023 my @hits = @{ cigar2hit($a->start, @cigar_array) }; | |
1024 my @genes; | |
1025 my $last_gene; | |
1026 my @change; | |
1027 my ($left, $right) = (0, 0); | |
1028 for( my $i = 0; $i < scalar @hits; $i++) { | |
1029 my $h = $hits[$i]; | |
1030 my @g = $tree->intersect($h); | |
1031 my $g_size = scalar @g; | |
1032 if($g_size == 0) { | |
1033 # print STDERR $a->qname, " is mapped outside of gene\n"; | |
1034 next; | |
1035 } | |
1036 if( scalar @g == 1) { | |
1037 push @genes, $g[0] unless ($last_gene && $last_gene->val->name eq $g[0]->val->name); | |
1038 if($last_gene && $last_gene->val->name ne $g[0]->val->name) { | |
1039 $right += ($h->[1] - $h->[0] ); | |
1040 push @change, $i; | |
1041 } | |
1042 $left += ($h->[1] - $h->[0] ) if($right == 0); | |
1043 $last_gene = $g[0]; | |
1044 next; | |
1045 } | |
1046 # print STDERR $a->qname, " connect two genes into one?\n"; | |
1047 return; | |
1048 } | |
1049 return if(scalar @genes <= 1); | |
1050 if(scalar @genes > 2) { | |
1051 # print STDERR $a->qname, "crossed more than 2 genes!\n"; | |
1052 return; | |
1053 } | |
1054 # now we down to 2 genes | |
1055 my ($left_gene, $right_gene) = @genes; | |
1056 return unless $left_gene->val->overlap([$hits[$change[0]-1]->[0], $hits[$change[0]-1]->[1]]); | |
1057 return unless $right_gene->val->overlap([$hits[$change[0]]->[0], $hits[$change[0]]->[1]]); | |
1058 my $p = $left_gene->val->end; | |
1059 print join("\t", ( $a->score, | |
1060 $a->cigar_str, $left, $right, $hits[$change[0]-1]->[1], $hits[$change[0]]->[0], | |
1061 $left_gene->val->name, $right_gene->val->name)), "\n"; | |
1062 return [$hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $a, $left_gene->val, $right_gene->val, $left, $right]; | |
1063 } | |
1064 | |
1065 =head1 NAME | |
1066 | |
1067 CREST.pl - a Structure Variation detection tools using softclipping for | |
1068 whole genome sequencing. | |
1069 | |
1070 | |
1071 =head1 VERSION | |
1072 | |
1073 This documentation refers to CREST.pl version 0.0.1. | |
1074 | |
1075 | |
1076 =head1 USAGE | |
1077 | |
1078 This program depends on several things that need to be installed and/or | |
1079 specified. The program uses BioPerl and Bio::DB::Sam module to parse | |
1080 the files and bam files. Also it uses Blat software suits to do genome | |
1081 mapping and alignment. To make the program efficient, it also requires | |
1082 a blat server setup. And the program uses CAP3 assembler. | |
1083 | |
1084 Identify somatic SVs from input: | |
1085 CREST.pl -f somatic.cover -d tumor.bam -g germline.bam | |
1086 Identify SVs from a single bam compared wtih reference genome: | |
1087 CREST.pl -f sclip.cover -d sample.bam | |
1088 Identify SVs from a single bam on chr1 only | |
1089 CREST.pl -f sclip.cover -d sample.bam -r chr1 | |
1090 | |
1091 =head1 REQUIRED ARGUMENTS | |
1092 | |
1093 To run the program, several parameter must specified. | |
1094 -d, --input_d: The input (diagnositic) bam file | |
1095 -f, --sclipfile: The soft clipping information generated by extractSClip.pl | |
1096 --ref_genome: The reference genome file in fa format | |
1097 -t, --target_genome: The 2bit genome file used by blat server and blat | |
1098 --blatserver: The blat server name or ip address | |
1099 --blatport: The blat server port | |
1100 | |
1101 =head1 OPTIONS | |
1102 | |
1103 The options that can be used for the program. | |
1104 -g, --input_g The germline (paired) bam file | |
1105 -p, --prefix The output prefix, default is the input bam file name | |
1106 -o, --out_dir Output directory, default is the working directory | |
1107 -l, --read_len The read length of the sequencing data, defaut 100 | |
1108 --sensitive The program will generate more SVs with higher false positive rate. | |
1109 | |
1110 --(no)scratch Use the scratch space, default is off. | |
1111 --(no)paired Use paired reads or not, defafult is on, so change to --nopaired for unpaired reads. | |
1112 --cap3 CAP3 executable position, default cap3 | |
1113 --cap3opt CAP3 options, single quoted, Default ' > /dev/null' | |
1114 --blatclient gfClient excutable position, default gfClient | |
1115 --blatclientopt gfClient options, single quoted, default '-out=psl -nohead' | |
1116 --blat blat executable potion, default balt | |
1117 --blatopt blat options, single quoted, default '-tileSize=7 -stepSize=1 -out=psl -minScore=15' | |
1118 | |
1119 -r, --range The range where SV will be detected, using chr1:100-200 format. | |
1120 --max_score_diff The maximum score difference when stopping select hit, default 10. | |
1121 --min_sclip_reads Minimum number of soft clipping read to triger the procedure, default 3. | |
1122 --max_rep_cover The min number of coverage to be called as repetitive and don't triger | |
1123 the procedure, default 500. | |
1124 --min_sclip_len The min length of soft clipping part at a position to triger the detection, | |
1125 default 20. | |
1126 --min_hit_len Min length of a hit for genome mapping | |
1127 --min_dist_diff Min distance between the mapped position and the soft clipping position, default 20. | |
1128 --(no)rmdup Remove PCR dumplicate. Default remove. | |
1129 | |
1130 --min_percent_id Min percentage of identity of soft clipping read mapping, default 90 | |
1131 --min_percent_hq Min percentage of high quality base in soft clipping reads, default 80 | |
1132 --lowqual_cutoff Low quality cutoff value, default 20. | |
1133 | |
1134 -h, --help, -? Help information | |
1135 --man Man page. | |
1136 --usage Usage information. | |
1137 --version Software version. | |
1138 | |
1139 --(no)rm_tandem_repeat Remove tandem repeat caused SV events, default is ON. When it's on ptrfinder program | |
1140 need to be on the path. | |
1141 --tr_max_indel_size Maximum tandem repeat mediated INDEL events, default 100 | |
1142 --tr_min_size Minimum tandem reapet size, default 2 | |
1143 --tr_max_size Maximum tandem repeat size, default 8 | |
1144 --tr_min_num Minimum tandem repeat number, defaut 4 | |
1145 --min_percent_cons_of_read Minimum percent of consensus length of read length, default 0.75 | |
1146 --max_bp_dist Maximum distance between two idenfitifed break points, default 15 | |
1147 --germline_seq_width Half window width of genomic sequence around break point for germline SV filtering, | |
1148 default 100 | |
1149 --germline_search_width Half window width for seaching soft-clipped reads around breakpoint for germline SV | |
1150 filtering, default 50. | |
1151 | |
1152 --hetero_factor The factor about the SV's heterogenirity and heterozygosity, default 0.4. | |
1153 --triger_p_value The p-value that will triger the SV detection when number of soft-clipped reads is small, | |
1154 defaut 0.05. | |
1155 | |
1156 --(no)rescue Use rescue mode, when it's on, the a SV with only 1 side with enough soft-clipped reads | |
1157 is considered as a valid one instead of rejecting it. Default on. | |
1158 --min_one_side_reads When rescure mode is on, the minimum number of soft-clipped reads on one side, default 5. | |
1159 | |
1160 --RNASeq RNAseq mode, default off | |
1161 --genemodel A gene model file, currently only refFlat format (BED) is supported. Requried for RNASeq. | |
1162 --cluster_size The soft-clipped reads within cluster_size will be considered together, default is 3, RNAseq mode | |
1163 only. | |
1164 | |
1165 | |
1166 =head1 DESCRIPTION | |
1167 | |
1168 This is a program designed to identify Structure Variations (SVs) using soft | |
1169 clipping reads. With the improvement of next-gen sequencing techniques, | |
1170 both the coverage and length of pair-end reads have been increased steady. | |
1171 Many SV detection software availalbe uses pair-end information due to the | |
1172 limitted read length. Now 100bp is pretty common and many reads will cross | |
1173 the break points. Some mapping programs ( bwa, etc ) have the ability to | |
1174 identify so called soft-clipping reads. A soft-clipping read is a read that | |
1175 different part can be mapped to different genomic posiiton, but the read can | |
1176 be uniquely positioned using the mate-pair position and the insert length. | |
1177 So this program use those reads to do an assembly-mapping-assembly-alignment | |
1178 procedure to identify potential structure variiations. | |
1179 | |
1180 | |
1181 =head1 DIAGNOSTICS | |
1182 | |
1183 A list of every error and warning message that the application can generate | |
1184 (even the ones that will "never happen"), with a full explanation of each | |
1185 problem, one or more likely causes, and any suggested remedies. If the | |
1186 application generates exit status codes (e.g., under Unix), then list the exit | |
1187 status associated with each error. | |
1188 | |
1189 =head1 CONFIGURATION AND ENVIRONMENT | |
1190 | |
1191 The program is designed to use under a cluster or high performance computing | |
1192 environment since it's dealing with over 100G input data. The program can be | |
1193 used as highly parallellized. And the program is developped under linux/unix. | |
1194 | |
1195 | |
1196 =head1 DEPENDENCIES | |
1197 | |
1198 The program depend on several packages: | |
1199 1. Bioperl perl module. | |
1200 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed. | |
1201 3. blat suits, include blat, gfClient, gfServer. | |
1202 4. CAP3 assembly program. | |
1203 | |
1204 | |
1205 =head1 BUGS AND LIMITATIONS | |
1206 | |
1207 There are no known bugs in this module, but the method is limitted to bam file | |
1208 that has soft-clipping cigar string generated.Please report problems to | |
1209 Jianmin Wang (Jianmin.Wang@stjude.org) | |
1210 Patches are welcome. | |
1211 | |
1212 =head1 AUTHOR | |
1213 | |
1214 Jianmin Wang (Jianmin.Wang@stjude.org) | |
1215 | |
1216 | |
1217 =head1 LICENCE AND COPYRIGHT | |
1218 | |
1219 Copyright (c) 2010 by St. Jude Children's Research Hospital. | |
1220 | |
1221 This program is free software: you can redistribute it and/or modify | |
1222 it under the terms of the GNU General Public License as published by | |
1223 the Free Software Foundation, either version 2 of the License, or | |
1224 (at your option) any later version. | |
1225 | |
1226 This program is distributed in the hope that it will be useful, | |
1227 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
1228 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
1229 GNU General Public License for more details. | |
1230 | |
1231 You should have received a copy of the GNU General Public License | |
1232 along with this program. If not, see <http://www.gnu.org/licenses/>. |