0
|
1 #!/usr/bin/perl -w
|
|
2 #use lib '/home/jwang2/AssembleTest/Detect/nonSJsrc/dev';
|
|
3 use strict;
|
|
4 use Carp;
|
|
5 use Getopt::Long;
|
|
6 use English;
|
|
7 use Pod::Usage;
|
|
8 use Data::Dumper;
|
|
9 use Bio::DB::Sam;
|
|
10 use Bio::DB::Sam::Constants;
|
|
11 use Bio::SearchIO;
|
|
12 use Bio::SeqIO;
|
|
13 use File::Temp qw/ tempfile tempdir /;
|
|
14 use File::Spec;
|
|
15 use File::Path;
|
|
16 use File::Copy;
|
|
17 use File::Basename;
|
|
18 use List::MoreUtils qw/ uniq /;
|
|
19 use Cwd;
|
|
20 use SCValidator qw($lowqual_cutoff $min_percent_id $min_percent_hq LEFT_CLIP RIGHT_CLIP);
|
|
21 use SVUtil qw($use_scratch $work_dir clip_fa_file prepare_reads parse_range get_input_bam
|
|
22 find_smallest_cover get_work_dir is_PCR_dup read_fa_file get_sclip_reads);
|
|
23 use StructVar;
|
|
24 require SVExtTools;
|
|
25
|
|
26 use Transcript;
|
|
27 use Gene;
|
|
28 use GeneModel;
|
|
29
|
|
30 my $debug = 1;
|
|
31 $min_percent_id = 90;
|
|
32 # input/output
|
|
33 my ( $out_dir, $out_prefix, $range, $bam_d, $bam_g, $sclip_file);
|
|
34 my $out_suffix = "predSV.txt";
|
|
35 my $read_len = 100;
|
|
36 my $ref_genome ;
|
|
37
|
|
38 my $hetero_factor = 0.4;
|
|
39 my $triger_p_value = 0.05;
|
|
40
|
|
41 #RNASeq support
|
|
42 my $RNASeq = 0;
|
|
43 my $gene_model_file;
|
|
44 my $gm_format = "REFFLAT";
|
|
45 my $gm;
|
|
46 my $max_sclip_reads = 50; # we will consider the position if we have enough sclipped reads
|
|
47 my $min_one_side_reads = 5;
|
|
48 my $min_pct_sclip_reads = 5; # we require at least 5% reads have softclipping
|
|
49
|
|
50 # external programs related variable
|
|
51 # 1. cap3 related variables
|
|
52 my $cap3 = "cap3";
|
|
53 my $cap3_options=" -h 70 -y 10 > /dev/null";
|
|
54
|
|
55 # 2 blat related variables, using blat server and blat standalone
|
|
56 my $blat_client_exe = "gfClient";
|
|
57 my $blat_client_options = ' -out=psl -nohead -minIdentity=95 -maxIntron=5';
|
|
58 my $target_genome = "hg18.2bit";
|
|
59 my $dir_2bit = "/";
|
|
60 my $blat_server = "sjblat";
|
|
61 my $blat_port = 50000;
|
|
62 my $blat_exe = "blat";
|
|
63 my $blat_options = " -tileSize=7 -stepSize=1 -out=psl -minScore=15 -noHead -maxIntron=1 ";
|
|
64
|
|
65 $use_scratch = 0;
|
|
66 my $paired = 1;
|
|
67
|
|
68 my $rescue = 1;
|
|
69 # other options
|
|
70 my ($min_sclip_reads, $max_repetitive_cover, $min_sclip_len );
|
|
71 my $min_hit_reads;
|
|
72 my $rmdup;
|
|
73 my $min_clip_len;
|
|
74 my ($max_score_diff, $max_num_hits);
|
|
75 my ($min_dist_diff, $min_hit_len) = (15, 15);
|
|
76
|
|
77 #SV filter related parameters
|
|
78 my $max_bp_dist = 15;
|
|
79 my $min_percent_cons_of_read = 0.75;
|
|
80 my $germline_seq_width = 100;
|
|
81 my $germline_search_width = 50;
|
|
82 my $rm_tandem_repeat = 1; #tandem repeat mediated events
|
|
83 my $tr_max_indel_size = 100; #tandem repeat mediated indel size
|
|
84 my $tr_min_size = 2;
|
|
85 my $tr_max_size = 8; #tandem repeat max_size of single repeat
|
|
86 my $tr_min_num = 4; #tandem repeat minimum number of occurence
|
|
87
|
|
88 # verification pupuse
|
|
89 my $verify_file;
|
|
90 my $sensitive;
|
|
91 my $cluster_size;
|
|
92
|
|
93 # common help options
|
|
94 my ( $help, $man, $version, $usage );
|
|
95 my $optionOK = GetOptions(
|
|
96 # SV validation related parameters
|
|
97 'max_bp_dist=i' => \$max_bp_dist,
|
|
98 'min_percent_cons_of_read=f' => \$min_percent_cons_of_read,
|
|
99 'germline_seq_width=i' => \$germline_seq_width,
|
|
100 'germline_search_width=i' => \$germline_search_width,
|
|
101 # tandem repeat related parameters
|
|
102 'rm_tandem_repeat!' => \$rm_tandem_repeat,
|
|
103 'tr_max_indel_size=i' => \$tr_max_indel_size,
|
|
104 'tr_min_size=i' => \$tr_min_size,
|
|
105 'tr_max_size=i' => \$tr_max_size,
|
|
106 'tr_min_num=i' => \$tr_min_num,
|
|
107 'hetero_factor=f' => \$hetero_factor,
|
|
108 'triger_p_value=f' => \$triger_p_value,
|
|
109
|
|
110 # input/output
|
|
111 #'s|sample=s' => \$sample,
|
|
112 'd|input_d=s' => \$bam_d,
|
|
113 'g|inpug_g=s' => \$bam_g,
|
|
114 'f|sclipfile=s' => \$sclip_file,
|
|
115 'p|prefix=s' => \$out_prefix,
|
|
116 'o|out_dir=s' => \$out_dir,
|
|
117 'l|read_len=i' => \$read_len,
|
|
118 'ref_genome=s' => \$ref_genome,
|
|
119 'v|verify_file=s' => \$verify_file,
|
|
120 'sensitive' => \$sensitive,
|
|
121 'paired!' => \$paired,
|
|
122 'rescue!' => \$rescue,
|
|
123 'cluster_size=i' => \$cluster_size,
|
|
124 # external programs location and options
|
|
125 'scratch!' => \$use_scratch,
|
|
126 'cap3=s' => \$cap3,
|
|
127 'cap3opt=s' => \$cap3_options,
|
|
128 'blatclient=s' => \$blat_client_exe,
|
|
129 'blatclientopt=s' => \$blat_client_options,
|
|
130 'blat=s' => \$blat_exe,
|
|
131 't|target_genome=s' => \$target_genome,
|
|
132 'blatopt=s' => \$blat_options,
|
|
133 'blatserver=s' => \$blat_server,
|
|
134 'blatport=i' => \$blat_port,
|
|
135 '2bitdir=s' => \$dir_2bit,
|
|
136 #RNAseq support
|
|
137 'RNASeq' => \$RNASeq,
|
|
138 'genemodel=s' => \$gene_model_file,
|
|
139 'gmformat=s' => \$gm_format,
|
|
140 #other related parameters
|
|
141 'r|range=s' => \$range,
|
|
142 'max_score_diff=i' => \$max_score_diff,
|
|
143 'm|min_sclip_reads=i' => \$min_sclip_reads,
|
|
144 'min_one_side_reads=i' => \$min_one_side_reads,
|
|
145 'max_rep_cover=i' => \$max_repetitive_cover,
|
|
146 'min_sclip_len=i' => \$min_sclip_len,
|
|
147 'min_hit_len=i' => \$min_hit_len,
|
|
148 'min_dist_diff=i' => \$min_dist_diff,
|
|
149 'min_hit_reads=i' => \$min_hit_reads,
|
|
150 'rmdup!' => \$rmdup,
|
|
151
|
|
152 # soft_clipping related parameters (from SVDector package)
|
|
153 'min_percent_id=i' => \$min_percent_id,
|
|
154 'min_percent_hq=i' => \$SCValidator::min_percent_hq,
|
|
155 'lowqual_cutoff=i' => \$lowqual_cutoff,
|
|
156
|
|
157 # common help parameters
|
|
158 'h|help|?' => \$help,
|
|
159 'man' => \$man,
|
|
160 'usage' => \$usage,
|
|
161 'version' => \$version,
|
|
162 );
|
|
163
|
|
164 pod2usage(-verbose=>2) if($man or $usage);
|
|
165 pod2usage(1) if($help or $version );
|
|
166
|
|
167 my $start_dir = getcwd;
|
|
168
|
|
169 # figure out input file
|
|
170 if(!$bam_d) {
|
|
171 pod2usage(1);
|
|
172 croak "You need specify input bam file(s)";
|
|
173 }
|
|
174
|
|
175 if(!$sclip_file) {
|
|
176 pod2usage(1);
|
|
177 croak "You need to specify the softclipping file";
|
|
178 }
|
|
179 $sclip_file = File::Spec->rel2abs($sclip_file);
|
|
180
|
|
181 if(!$ref_genome) {
|
|
182 pod2usage(1);
|
|
183 croak "You need to specify the reference genome used by bam file";
|
|
184 }
|
|
185
|
|
186 croak "The file you sepcified does not exist" unless (
|
|
187 -e $sclip_file && -e $bam_d && -e $ref_genome && -e $target_genome);
|
|
188 my $input_base;
|
|
189 $bam_d = File::Spec->rel2abs($bam_d);
|
|
190 $input_base = fileparse($bam_d);
|
|
191 $bam_g = File::Spec->rel2abs($bam_g) if($bam_g);
|
|
192
|
|
193 #RNASeq support
|
|
194 if($RNASeq) {
|
|
195 croak "You need specify the input gene model file" unless ($gene_model_file);
|
|
196 StructVar->add_RNASeq_filter();
|
|
197 $min_sclip_reads = 10 unless $min_sclip_reads;
|
|
198 $max_repetitive_cover = 5000 unless $max_repetitive_cover;
|
|
199 $min_sclip_len = 20 unless $min_clip_len;
|
|
200 $max_score_diff = 5 unless $max_score_diff;
|
|
201 $max_num_hits = 3 unless $max_num_hits;
|
|
202 $min_hit_reads = 5 unless $min_hit_reads;
|
|
203 $cluster_size = 5 unless $cluster_size;
|
|
204 }
|
|
205 else {
|
|
206 $min_sclip_reads = 3 unless $min_sclip_reads;
|
|
207 $max_repetitive_cover = 500 unless $max_repetitive_cover;
|
|
208 $min_sclip_len = 20 unless $min_clip_len;
|
|
209 $max_score_diff = 5 unless $max_score_diff;
|
|
210 $max_num_hits = 10 unless $max_num_hits;
|
|
211 $min_hit_reads = 1 unless $min_hit_reads;
|
|
212 $cluster_size = 1 unless $cluster_size;
|
|
213 }
|
|
214
|
|
215 if($gene_model_file) {
|
|
216 $gm = GeneModel->new if($gene_model_file);
|
|
217 $gm->from_file($gene_model_file, $gm_format);
|
|
218 StructVar->gene_model($gm);
|
|
219 }
|
|
220
|
|
221 # set up the external programs and validators
|
|
222 # Those variable will be global
|
|
223 my $assembler = Assembler->new(
|
|
224 -PRG => $cap3,
|
|
225 -OPTIONS => $cap3_options
|
|
226 );
|
|
227
|
|
228 my $mapper = Mapper->new(
|
|
229 -PRG => join(' ', ($blat_client_exe, $blat_server, $blat_port)),
|
|
230 -OPTIONS => $blat_client_options,
|
|
231 -BIT2_DIR => $dir_2bit,
|
|
232 -MAX_SCORE_DIFF => $max_score_diff,
|
|
233 -MAX_NUM_HITS => $max_num_hits,
|
|
234 );
|
|
235
|
|
236 my $aligner = Aligner->new(
|
|
237 -PRG => $blat_exe,
|
|
238 -OPTIONS => $blat_options,
|
|
239 );
|
|
240
|
|
241 StructVar->assembler($assembler);
|
|
242 StructVar->read_len($read_len);
|
|
243 StructVar->aligner($aligner);
|
|
244 StructVar->mapper($mapper);
|
|
245 StructVar->RNASeq(1) if($RNASeq);
|
|
246 StructVar->genome($target_genome);
|
|
247 StructVar->remove_filter("tandem_repeat") unless($rm_tandem_repeat);
|
|
248 StructVar->tr_max_indel_size($tr_max_indel_size);
|
|
249 StructVar->tr_min_size($tr_min_size);
|
|
250 StructVar->tr_max_size($tr_max_size);
|
|
251 StructVar->tr_min_num($tr_min_num);
|
|
252 StructVar->max_bp_dist($max_bp_dist) if($max_bp_dist);
|
|
253 StructVar->germline_seq_width($germline_seq_width) if($germline_seq_width);
|
|
254 StructVar->germline_search_width($germline_search_width) if($germline_search_width);
|
|
255
|
|
256
|
|
257 my $validator = SCValidator->new();
|
|
258 $validator->remove_validator('strand_validator') if(!$paired);
|
|
259
|
|
260 #setup output and working directory
|
|
261 $out_dir = getcwd if(!$out_dir);
|
|
262 mkdir $out_dir if(!-e $out_dir || ! -d $out_dir);
|
|
263 $work_dir = get_work_dir(-SCRATCH => $use_scratch);
|
|
264 $work_dir = $out_dir if(!$work_dir);
|
|
265 $use_scratch = undef if($work_dir eq $out_dir); # don't erase the out_dir
|
|
266 chdir($work_dir);
|
|
267
|
|
268 # figure out output prefix
|
|
269 if(!$out_prefix) {
|
|
270 $out_prefix = $input_base;
|
|
271 }
|
|
272
|
|
273 my $sam_d = Bio::DB::Sam->new( -bam => $bam_d, -fasta => $ref_genome);
|
|
274 StructVar->sam_d($sam_d);
|
|
275 my $sam_g;
|
|
276 if($bam_g) {
|
|
277 $sam_g = Bio::DB::Sam->new( -bam => $bam_g);
|
|
278 StructVar->sam_g($sam_g);
|
|
279 }
|
|
280
|
|
281 #my $output_file = File::Spec->catfile($outdir, $out_prefix . $out_suffix);
|
|
282 my $output_file;
|
|
283 $output_file = join('.', $out_prefix, $out_suffix);
|
|
284 $output_file = File::Spec->catfile($out_dir, $output_file);
|
|
285
|
|
286 # the softclip file is sorted, so no need to re-sort it
|
|
287 open my $SCLIP, "<$sclip_file" or croak "can't open $sclip_file:$OS_ERROR";
|
|
288 my %sclip;
|
|
289 my $pre_p;
|
|
290 while( my $line = <$SCLIP> ) {
|
|
291 chomp $line;
|
|
292 my ($chr, $pos, $ort, $cover, $C) = split /\t/, $line;
|
|
293 if( ! exists($sclip{$chr})) {
|
|
294 $sclip{$chr} = [];
|
|
295 $pre_p = $pos;
|
|
296 }
|
|
297 $C = 30 unless $C;
|
|
298 if($pos < $pre_p) {
|
|
299 print STDERR "The input file is not sorted!";
|
|
300 exit(1);
|
|
301 }
|
|
302 push @{$sclip{$chr}}, [$pos, $ort, $cover, $C];
|
|
303 }
|
|
304 close $SCLIP;
|
|
305
|
|
306 my @final_SV;
|
|
307 my @s_cover = @{find_smallest_cover($min_sclip_reads, $hetero_factor, $triger_p_value)};
|
|
308 if($range) {
|
|
309 @final_SV = detect_range_SVs($sam_d, $range, \%sclip, \@s_cover);
|
|
310 }
|
|
311 else {
|
|
312 foreach my $chr (keys %sclip) {
|
|
313 my @tmpSV = detect_range_SVs($sam_d, $chr, \%sclip, \@s_cover);
|
|
314 @final_SV = @tmpSV unless (@final_SV);
|
|
315 foreach my $sv (@tmpSV) {
|
|
316 push @final_SV, $sv if($sv && ! is_dup_SV(\@final_SV, $sv));
|
|
317 }
|
|
318 }
|
|
319 }
|
|
320 undef %sclip;
|
|
321 open my $OUT, ">$output_file" or croak "can't open $output_file:$OS_ERROR";
|
|
322 foreach my $SV (@final_SV) {
|
|
323 if($SV->filter) {
|
|
324 print $OUT $SV->to_string ;
|
|
325 if($RNASeq) {
|
|
326 print $OUT "\t", join("\t", @{$SV->get_genes});
|
|
327 }
|
|
328 print $OUT "\n";
|
|
329 }
|
|
330 }
|
|
331 chdir $start_dir;
|
|
332 exit(0);
|
|
333
|
|
334 sub detect_range_SVs {
|
|
335 my ($sam_d, $range, $r_sclips, $r_scover) = @_;
|
|
336 my ($chr, $start, $end) = parse_range($range);
|
|
337 my ($tid) = $sam_d->header->parse_region($chr);
|
|
338 my $chr_len = $sam_d->header->target_len->[$tid];
|
|
339 my $r_range_sclips = search_sclip($r_sclips, $range);
|
|
340 return unless $r_range_sclips;
|
|
341 my @scover = @{$r_scover};
|
|
342 my @rtn;
|
|
343
|
|
344 my ($f_tree, $r_tree);
|
|
345 if($RNASeq) {
|
|
346 my $full_chr = $chr;
|
|
347 my ($gm_chr) = $gm->get_all_chr;
|
|
348 $full_chr = "chr" . $chr if($chr !~ m/chr/ && $gm_chr =~ /chr/);
|
|
349 ($f_tree, $r_tree) = ($gm->sub_model($full_chr, "+"), $gm->sub_model($full_chr, "-")) if($RNASeq);
|
|
350 }
|
|
351 my $n = scalar @{$r_range_sclips};
|
|
352 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '+', 1, 50];
|
|
353 push @{$r_range_sclips}, [$r_range_sclips->[$n-1][0] + $cluster_size + 1, '-', 1, 50];
|
|
354 $n = $n + 2;
|
|
355
|
|
356 # try to save the 1 base off problem of soft-clipping
|
|
357 my (@ss1, $p1, $c1);
|
|
358 my (@ss2, $p2, $c2);
|
|
359 my ($n1, $n2, $cover1, $cover2) = (0, 0, 0, 0);
|
|
360 for( my $i = 0; $i < $n; $i++) {
|
|
361 my $s = $r_range_sclips->[$i];
|
|
362 # print join("\t", @{$s}), "\n" if($debug);
|
|
363 my $clip = $s->[1] eq '+' ? RIGHT_CLIP : LEFT_CLIP;
|
|
364 next if($s->[0] >= $chr_len );
|
|
365 if($clip == RIGHT_CLIP) {
|
|
366 $p1 = $c1 = $s->[0] unless $p1;
|
|
367 if($s->[0] < $c1) {
|
|
368 print STDERR "The soft-clipping file has problem! It's either not sorted or
|
|
369 the file has mutliple parts for a chromsome!";
|
|
370 last;
|
|
371 }
|
|
372 if($s->[0] - $c1 < $cluster_size) {
|
|
373 $n1 += $s->[2];
|
|
374 $cover1 = $s->[3] if($cover1 < $s->[3]);
|
|
375 push @ss1, $s;
|
|
376 if($s->[0] < $c1) {
|
|
377 $p1 += $s->[0];
|
|
378 $c1 = $p1 / scalar @ss1;
|
|
379 }
|
|
380 next;
|
|
381 }
|
|
382 }
|
|
383 else {
|
|
384 $p2 = $c2 = $s->[0] unless $p2;
|
|
385 if($s->[0] < $c2) {
|
|
386 print STDERR "The soft-clipping file has problem! It's either not sorted or
|
|
387 the file has mutliple parts for a chromsome!";
|
|
388 last;
|
|
389 }
|
|
390 if($s->[0] - $p2 < $cluster_size) {
|
|
391 $n2 += $s->[2];
|
|
392 $cover2 = $s->[3] if($cover2 < $s->[3]);
|
|
393 push @ss2, $s;
|
|
394 if($s->[0] < $c2) {
|
|
395 $p2 += $s->[0];
|
|
396 $c2 = $p2 / scalar @ss2;
|
|
397 }
|
|
398 next;
|
|
399 }
|
|
400 }
|
|
401
|
|
402 my @cc = $clip == LEFT_CLIP ? @ss2 : @ss1;
|
|
403 my $pos = $clip == LEFT_CLIP ? $ss2[$#ss2]->[0] : $ss1[0]->[0];
|
|
404 my ($n_s, $cover_s) = $clip == LEFT_CLIP ? ($n2, $cover2) : ($n1, $cover1);
|
|
405 my $tmp_range = $clip == LEFT_CLIP ? [$ss2[0]->[0], $ss2[$#ss2]->[0] + $cluster_size] :
|
|
406 [$ss1[0]->[0] - $cluster_size, $ss1[$#ss1]->[0]];
|
|
407
|
|
408 if($clip == LEFT_CLIP) {
|
|
409 @ss2 = (); $p2 = $c2 = $s->[0];
|
|
410 $n2 = $s->[2];
|
|
411 $cover2 = $s->[3];
|
|
412 push @ss2, $s;
|
|
413 }
|
|
414 else {
|
|
415 @ss1 = (); $p1 = $c1 = $s->[0];
|
|
416 $n1 = $s->[2];
|
|
417 $cover1 = $s->[3];
|
|
418 push @ss1, $s;
|
|
419 }
|
|
420
|
|
421 my @s_reads;
|
|
422 if($RNASeq) {
|
|
423 # print $n_s, "\n";
|
|
424 next if($n_s < $min_sclip_reads && $cover_s > $s_cover[$n_s] ) ;
|
|
425 next if($n_s < $max_sclip_reads && $n_s * 100 <= $cover_s * $min_pct_sclip_reads);
|
|
426 my @f_genes = $f_tree->intersect($tmp_range);
|
|
427 my @r_genes = $r_tree->intersect($tmp_range);
|
|
428 next if(scalar @f_genes == 0 && scalar @r_genes == 0);
|
|
429 }
|
|
430 else {
|
|
431 if($n_s < $min_sclip_reads) { #too few covered
|
|
432 if($cover_s > $s_cover[$n_s]) {
|
|
433 next if(!$sensitive);
|
|
434 foreach my $c (@cc) {
|
|
435 next if($c->[0] >= $chr_len);
|
|
436 push @s_reads, get_sclip_reads(-SAM => $sam_d,
|
|
437 -CHR =>$chr,
|
|
438 -START => $c->[0],
|
|
439 -END => $c->[0],
|
|
440 -CLIP => $clip,
|
|
441 -MINCLIPLEN => 0,
|
|
442 -VALIDATOR => $validator,
|
|
443 -PAIRED => $paired,
|
|
444 -RMDUP => $rmdup,
|
|
445 -EXTRA => abs($c->[0] - $pos) );
|
|
446 }
|
|
447 }
|
|
448 }
|
|
449 }
|
|
450 if(! @s_reads) {
|
|
451 foreach my $c (@cc) {
|
|
452 next if($c->[0] >= $chr_len);
|
|
453 push @s_reads, get_sclip_reads(-SAM => $sam_d,
|
|
454 -CHR =>$chr,
|
|
455 -START => $c->[0],
|
|
456 -END => $c->[0],
|
|
457 -CLIP => $clip,
|
|
458 -MINCLIPLEN => 0,
|
|
459 -VALIDATOR => $validator,
|
|
460 -PAIRED => $paired,
|
|
461 -RMDUP => $rmdup,
|
|
462 -EXTRA => abs($c->[0] - $pos) );
|
|
463 }
|
|
464 }
|
|
465 my $l = 0;
|
|
466 foreach my $r (@s_reads) {
|
|
467 my $len = length $r->{seq};
|
|
468 $l = $len if($l < $len);
|
|
469 }
|
|
470 next if ($l < $min_sclip_len);
|
|
471 print STDERR join("\t", ($chr, $pos, $clip == LEFT_CLIP ? "-" : "+", scalar @s_reads)), "\n";
|
|
472 my @SV = detect_SV(
|
|
473 -SAM => $sam_d,
|
|
474 -CHR => $chr,
|
|
475 -POS => $pos,
|
|
476 -ORT => $clip == LEFT_CLIP ? "-" : "+",
|
|
477 -SCLIP => \%sclip,
|
|
478 -COVER => $cover_s,
|
|
479 -SREADS => \@s_reads);
|
|
480 foreach my $sv (@SV) {
|
|
481 $sv->{type} = $sv->type;
|
|
482 push @rtn, $sv if($sv && ! is_dup_SV(\@rtn, $sv));
|
|
483 }
|
|
484 }
|
|
485
|
|
486 if($RNASeq) {
|
|
487 push @rtn, find_del_SVs($sam_d, $chr, $start, $end);
|
|
488 }
|
|
489 return @rtn;
|
|
490 }
|
|
491
|
|
492 sub is_dup_SV {
|
|
493 my($r_SVs, $sv) = @_;
|
|
494 foreach my $s (@{$r_SVs}) {
|
|
495 return 1
|
|
496 if( $s->first_bp->{pos} == $sv->second_bp->{pos} &&
|
|
497 $s->first_bp->{chr} eq $sv->second_bp->{chr} &&
|
|
498 $s->second_bp->{pos} == $sv->first_bp->{pos} &&
|
|
499 $s->second_bp->{chr} eq $sv->first_bp->{chr} &&
|
|
500 $s->{type} == $sv->{type} );
|
|
501 return 1
|
|
502 if( $s->first_bp->{pos} == $sv->first_bp->{pos} &&
|
|
503 $s->first_bp->{chr} eq $sv->first_bp->{chr} &&
|
|
504 $s->second_bp->{pos} == $sv->second_bp->{pos} &&
|
|
505 $s->second_bp->{chr} eq $sv->second_bp->{chr} &&
|
|
506 $s->{type} == $sv->{type} );
|
|
507
|
|
508 }
|
|
509 return;
|
|
510 }
|
|
511
|
|
512 sub search_sclip {
|
|
513 my ($r_sclip, $range) = @_;
|
|
514 my ($chr, $start, $end) = parse_range($range);
|
|
515 return $r_sclip->{$chr} if(!$start);
|
|
516 my $start_i = bin_search($r_sclip->{$chr}, $start);
|
|
517 my $end_i = bin_search($r_sclip->{$chr}, $end);
|
|
518 my @tmp = @{$r_sclip->{$chr}};
|
|
519 if($start_i <= $end_i) {
|
|
520 @tmp = @tmp[$start_i .. $end_i];
|
|
521 if($start_i == $end_i) {
|
|
522 return undef if($tmp[0]->[0] < $start || $tmp[0]->[0] > $end);
|
|
523 }
|
|
524 return \@tmp;
|
|
525 }
|
|
526 else {
|
|
527 return undef;
|
|
528 }
|
|
529 }
|
|
530
|
|
531 sub bin_search {
|
|
532 my ($a, $p) = @_;
|
|
533 my ($s, $e) = (0, scalar(@{$a})-1);
|
|
534 my $m = int( ($s + $e)/2);
|
|
535 while(1) {
|
|
536 return $s if($a->[$s][0] >= $p);
|
|
537 return $e if($a->[$e][0] <= $p);
|
|
538 return $m if($a->[$m][0] == $p || ($a->[$m-1][0] < $p && $a->[$m+1][0] > $p));
|
|
539 if($a->[$m][0] > $p) {
|
|
540 $e = $m;
|
|
541 }
|
|
542 else {
|
|
543 $s = $m;
|
|
544 }
|
|
545 $m = int( ($s+$e)/2 );
|
|
546 }
|
|
547 }
|
|
548
|
|
549 sub count_coverage {
|
|
550 my ($sam, $chr, $pos, $clip) = @_;
|
|
551 if($rmdup && !$RNASeq) {
|
|
552 my @pairs;
|
|
553 my $seg = $sam->segment(-seq_id => $chr, -start => $pos, -end => $pos);
|
|
554 my $n = 0;
|
|
555 return 0 unless $seg;
|
|
556 my $itr = $seg->features(-iterator => 1);
|
|
557 while( my $a = $itr->next_seq) {
|
|
558 next unless($a->start && $a->end); #why unmapped reads here?
|
|
559 my $sclip_len = 0;
|
|
560 if($clip) {
|
|
561 my @cigar_array = @{$a->cigar_array};
|
|
562 #$sclip_len = $1 if($a->cigar_str =~ m/S(\d+)$/ && $clip == RIGHT_CLIP);
|
|
563 $sclip_len = $cigar_array[0]->[1] if($cigar_array[0]->[0] eq 'S' && $clip == RIGHT_CLIP);
|
|
564 #$sclip_len = $1 if($a->cigar_str =~ m/^S(\d+)/ && $clip == LEFT_CLIP);
|
|
565 $sclip_len = $cigar_array[$#cigar_array]->[1] if($cigar_array[$#cigar_array]->[0] eq 'S' && $clip == RIGHT_CLIP);
|
|
566 }
|
|
567 next if(@pairs > 0 && is_PCR_dup($a, \@pairs, $sclip_len));
|
|
568 $n++;
|
|
569 return $n if( $n > $max_repetitive_cover);
|
|
570 push @pairs, [$a->start, $a->end, $a->mate_start ? $a->mate_start : 0,
|
|
571 $a->mate_end ? $a->mate_end : 0, $sclip_len];
|
|
572 }
|
|
573 return $n;
|
|
574 }
|
|
575 else{
|
|
576 my ($c) = $sam->features(-type => 'coverage', -seq_id=> $chr,
|
|
577 -start => $pos, -end => $pos);
|
|
578 return 0 unless $c;
|
|
579 my @c_d = $c->coverage;
|
|
580 return $c_d[0];
|
|
581 }
|
|
582 }
|
|
583
|
|
584 sub detect_SV {
|
|
585 my %param = @_;
|
|
586 my @s_reads = @{$param{-SREADS}};
|
|
587 my($sam, $chr, $ort, $r_sclip, $coverage) = ( $param{-SAM},
|
|
588 $param{-CHR}, $param{-ORT}, $param{-SCLIP}, $param{-COVER});
|
|
589 my $clip = $ort eq '+' ? RIGHT_CLIP : LEFT_CLIP;
|
|
590 return if($coverage > $max_repetitive_cover);
|
|
591 my $fa_name = prepare_reads(\@s_reads, $clip);
|
|
592 my($contig_file, $sclip_count, $contig_reads) = $assembler->run($fa_name);
|
|
593 return if(!$contig_file or !(-e $contig_file));
|
|
594
|
|
595 $contig_file = clip_fa_file($contig_file, $clip);
|
|
596 my $contig_seqs = read_fa_file($contig_file);
|
|
597 if(scalar @s_reads == 1) {
|
|
598 $contig_file = clip_fa_file($fa_name, $clip);
|
|
599 $contig_seqs = read_fa_file($contig_file);
|
|
600 my @seq_names = keys %{$contig_seqs};
|
|
601 $sclip_count->{$seq_names[0]} = 1;
|
|
602 }
|
|
603
|
|
604 my @SV;
|
|
605 my $where = ($clip == LEFT_CLIP ? "right" : "left");
|
|
606 my $mapping = $mapper->run( -QUERY => $contig_file );
|
|
607 my (%reads, %quals, %s_lens, %pos);
|
|
608 foreach my $r (@s_reads) {
|
|
609 $reads{$r->{name}} = $r->{full_seq};
|
|
610 $quals{$r->{name}} = $r->{full_qual};
|
|
611 $s_lens{$r->{name}} = length($r->{seq});
|
|
612 $pos{$r->{name}} = $r->{pos};
|
|
613 }
|
|
614
|
|
615 foreach my $s (keys(%{$mapping})) {
|
|
616 next if($sclip_count->{$s} < $min_sclip_reads);
|
|
617
|
|
618 # try to find a better mapping
|
|
619 my @tmp_reads;
|
|
620 my $selected_read;
|
|
621 my @tmp_pos;
|
|
622 foreach my $n (@{$contig_reads->{$s}}) {
|
|
623 push @tmp_pos, $pos{$n};
|
|
624 }
|
|
625 @tmp_pos = uniq @tmp_pos;
|
|
626 my $real_pos = $clip == LEFT_CLIP ? $tmp_pos[$#tmp_pos] : $tmp_pos[0];
|
|
627
|
|
628 my $n_new_SV = 0;
|
|
629 my $tmp_bp;
|
|
630 foreach my $t (@{$mapping->{$s}}) {
|
|
631 my $qort = $t->{qstrand};
|
|
632 my $t_pos = $t->{tstart};
|
|
633 $t_pos = $t->{tend} if( ($qort eq "+" && $clip == LEFT_CLIP) ||
|
|
634 ($qort eq '-' && $clip == RIGHT_CLIP));
|
|
635 if($t->{tchr} eq $chr && (abs($t_pos - $real_pos) < $min_dist_diff)) { # the soft clipping is incorrect
|
|
636 @SV = @SV[0 .. ($#SV - $n_new_SV + 1)] if($n_new_SV > 0);
|
|
637 last;
|
|
638 }
|
|
639 my $first_bp = {};
|
|
640 $first_bp->{sc_reads} = $sclip_count->{$s};
|
|
641 $first_bp->{$where . "_chr"} = $chr;
|
|
642 $first_bp->{$where . "_pos"} = $real_pos;
|
|
643 $first_bp->{all_pos} = \@tmp_pos;
|
|
644 $first_bp->{$where . "_ort"} = "+";
|
|
645 $first_bp->{pos} = $real_pos;
|
|
646 $first_bp->{cover} = $coverage;
|
|
647 $first_bp->{chr} = $chr;
|
|
648 my $new_where = ($where eq "right" ? "left" : "right");
|
|
649
|
|
650 $first_bp->{$new_where . "_chr"} = $t->{tchr};
|
|
651 $first_bp->{$new_where . "_pos"} = $t_pos;
|
|
652 $first_bp->{$new_where . "_ort"} = $qort;
|
|
653 $first_bp->{sc_seq} = $contig_seqs->{$s};
|
|
654 $first_bp->{reads} = $contig_reads->{$s};
|
|
655
|
|
656 $tmp_bp = $first_bp unless $tmp_bp;
|
|
657 my $second_bp = check_sclip(-SAM => $sam, -TARGET => $t, -CHR => $chr,
|
|
658 -POS => $real_pos, -SCLIP => $r_sclip, -CLIP => $clip);
|
|
659 if(@{$second_bp}) {
|
|
660 foreach my $tmp_bp (@{$second_bp}) {
|
|
661 push @SV, StructVar->new(-FIRST_BP => $first_bp, -SECOND_BP => $tmp_bp);
|
|
662 $n_new_SV++;
|
|
663 }
|
|
664 }
|
|
665 }
|
|
666 # save some one side good soft-clipping SV
|
|
667 if( $rescue == 1 &&
|
|
668 $n_new_SV == 0 &&
|
|
669 $sclip_count->{$s} >= $min_one_side_reads &&
|
|
670 (scalar(@{$mapping->{$s}}) == 1 || ($mapping->{$s}[0]{perfect} == 1 && $mapping->{$s}->[1]{perfect} == 0)) &&
|
|
671 $tmp_bp->{chr} &&
|
|
672 length($contig_seqs->{$s}) * 0.95 < $mapping->{$s}[0]{matches} ) {
|
|
673
|
|
674 my %second_bp = %{$tmp_bp};
|
|
675 $second_bp{sc_reads} = 0;
|
|
676 $second_bp{sc_seq} = '';
|
|
677 ($second_bp{chr}, $second_bp{pos}) = $second_bp{pos} == $second_bp{left_pos} ?
|
|
678 ($second_bp{right_chr}, $second_bp{right_pos}) : ($second_bp{left_chr}, $second_bp{left_pos});
|
|
679 $second_bp{cover} = count_coverage($sam, $second_bp{chr}, $second_bp{pos});
|
|
680 $second_bp{reads} = [];
|
|
681 push @SV, StructVar->new(-FIRST_BP => $tmp_bp, -SECOND_BP => \%second_bp);
|
|
682 }
|
|
683
|
|
684 }
|
|
685 system("rm $fa_name"); system("rm $fa_name*");
|
|
686 return @SV;
|
|
687 }
|
|
688
|
|
689 sub get_gene_range {
|
|
690 my ($chr, $pos, $clip) = @_;
|
|
691 my $r = $clip == LEFT_CLIP ? [$pos, $pos+5] : [$pos - 5, $pos];
|
|
692 my @genes;
|
|
693 my $tmp = $chr;
|
|
694 $tmp = "chr" . $tmp if($tmp !~ m/chr/);
|
|
695 my ($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-"));
|
|
696 push @genes, $f_tree->intersect($r);
|
|
697 push @genes, $r_tree->intersect($r);
|
|
698 my ($s, $e);
|
|
699 if(scalar @genes == 0) { #no gene here
|
|
700 return [$pos - $read_len, $pos + $read_len];
|
|
701 }
|
|
702 foreach my $g (@genes) {
|
|
703 my $start = $g->val->get_start($pos, $read_len);
|
|
704 my $end = $g->val->get_end($pos, $read_len);
|
|
705 $s = $start if(!$s || $s > $start);
|
|
706 $e = $end if(!$e || $e < $end);
|
|
707 }
|
|
708 return [$s, $e];
|
|
709 }
|
|
710
|
|
711 sub check_sclip {
|
|
712 my %arg = @_;
|
|
713 my ($sam, $chr, $pos, $target, $r_sclip, $clip) =
|
|
714 ( $arg{-SAM}, $arg{-CHR}, $arg{-POS}, $arg{-TARGET}, $arg{-SCLIP}, $arg{-CLIP} );
|
|
715
|
|
716 my @bp;
|
|
717
|
|
718 # identify the searching region for partner soft cliping
|
|
719 my $extension;
|
|
720 if($RNASeq) {
|
|
721 $extension = 50;
|
|
722 }
|
|
723 else {
|
|
724 $extension = $read_len - ($target->{qend} - $target->{qstart});
|
|
725 }
|
|
726 my ($tpos, $ort) = ($target->{tend}, $target->{qstrand});
|
|
727 $tpos = $target->{tstart} if( ($clip == LEFT_CLIP && $ort eq '-')
|
|
728 || ($clip == RIGHT_CLIP && $ort eq '+'));
|
|
729 $extension = abs($tpos - $pos) - 1
|
|
730 if($chr eq $target->{tchr} && abs($tpos - $pos) <= $extension); # don't consider itself
|
|
731 my $range =$target->{tchr} . ":";
|
|
732 if($tpos < $extension) {
|
|
733 $range .= '1';
|
|
734 }
|
|
735 else{
|
|
736 $range .= $tpos - $extension;
|
|
737 }
|
|
738
|
|
739 $range .= "-"; $range .= $tpos + $extension;
|
|
740
|
|
741 # the orginal genome sequence where we find the soft_clipping
|
|
742 my $orig;
|
|
743 my $tmp = $chr;
|
|
744 #$tmp = 'chr' . $tmp if($tmp !~ m/^chr/);
|
|
745 $orig = $target_genome . ":" . $tmp . ":";
|
|
746 my $base_pos;
|
|
747 if($RNASeq) { #let's do a wild guess!
|
|
748 my $r = get_gene_range($tmp, $pos, $clip);
|
|
749 return \@bp unless $r;
|
|
750 $base_pos = $r->[0];
|
|
751 $orig = join("", ($orig, $r->[0], "-", $r->[1]));
|
|
752 }
|
|
753 else {
|
|
754 $base_pos = $pos - $read_len;
|
|
755 $orig = join("", ($orig, $pos - $read_len, "-", $pos + $read_len));
|
|
756 }
|
|
757
|
|
758 return \@bp if(!exists( $r_sclip->{$target->{tchr}})); #mapped to chrY or other minor chrs
|
|
759 # the soft clipping happens in highly repetitive region
|
|
760 # return \@bp if($coverage > $max_repetitive_cover);
|
|
761
|
|
762 my $r_pp = search_sclip($r_sclip, $range);
|
|
763 return \@bp if(!$r_pp);
|
|
764
|
|
765 my $real_pos;
|
|
766 my $found = 0;
|
|
767 my @r_reads;
|
|
768 my @l_reads;
|
|
769 foreach my $s (@{$r_pp}) {
|
|
770 #next if($s->[2] < $min_hit_reads);
|
|
771 next if($chr ne $target->{tchr} && (
|
|
772 ($clip == LEFT_CLIP && $ort ne $s->[1] ) ||
|
|
773 ($clip == RIGHT_CLIP && $ort eq $s->[1] )) );
|
|
774
|
|
775 next if($chr eq $target->{tchr} && ($s->[0] < $tpos - $extension
|
|
776 || $s->[0] > $tpos + $extension));
|
|
777 my $tort = $s->[1];
|
|
778 next if($s->[3] > $max_repetitive_cover);
|
|
779 my $tclip = $tort eq '+' ? RIGHT_CLIP : LEFT_CLIP;
|
|
780 my @reads = get_sclip_reads(
|
|
781 -SAM => $sam,
|
|
782 -CHR => $target->{tchr},
|
|
783 -START => $s->[0],
|
|
784 -END => $s->[0],
|
|
785 -CLIP => $tclip,
|
|
786 -MINCLIPLEN => $min_hit_len,
|
|
787 -VALIDATOR => $validator,
|
|
788 -PAIRED => $paired,
|
|
789 -RMDUP => $rmdup);
|
|
790 next if(!@reads || scalar(@reads) == 0);
|
|
791 # push @r_reads, @reads if($tclip == RIGHT_CLIP);
|
|
792 # push @l_reads, @reads if($tclip == LEFT_CLIP);
|
|
793 # }
|
|
794
|
|
795 # foreach my $tclip ( (RIGHT_CLIP, LEFT_CLIP)) {
|
|
796 # my $s_reads = $tclip == RIGHT_CLIP ? \@r_reads : \@l_reads;
|
|
797 # next if(scalar @{$s_reads} == 0);
|
|
798 my %count;
|
|
799 # my $fa_name = prepare_reads($s_reads, $tclip);
|
|
800 my $fa_name = prepare_reads(\@reads, $tclip);
|
|
801
|
|
802 my($contig_file, $sclip_count, $contig_reads, $singlet_file) = $assembler->run($fa_name);
|
|
803 my (%reads, %quals, %s_lens, %pos);
|
|
804 # foreach my $r (@{$s_reads}) {
|
|
805 foreach my $r (@reads) {
|
|
806 $reads{$r->{name}} = $r->{full_seq};
|
|
807 $quals{$r->{name}} = $r->{full_qual};
|
|
808 $s_lens{$r->{name}} = length($r->{seq});
|
|
809 $pos{$r->{name}} = $r->{pos};
|
|
810 }
|
|
811
|
|
812 $contig_file = clip_fa_file($contig_file, $tclip);
|
|
813 if( -s $singlet_file ) {
|
|
814 $singlet_file = clip_fa_file($singlet_file, $tclip);
|
|
815 system("cat $singlet_file >> $contig_file");
|
|
816 system("rm $singlet_file");
|
|
817 }
|
|
818 my $seqs = read_fa_file($contig_file);
|
|
819 my $hits = $aligner->run( -TARGET => $orig, -QUERY => $contig_file);
|
|
820 foreach my $t (keys %{$hits}) {
|
|
821 my $h = $hits->{$t};
|
|
822 my $tmp_bp;
|
|
823 my @all_pos;
|
|
824 my $all_reads_name;
|
|
825 if(exists $contig_reads->{$t}) {
|
|
826 foreach my $n (@{$contig_reads->{$t}}) {
|
|
827 push @all_pos, $pos{$n};
|
|
828 }
|
|
829 $all_reads_name = $contig_reads->{$t};
|
|
830 @all_pos = uniq @all_pos;
|
|
831 @all_pos = sort {$a <=> $b} @all_pos;
|
|
832
|
|
833 }
|
|
834 else {
|
|
835 push @all_pos, $pos{$t};
|
|
836 $all_reads_name = [$t];
|
|
837 }
|
|
838
|
|
839 if(is_good_hit($h, $clip, $tclip)) {
|
|
840 my $hit_ort = ($h->strand('query') == 1 ? "+" : "-");
|
|
841 my $real_pos = $tclip == RIGHT_CLIP ? $all_pos[0] : $all_pos[$#all_pos];
|
|
842 if($tclip == RIGHT_CLIP ) {
|
|
843 $tmp_bp = {
|
|
844 left_ort => '+',
|
|
845 left_pos => $real_pos,
|
|
846 left_chr => $target->{tchr},
|
|
847 right_ort => $hit_ort,
|
|
848 right_pos => ($hit_ort eq "+" ? $h->start("hit") : $h->end("hit")) + $base_pos,
|
|
849 right_chr => $chr,
|
|
850 }
|
|
851 }
|
|
852 else {
|
|
853 $tmp_bp = {
|
|
854 left_ort => $hit_ort,
|
|
855 left_chr => $chr,
|
|
856 left_pos => ($hit_ort eq "+" ? $h->end("hit") : $h->start("hit")) + $base_pos,
|
|
857 right_ort => "+",
|
|
858 right_chr => $target->{tchr},
|
|
859 right_pos => $real_pos,
|
|
860 }
|
|
861 }
|
|
862 $tmp_bp->{chr} = $target->{tchr};
|
|
863 $tmp_bp->{pos} = $real_pos;
|
|
864 $tmp_bp->{all_pos} = \@all_pos;
|
|
865 $tmp_bp->{sc_seq} = $seqs->{$t};
|
|
866 $tmp_bp->{cover} = count_coverage($sam, $target->{tchr}, $real_pos);
|
|
867 $tmp_bp->{sc_reads} = exists $sclip_count->{$t} ? $sclip_count->{$t} : 1;
|
|
868 $tmp_bp->{reads} = exists $contig_reads->{$t} ? $contig_reads->{$t} : [$t];
|
|
869
|
|
870 push @bp, $tmp_bp;
|
|
871 }
|
|
872 }
|
|
873 system("rm $fa_name"); system("rm $fa_name*");
|
|
874 }
|
|
875 return \@bp;
|
|
876 }
|
|
877
|
|
878 # many more filter can be added here
|
|
879 sub is_good_hit {
|
|
880 my ($hit, $clip, $tclip) = @_;
|
|
881
|
|
882 return if($hit->length_aln('query') < $min_hit_len);
|
|
883 return if($hit->frac_identical * 100 < $min_percent_id );
|
|
884 return 1;
|
|
885
|
|
886 # do we want to do the check?
|
|
887 my $ort = ($hit->start('query') < $hit->end('query') ? "+" : "-");
|
|
888 my ($dist, $t_dist, $qstart, $qend);
|
|
889 $qstart = $ort eq '+' ? $hit->start('query') : $hit->end('query') ;
|
|
890 $qend = $ort eq '+' ? $hit->end('query') : $hit->start('query');
|
|
891 $t_dist = $qstart;
|
|
892 $t_dist = $hit->query_length - $qend if($tclip == LEFT_CLIP);
|
|
893
|
|
894 }
|
|
895
|
|
896 # RNASeq support to identify deletions
|
|
897
|
|
898 sub cigar2hit {
|
|
899 my ($s, @cigar_array) = @_;
|
|
900 my @pos;
|
|
901 my $p = $s;
|
|
902 foreach my $c (@cigar_array){
|
|
903 my($op, $len) = @{$c};
|
|
904 if($op eq 'N') {
|
|
905 push @pos, [$s, $p];
|
|
906 $s = $p + $len;
|
|
907 $p = $s;
|
|
908 next;
|
|
909 }
|
|
910 $p += $len if($op eq 'M' || $op eq 'D');
|
|
911 }
|
|
912 push @pos, [$s, $p];
|
|
913 return \@pos;
|
|
914 }
|
|
915
|
|
916 sub find_del_SVs {
|
|
917 my ($sam, $chr, $start, $end) = @_;
|
|
918 my $max_sclip_len = 0;
|
|
919 my $range = $chr;
|
|
920 if($start && $end) {
|
|
921 $range = $chr . ":" . $start . "-" . $end;
|
|
922 }
|
|
923 my %SV;
|
|
924 my $tmp = $chr;
|
|
925 $tmp = "chr" . $chr if($chr !~ m/chr/);
|
|
926
|
|
927 my($f_tree, $r_tree) = ($gm->sub_model($tmp, "+"), $gm->sub_model($tmp, "-"));
|
|
928
|
|
929 $sam->fetch($range, sub {
|
|
930 my $a = shift;
|
|
931 my $cigar_str = $a->cigar_str;
|
|
932 # print $a->qname, "\t", $cigar_str, "\n";
|
|
933 return unless $cigar_str =~ m/N/; # the read cross two genes must have N
|
|
934 # return unless $a->score >= 97;
|
|
935 my $sv = identify_del_SV($f_tree, $a);
|
|
936 if($sv){
|
|
937 my $tmp1 = $sv->[0] . "+" . $sv->[1];
|
|
938 if(!exists($SV{$tmp1})) {
|
|
939 $SV{$tmp1} = {};
|
|
940 $SV{$tmp1}->{num} = 0;
|
|
941 $SV{$tmp1}->{left} = 0;
|
|
942 $SV{$tmp1}->{right} = 0;
|
|
943 }
|
|
944 $SV{$tmp1}->{num}++;
|
|
945 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]);
|
|
946 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]);
|
|
947 $SV{$tmp1}->{gene_ort} = "+";
|
|
948 $SV{$tmp1}->{left_gene} = $sv->[3];
|
|
949 $SV{$tmp1}->{right_gene} = $sv->[4];
|
|
950 push @{$SV{$tmp1}->{reads}}, $a->qname;
|
|
951
|
|
952 }
|
|
953 $sv = identify_del_SV($r_tree, $a);
|
|
954 if($sv){
|
|
955 my $tmp1 = $sv->[0] . "-" . $sv->[1];
|
|
956 if(!exists($SV{$tmp1})) {
|
|
957 $SV{$tmp1} = {};
|
|
958 $SV{$tmp1}->{num} = 0;
|
|
959 $SV{$tmp1}->{left} = 0;
|
|
960 $SV{$tmp1}->{right} = 0;
|
|
961 $SV{$tmp1}->{reads} = [];
|
|
962 }
|
|
963 $SV{$tmp1}->{num}++;
|
|
964 $SV{$tmp1}->{left} = $sv->[5] if($SV{$tmp1}->{left} < $sv->[5]);
|
|
965 $SV{$tmp1}->{right} = $sv->[6] if($SV{$tmp1}->{right} < $sv->[6]);
|
|
966 $SV{$tmp1}->{gene_ort} = "-";
|
|
967 $SV{$tmp1}->{left_gene} = $sv->[3];
|
|
968 $SV{$tmp1}->{right_gene} = $sv->[4];
|
|
969 push @{$SV{$tmp1}->{reads}}, $a->qname;
|
|
970 }
|
|
971 }
|
|
972 );
|
|
973
|
|
974 my @rtn;
|
|
975 foreach my $sv (keys %SV) {
|
|
976 my( $pos1, $pos2 ) = $sv =~ m/(\d+)[+|-](\d+)/;
|
|
977 print $pos1, "\t", $pos2, "\n";
|
|
978 my( $cover1, $cover2) = (count_coverage($sam, $chr, $pos1 - 1), count_coverage($sam, $chr, $pos2));
|
|
979 next if($SV{$sv}->{num} < $min_sclip_reads);
|
|
980 my $cover = $cover1 < $cover2 ? $cover1 : $cover2;
|
|
981 next if($SV{$sv}->{num} * 100 < $cover * $min_pct_sclip_reads && $SV{$sv} < 50);
|
|
982 push @rtn, StructVar->new (
|
|
983 -FIRST_BP => { left_ort => '+',
|
|
984 left_pos => $pos1 - 1,
|
|
985 left_chr => $chr,
|
|
986 right_ort => '+',
|
|
987 right_pos => $pos2,
|
|
988 right_chr => $chr,
|
|
989 chr => $chr,
|
|
990 cover => count_coverage($sam, $chr, $pos1 - 1),
|
|
991 pos => $pos1 - 1,
|
|
992 sc_reads => $SV{$sv}->{num},
|
|
993 left_len => $SV{$sv}->{left},
|
|
994 right_len => $SV{$sv}->{right},
|
|
995 gene_ort => $SV{$sv}->{gene_ort},
|
|
996 left_gene => $SV{$sv}->{left_gene},
|
|
997 right_gene => $SV{$sv}->{right_gene},
|
|
998 reads => $SV{$sv}->{reads},
|
|
999 },
|
|
1000 -SECOND_BP => { left_ort => '+',
|
|
1001 left_pos => $pos1 - 1,
|
|
1002 left_chr => $chr,
|
|
1003 right_ort => '+',
|
|
1004 right_pos => $pos2,
|
|
1005 right_chr => $chr,
|
|
1006 chr => $chr,
|
|
1007 pos => $pos2,
|
|
1008 cover => count_coverage($sam, $chr, $pos2),
|
|
1009 sc_reads => $SV{$sv}->{num},
|
|
1010 reads => [],
|
|
1011 }
|
|
1012 );
|
|
1013 }
|
|
1014 return @rtn;
|
|
1015 }
|
|
1016
|
|
1017 sub identify_del_SV {
|
|
1018 my ($tree, $a) = @_;
|
|
1019 my @cigar_array = @{$a->cigar_array};
|
|
1020 return unless($a->start && $a->end);
|
|
1021 return if(scalar $tree->intersect([$a->start, $a->end]) <= 1);
|
|
1022
|
|
1023 my @hits = @{ cigar2hit($a->start, @cigar_array) };
|
|
1024 my @genes;
|
|
1025 my $last_gene;
|
|
1026 my @change;
|
|
1027 my ($left, $right) = (0, 0);
|
|
1028 for( my $i = 0; $i < scalar @hits; $i++) {
|
|
1029 my $h = $hits[$i];
|
|
1030 my @g = $tree->intersect($h);
|
|
1031 my $g_size = scalar @g;
|
|
1032 if($g_size == 0) {
|
|
1033 # print STDERR $a->qname, " is mapped outside of gene\n";
|
|
1034 next;
|
|
1035 }
|
|
1036 if( scalar @g == 1) {
|
|
1037 push @genes, $g[0] unless ($last_gene && $last_gene->val->name eq $g[0]->val->name);
|
|
1038 if($last_gene && $last_gene->val->name ne $g[0]->val->name) {
|
|
1039 $right += ($h->[1] - $h->[0] );
|
|
1040 push @change, $i;
|
|
1041 }
|
|
1042 $left += ($h->[1] - $h->[0] ) if($right == 0);
|
|
1043 $last_gene = $g[0];
|
|
1044 next;
|
|
1045 }
|
|
1046 # print STDERR $a->qname, " connect two genes into one?\n";
|
|
1047 return;
|
|
1048 }
|
|
1049 return if(scalar @genes <= 1);
|
|
1050 if(scalar @genes > 2) {
|
|
1051 # print STDERR $a->qname, "crossed more than 2 genes!\n";
|
|
1052 return;
|
|
1053 }
|
|
1054 # now we down to 2 genes
|
|
1055 my ($left_gene, $right_gene) = @genes;
|
|
1056 return unless $left_gene->val->overlap([$hits[$change[0]-1]->[0], $hits[$change[0]-1]->[1]]);
|
|
1057 return unless $right_gene->val->overlap([$hits[$change[0]]->[0], $hits[$change[0]]->[1]]);
|
|
1058 my $p = $left_gene->val->end;
|
|
1059 print join("\t", ( $a->score,
|
|
1060 $a->cigar_str, $left, $right, $hits[$change[0]-1]->[1], $hits[$change[0]]->[0],
|
|
1061 $left_gene->val->name, $right_gene->val->name)), "\n";
|
|
1062 return [$hits[$change[0]-1]->[1], $hits[$change[0]]->[0], $a, $left_gene->val, $right_gene->val, $left, $right];
|
|
1063 }
|
|
1064
|
|
1065 =head1 NAME
|
|
1066
|
|
1067 CREST.pl - a Structure Variation detection tools using softclipping for
|
|
1068 whole genome sequencing.
|
|
1069
|
|
1070
|
|
1071 =head1 VERSION
|
|
1072
|
|
1073 This documentation refers to CREST.pl version 0.0.1.
|
|
1074
|
|
1075
|
|
1076 =head1 USAGE
|
|
1077
|
|
1078 This program depends on several things that need to be installed and/or
|
|
1079 specified. The program uses BioPerl and Bio::DB::Sam module to parse
|
|
1080 the files and bam files. Also it uses Blat software suits to do genome
|
|
1081 mapping and alignment. To make the program efficient, it also requires
|
|
1082 a blat server setup. And the program uses CAP3 assembler.
|
|
1083
|
|
1084 Identify somatic SVs from input:
|
|
1085 CREST.pl -f somatic.cover -d tumor.bam -g germline.bam
|
|
1086 Identify SVs from a single bam compared wtih reference genome:
|
|
1087 CREST.pl -f sclip.cover -d sample.bam
|
|
1088 Identify SVs from a single bam on chr1 only
|
|
1089 CREST.pl -f sclip.cover -d sample.bam -r chr1
|
|
1090
|
|
1091 =head1 REQUIRED ARGUMENTS
|
|
1092
|
|
1093 To run the program, several parameter must specified.
|
|
1094 -d, --input_d: The input (diagnositic) bam file
|
|
1095 -f, --sclipfile: The soft clipping information generated by extractSClip.pl
|
|
1096 --ref_genome: The reference genome file in fa format
|
|
1097 -t, --target_genome: The 2bit genome file used by blat server and blat
|
|
1098 --blatserver: The blat server name or ip address
|
|
1099 --blatport: The blat server port
|
|
1100
|
|
1101 =head1 OPTIONS
|
|
1102
|
|
1103 The options that can be used for the program.
|
|
1104 -g, --input_g The germline (paired) bam file
|
|
1105 -p, --prefix The output prefix, default is the input bam file name
|
|
1106 -o, --out_dir Output directory, default is the working directory
|
|
1107 -l, --read_len The read length of the sequencing data, defaut 100
|
|
1108 --sensitive The program will generate more SVs with higher false positive rate.
|
|
1109
|
|
1110 --(no)scratch Use the scratch space, default is off.
|
|
1111 --(no)paired Use paired reads or not, defafult is on, so change to --nopaired for unpaired reads.
|
|
1112 --cap3 CAP3 executable position, default cap3
|
|
1113 --cap3opt CAP3 options, single quoted, Default ' > /dev/null'
|
|
1114 --blatclient gfClient excutable position, default gfClient
|
|
1115 --blatclientopt gfClient options, single quoted, default '-out=psl -nohead'
|
|
1116 --blat blat executable potion, default balt
|
|
1117 --blatopt blat options, single quoted, default '-tileSize=7 -stepSize=1 -out=psl -minScore=15'
|
|
1118
|
|
1119 -r, --range The range where SV will be detected, using chr1:100-200 format.
|
|
1120 --max_score_diff The maximum score difference when stopping select hit, default 10.
|
|
1121 --min_sclip_reads Minimum number of soft clipping read to triger the procedure, default 3.
|
|
1122 --max_rep_cover The min number of coverage to be called as repetitive and don't triger
|
|
1123 the procedure, default 500.
|
|
1124 --min_sclip_len The min length of soft clipping part at a position to triger the detection,
|
|
1125 default 20.
|
|
1126 --min_hit_len Min length of a hit for genome mapping
|
|
1127 --min_dist_diff Min distance between the mapped position and the soft clipping position, default 20.
|
|
1128 --(no)rmdup Remove PCR dumplicate. Default remove.
|
|
1129
|
|
1130 --min_percent_id Min percentage of identity of soft clipping read mapping, default 90
|
|
1131 --min_percent_hq Min percentage of high quality base in soft clipping reads, default 80
|
|
1132 --lowqual_cutoff Low quality cutoff value, default 20.
|
|
1133
|
|
1134 -h, --help, -? Help information
|
|
1135 --man Man page.
|
|
1136 --usage Usage information.
|
|
1137 --version Software version.
|
|
1138
|
|
1139 --(no)rm_tandem_repeat Remove tandem repeat caused SV events, default is ON. When it's on ptrfinder program
|
|
1140 need to be on the path.
|
|
1141 --tr_max_indel_size Maximum tandem repeat mediated INDEL events, default 100
|
|
1142 --tr_min_size Minimum tandem reapet size, default 2
|
|
1143 --tr_max_size Maximum tandem repeat size, default 8
|
|
1144 --tr_min_num Minimum tandem repeat number, defaut 4
|
|
1145 --min_percent_cons_of_read Minimum percent of consensus length of read length, default 0.75
|
|
1146 --max_bp_dist Maximum distance between two idenfitifed break points, default 15
|
|
1147 --germline_seq_width Half window width of genomic sequence around break point for germline SV filtering,
|
|
1148 default 100
|
|
1149 --germline_search_width Half window width for seaching soft-clipped reads around breakpoint for germline SV
|
|
1150 filtering, default 50.
|
|
1151
|
|
1152 --hetero_factor The factor about the SV's heterogenirity and heterozygosity, default 0.4.
|
|
1153 --triger_p_value The p-value that will triger the SV detection when number of soft-clipped reads is small,
|
|
1154 defaut 0.05.
|
|
1155
|
|
1156 --(no)rescue Use rescue mode, when it's on, the a SV with only 1 side with enough soft-clipped reads
|
|
1157 is considered as a valid one instead of rejecting it. Default on.
|
|
1158 --min_one_side_reads When rescure mode is on, the minimum number of soft-clipped reads on one side, default 5.
|
|
1159
|
|
1160 --RNASeq RNAseq mode, default off
|
|
1161 --genemodel A gene model file, currently only refFlat format (BED) is supported. Requried for RNASeq.
|
|
1162 --cluster_size The soft-clipped reads within cluster_size will be considered together, default is 3, RNAseq mode
|
|
1163 only.
|
|
1164
|
|
1165
|
|
1166 =head1 DESCRIPTION
|
|
1167
|
|
1168 This is a program designed to identify Structure Variations (SVs) using soft
|
|
1169 clipping reads. With the improvement of next-gen sequencing techniques,
|
|
1170 both the coverage and length of pair-end reads have been increased steady.
|
|
1171 Many SV detection software availalbe uses pair-end information due to the
|
|
1172 limitted read length. Now 100bp is pretty common and many reads will cross
|
|
1173 the break points. Some mapping programs ( bwa, etc ) have the ability to
|
|
1174 identify so called soft-clipping reads. A soft-clipping read is a read that
|
|
1175 different part can be mapped to different genomic posiiton, but the read can
|
|
1176 be uniquely positioned using the mate-pair position and the insert length.
|
|
1177 So this program use those reads to do an assembly-mapping-assembly-alignment
|
|
1178 procedure to identify potential structure variiations.
|
|
1179
|
|
1180
|
|
1181 =head1 DIAGNOSTICS
|
|
1182
|
|
1183 A list of every error and warning message that the application can generate
|
|
1184 (even the ones that will "never happen"), with a full explanation of each
|
|
1185 problem, one or more likely causes, and any suggested remedies. If the
|
|
1186 application generates exit status codes (e.g., under Unix), then list the exit
|
|
1187 status associated with each error.
|
|
1188
|
|
1189 =head1 CONFIGURATION AND ENVIRONMENT
|
|
1190
|
|
1191 The program is designed to use under a cluster or high performance computing
|
|
1192 environment since it's dealing with over 100G input data. The program can be
|
|
1193 used as highly parallellized. And the program is developped under linux/unix.
|
|
1194
|
|
1195
|
|
1196 =head1 DEPENDENCIES
|
|
1197
|
|
1198 The program depend on several packages:
|
|
1199 1. Bioperl perl module.
|
|
1200 2. Bio::DB::Sam, version 1.5 or later, it requires samtools lib installed.
|
|
1201 3. blat suits, include blat, gfClient, gfServer.
|
|
1202 4. CAP3 assembly program.
|
|
1203
|
|
1204
|
|
1205 =head1 BUGS AND LIMITATIONS
|
|
1206
|
|
1207 There are no known bugs in this module, but the method is limitted to bam file
|
|
1208 that has soft-clipping cigar string generated.Please report problems to
|
|
1209 Jianmin Wang (Jianmin.Wang@stjude.org)
|
|
1210 Patches are welcome.
|
|
1211
|
|
1212 =head1 AUTHOR
|
|
1213
|
|
1214 Jianmin Wang (Jianmin.Wang@stjude.org)
|
|
1215
|
|
1216
|
|
1217 =head1 LICENCE AND COPYRIGHT
|
|
1218
|
|
1219 Copyright (c) 2010 by St. Jude Children's Research Hospital.
|
|
1220
|
|
1221 This program is free software: you can redistribute it and/or modify
|
|
1222 it under the terms of the GNU General Public License as published by
|
|
1223 the Free Software Foundation, either version 2 of the License, or
|
|
1224 (at your option) any later version.
|
|
1225
|
|
1226 This program is distributed in the hope that it will be useful,
|
|
1227 but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
1228 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
1229 GNU General Public License for more details.
|
|
1230
|
|
1231 You should have received a copy of the GNU General Public License
|
|
1232 along with this program. If not, see <http://www.gnu.org/licenses/>.
|