Mercurial > repos > bzeitouni > svdetect
comparison SVDetect_run_parallel.pl @ 5:ba8c5e544948 draft
Uploaded
author | bzeitouni |
---|---|
date | Mon, 11 Jun 2012 12:31:07 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
4:f7a84d31bd83 | 5:ba8c5e544948 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 =pod | |
4 | |
5 =head1 NAME | |
6 | |
7 SVDetect - Program designed to the detection of structural variations | |
8 from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads | |
9 | |
10 Version: 0.8 for Galaxy | |
11 | |
12 =head1 SYNOPSIS | |
13 | |
14 SVDetect <command> -conf <configuration_file> [-help] [-man] | |
15 | |
16 Command: | |
17 | |
18 linking detection and isolation of links | |
19 filtering filtering of links according different parameters | |
20 links2circos links conversion to circos format | |
21 links2bed paired-ends of links converted to bed format (UCSC) | |
22 links2SV formatted output to show most significant SVs | |
23 cnv calculate copy-number profiles | |
24 ratio2circos ratio conversion to circos density format | |
25 ratio2bedgraph ratio conversion to bedGraph density format (UCSC) | |
26 | |
27 =head1 DESCRIPTION | |
28 | |
29 This is a command-line interface to SVDetect. | |
30 | |
31 | |
32 =head1 AUTHORS | |
33 | |
34 Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>, | |
35 Valentina Boeva E<lt>valentina.boeva@curie.frE<gt> | |
36 | |
37 =cut | |
38 | |
39 # ------------------------------------------------------------------- | |
40 | |
41 use strict; | |
42 use warnings; | |
43 | |
44 use Pod::Usage; | |
45 use Getopt::Long; | |
46 | |
47 use Config::General; | |
48 use Tie::IxHash; | |
49 use FileHandle; | |
50 use Parallel::ForkManager; | |
51 | |
52 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
53 #PARSE THE COMMAND LINE | |
54 my %OPT; | |
55 GetOptions(\%OPT, | |
56 'conf=s', | |
57 'out1=s', #GALAXY | |
58 'out2=s', #GALAXY | |
59 'out3=s', #GALAXY | |
60 'out4=s', #GALAXY | |
61 'out5=s', #GALAXY | |
62 'l=s', #GALAXY | |
63 'N=s',#GALAXY | |
64 'help',#GALAXY | |
65 'man' | |
66 ); | |
67 | |
68 pod2usage() if $OPT{help}; | |
69 pod2usage(-verbose=>2) if $OPT{man}; | |
70 pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); | |
71 | |
72 pod2usage() if(@ARGV<1); | |
73 | |
74 tie (my %func, 'Tie::IxHash',linking=>\&createlinks, | |
75 filtering=>\&filterlinks, | |
76 links2circos=>\&links2circos, | |
77 links2bed=>\&links2bed, | |
78 links2compare=>\&links2compare, | |
79 links2SV=>\&links2SV, | |
80 cnv=>\&cnv, | |
81 ratio2circos=>\&ratio2circos, | |
82 ratio2bedgraph=>\&ratio2bedgraph); | |
83 | |
84 foreach my $command (@ARGV){ | |
85 pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); | |
86 } | |
87 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
88 | |
89 | |
90 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
91 #READ THE CONFIGURATION FILE | |
92 my $conf=Config::General->new( -ConfigFile => $OPT{conf}, | |
93 -Tie => "Tie::IxHash", | |
94 -AllowMultiOptions => 1, | |
95 -LowerCaseNames => 1, | |
96 -AutoTrue => 1); | |
97 my %CONF= $conf->getall; | |
98 validateconfiguration(\%CONF); #validation of the configuration parameters | |
99 | |
100 my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY | |
101 | |
102 my $pt_log_file=$OPT{l}; #GALAXY | |
103 my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY | |
104 my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY | |
105 my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY | |
106 my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY | |
107 my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY | |
108 | |
109 $CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY | |
110 $CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY | |
111 | |
112 my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY | |
113 open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY | |
114 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
115 | |
116 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
117 #COMMAND EXECUTION | |
118 foreach my $command (@ARGV){ | |
119 &{$func{$command}}(); | |
120 } | |
121 print LOG "-- end\n";#GALAXY | |
122 | |
123 close LOG;#GALAXY | |
124 system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY | |
125 exit(0); | |
126 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
127 | |
128 | |
129 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# | |
130 #FUNCTIONS | |
131 #------------------------------------------------------------------------------# | |
132 #MAIN FUNCTION number 1: Detection of links from mate-pairs data | |
133 sub createlinks{ | |
134 | |
135 my %CHR; #main hash table 1: fragments, links | |
136 my %CHRID; | |
137 my @MATEFILES; | |
138 | |
139 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; | |
140 my @path=split(/\//,$output_prefix); | |
141 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
142 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; | |
143 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; | |
144 | |
145 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters | |
146 $CONF{detection}{window_size}, | |
147 $CONF{detection}{step_length}, | |
148 $CONF{general}{cmap_file}); | |
149 | |
150 if($CONF{detection}{split_mate_file}){ | |
151 | |
152 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, | |
153 $CONF{general}{sv_type}, | |
154 $CONF{general}{mates_file}, | |
155 $CONF{general}{input_format}, | |
156 $CONF{general}{read_lengths} | |
157 ); | |
158 }else{ | |
159 | |
160 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; | |
161 chomp(@MATEFILES); | |
162 print LOG "# Splitted mate files already created.\n"; | |
163 } | |
164 | |
165 | |
166 #Parallelization of the linking per chromosome for intra + interchrs | |
167 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
168 | |
169 foreach my $matefile (@MATEFILES){ | |
170 | |
171 my $pid = $pm->start and next; | |
172 getlinks(\%CHR, \%CHRID, $matefile); | |
173 $pm->finish; | |
174 | |
175 } | |
176 $pm->wait_all_children; | |
177 | |
178 #Merge the chromosome links file into only one | |
179 my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; | |
180 chomp(@LINKFILES); | |
181 catFiles( \@LINKFILES => "$output_prefix.links" ); | |
182 | |
183 system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY | |
184 print LOG "# Linking end procedure : output created: $output_prefix.links\n"; | |
185 #unlink(@LINKFILES); | |
186 #unlink(@MATEFILES); | |
187 | |
188 undef %CHR; | |
189 undef %CHRID; | |
190 | |
191 } | |
192 #------------------------------------------------------------------------------# | |
193 #------------------------------------------------------------------------------# | |
194 sub getlinks { | |
195 | |
196 my ($chr,$chrID,$tmp_mates_prefix)=@_; | |
197 | |
198 my $tmp_links_prefix=$tmp_mates_prefix; | |
199 $tmp_links_prefix=~s/\/mates\//\/links\//; | |
200 | |
201 my %PAIR; #main hash table 2: pairs | |
202 | |
203 linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs | |
204 $CONF{general}{read_lengths}, | |
205 $CONF{detection}{window_size}, | |
206 $CONF{detection}{step_length}, | |
207 $tmp_mates_prefix, | |
208 $CONF{general}{input_format}, | |
209 $CONF{general}{sv_type}, | |
210 "$tmp_links_prefix.links.mapped" | |
211 ); | |
212 | |
213 getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons | |
214 "$tmp_links_prefix.links.unique"); | |
215 | |
216 defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links | |
217 $CONF{general}{input_format}, | |
218 $CONF{general}{sv_type}, | |
219 $CONF{general}{read_lengths}, | |
220 "$tmp_links_prefix.links.unique", | |
221 "$tmp_links_prefix.links.unique_defined"); | |
222 | |
223 sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates | |
224 "$tmp_links_prefix.links.sorted"); | |
225 | |
226 removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links | |
227 "$tmp_links_prefix.links",1); #file output | |
228 | |
229 | |
230 undef %PAIR; | |
231 | |
232 unlink("$tmp_links_prefix.links.mapped", | |
233 "$tmp_links_prefix.links.unique", | |
234 "$tmp_links_prefix.links.unique_defined", | |
235 "$tmp_links_prefix.links.sorted"); | |
236 } | |
237 #------------------------------------------------------------------------------# | |
238 #------------------------------------------------------------------------------# | |
239 sub splitMateFile{ | |
240 | |
241 my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; | |
242 | |
243 print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; | |
244 | |
245 my %filesHandle; | |
246 | |
247 #fichier matefile inter | |
248 if($sv_type=~/^(all|inter)$/){ | |
249 my $newFileName="$output_prefix.interchrs"; | |
250 push(@{$files_list},$newFileName); | |
251 my $fh = new FileHandle; | |
252 $fh->open(">$newFileName"); | |
253 $filesHandle{inter}=$fh; | |
254 } | |
255 | |
256 #fichiers matefiles intra | |
257 if($sv_type=~/^(all|intra)$/){ | |
258 foreach my $k (1..$chr->{nb_chrs}){ | |
259 my $newFileName=$output_prefix.".".$chr->{$k}->{name}; | |
260 push(@{$files_list},$newFileName); | |
261 my $fh = new FileHandle; | |
262 $fh->open(">$newFileName"); | |
263 $filesHandle{$k}=$fh; | |
264 } | |
265 } | |
266 | |
267 if ($mates_file =~ /.gz$/) { | |
268 open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat | |
269 }elsif($mates_file =~ /.bam$/){ | |
270 open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
271 }else{ | |
272 open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; | |
273 } | |
274 | |
275 | |
276 while(<MATES>){ | |
277 | |
278 my @t=split; | |
279 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); | |
280 | |
281 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
282 | |
283 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
284 | |
285 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
286 | |
287 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ | |
288 my $fh2print=$filesHandle{inter}; | |
289 print $fh2print join("\t",@t)."\n"; | |
290 } | |
291 | |
292 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ | |
293 my $fh2print=$filesHandle{$chr_read1}; | |
294 print $fh2print join("\t",@t)."\n"; | |
295 | |
296 } | |
297 } | |
298 | |
299 foreach my $name (keys %filesHandle){ | |
300 my $fh=$filesHandle{$name}; | |
301 $fh->close; | |
302 } | |
303 | |
304 print LOG "# Splitted mate files of \"$mates_file\" created.\n"; | |
305 } | |
306 | |
307 | |
308 #------------------------------------------------------------------------------# | |
309 #------------------------------------------------------------------------------# | |
310 sub filterlinks{ | |
311 | |
312 my %CHR; | |
313 my %CHRID; | |
314 my @LINKFILES; | |
315 my @FLINKFILES; | |
316 | |
317 my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; | |
318 my @path=split(/\//,$output_prefix); | |
319 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
320 my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; | |
321 | |
322 createChrHashTables(\%CHR,\%CHRID, | |
323 $CONF{general}{cmap_file}); | |
324 | |
325 if($CONF{filtering}{split_link_file}){ | |
326 | |
327 splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, | |
328 $tmp_links_prefix, | |
329 $CONF{general}{sv_type}, | |
330 "$output_prefix.links", | |
331 ); | |
332 }else{ | |
333 | |
334 @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; | |
335 chomp(@LINKFILES); | |
336 print LOG "# Splitted link files already created.\n"; | |
337 } | |
338 | |
339 #Parallelization of the filtering per chromosome for intra + interchrs | |
340 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
341 | |
342 foreach my $linkfile (@LINKFILES){ | |
343 | |
344 my $pid = $pm->start and next; | |
345 getFilteredlinks(\%CHR, \%CHRID, $linkfile); | |
346 $pm->finish; | |
347 | |
348 } | |
349 $pm->wait_all_children; | |
350 | |
351 #Merge the chromosome links file into only one | |
352 @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; | |
353 chomp(@FLINKFILES); | |
354 catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); | |
355 | |
356 system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY | |
357 print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; | |
358 | |
359 undef %CHR; | |
360 undef %CHRID; | |
361 | |
362 } | |
363 #------------------------------------------------------------------------------# | |
364 #------------------------------------------------------------------------------# | |
365 sub splitLinkFile{ | |
366 | |
367 my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; | |
368 | |
369 print LOG "# Splitting the link file for parallel processing...\n"; | |
370 | |
371 my %filesHandle; | |
372 | |
373 #fichier matefile inter | |
374 if($sv_type=~/^(all|inter)$/){ | |
375 my $newFileName="$input_prefix.interchrs.links"; | |
376 push(@{$files_list},$newFileName); | |
377 my $fh = new FileHandle; | |
378 $fh->open(">$newFileName"); | |
379 $filesHandle{inter}=$fh; | |
380 } | |
381 | |
382 #fichiers matefiles intra | |
383 if($sv_type=~/^(all|intra)$/){ | |
384 foreach my $k (1..$chr->{nb_chrs}){ | |
385 my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; | |
386 push(@{$files_list},$newFileName); | |
387 my $fh = new FileHandle; | |
388 $fh->open(">$newFileName"); | |
389 $filesHandle{$k}=$fh; | |
390 } | |
391 } | |
392 | |
393 open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; | |
394 while(<LINKS>){ | |
395 | |
396 my @t=split; | |
397 my ($chr_read1,$chr_read2)=($t[0],$t[3]); | |
398 | |
399 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
400 | |
401 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
402 | |
403 if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ | |
404 my $fh2print=$filesHandle{inter}; | |
405 print $fh2print join("\t",@t)."\n"; | |
406 } | |
407 | |
408 if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ | |
409 my $fh2print=$filesHandle{$chr_read1}; | |
410 print $fh2print join("\t",@t)."\n"; | |
411 | |
412 } | |
413 } | |
414 | |
415 foreach my $name (keys %filesHandle){ | |
416 my $fh=$filesHandle{$name}; | |
417 $fh->close; | |
418 } | |
419 | |
420 print LOG "# Splitted link files created.\n"; | |
421 } | |
422 | |
423 | |
424 #------------------------------------------------------------------------------# | |
425 #------------------------------------------------------------------------------# | |
426 #MAIN FUNCTION number 2: Filtering processing | |
427 sub getFilteredlinks { | |
428 | |
429 my ($chr,$chrID,$tmp_links_prefix)=@_; | |
430 my %PAIR; | |
431 | |
432 strandFiltering($chr,$chrID, | |
433 $CONF{filtering}{nb_pairs_threshold}, #filtering of links | |
434 $CONF{filtering}{strand_filtering}, | |
435 $CONF{filtering}{chromosomes}, | |
436 $CONF{general}{input_format}, | |
437 $CONF{general}{cmap_file}, | |
438 $CONF{general}{mates_orientation}, | |
439 $CONF{general}{read_lengths}, | |
440 $tmp_links_prefix, | |
441 "$tmp_links_prefix.filtered", | |
442 ); | |
443 | |
444 if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering | |
445 | |
446 my @tmpfiles; | |
447 | |
448 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); | |
449 | |
450 getUniqueLinks("$tmp_links_prefix.filtered_unique", | |
451 "$tmp_links_prefix.filtered"); | |
452 | |
453 push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); | |
454 | |
455 if($CONF{filtering}{order_filtering}){ #filtering using the order | |
456 | |
457 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); | |
458 | |
459 orderFiltering($chr,$chrID, | |
460 $CONF{filtering}{nb_pairs_threshold}, | |
461 $CONF{filtering}{nb_pairs_order_threshold}, | |
462 $CONF{filtering}{mu_length}, | |
463 $CONF{filtering}{sigma_length}, | |
464 $CONF{general}{mates_orientation}, | |
465 $CONF{general}{read_lengths}, | |
466 "$tmp_links_prefix.filtered_ordered", | |
467 "$tmp_links_prefix.filtered", | |
468 ); | |
469 | |
470 push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); | |
471 } | |
472 | |
473 if (($CONF{filtering}{insert_size_filtering})&& | |
474 ($CONF{general}{sv_type} ne 'inter')){ | |
475 | |
476 rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); | |
477 | |
478 addInsertionInfo($chr,$chrID, | |
479 $CONF{filtering}{nb_pairs_threshold}, | |
480 $CONF{filtering}{order_filtering}, | |
481 $CONF{filtering}{indel_sigma_threshold}, | |
482 $CONF{filtering}{dup_sigma_threshold}, | |
483 $CONF{filtering}{singleton_sigma_threshold}, | |
484 $CONF{filtering}{mu_length}, | |
485 $CONF{filtering}{sigma_length}, | |
486 $CONF{general}{mates_orientation}, | |
487 $CONF{general}{read_lengths}, | |
488 "$tmp_links_prefix.filtered_withoutIndelSize", | |
489 "$tmp_links_prefix.filtered" | |
490 ); | |
491 | |
492 push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); | |
493 } | |
494 | |
495 sortLinks("$tmp_links_prefix.filtered", | |
496 "$tmp_links_prefix.filtered_sorted"); | |
497 | |
498 removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", | |
499 "$tmp_links_prefix.filtered_nodup", | |
500 ); | |
501 | |
502 postFiltering("$tmp_links_prefix.filtered_nodup", | |
503 "$tmp_links_prefix.filtered", | |
504 $CONF{filtering}{final_score_threshold}); | |
505 | |
506 push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); | |
507 | |
508 unlink(@tmpfiles); | |
509 | |
510 | |
511 } | |
512 undef %PAIR; | |
513 | |
514 } | |
515 #------------------------------------------------------------------------------# | |
516 #------------------------------------------------------------------------------# | |
517 #MAIN FUNCTION number 3: Circos format conversion for links | |
518 sub links2circos{ | |
519 | |
520 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
521 my @path=split(/\//,$input_file); | |
522 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
523 | |
524 my $output_file.=$input_file.".segdup.txt"; | |
525 | |
526 links2segdup($CONF{circos}{organism_id}, | |
527 $CONF{circos}{colorcode}, | |
528 $input_file, | |
529 $output_file); #circos file output | |
530 | |
531 system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY | |
532 } | |
533 #------------------------------------------------------------------------------# | |
534 #------------------------------------------------------------------------------# | |
535 #MAIN FUNCTION number 4: Bed format conversion for links | |
536 sub links2bed{ | |
537 | |
538 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
539 my @path=split(/\//,$input_file); | |
540 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
541 | |
542 my $output_file.=$input_file.".bed.txt"; | |
543 | |
544 links2bedfile($CONF{general}{read_lengths}, | |
545 $CONF{bed}{colorcode}, | |
546 $input_file, | |
547 $output_file); #bed file output | |
548 | |
549 system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY | |
550 | |
551 } | |
552 #------------------------------------------------------------------------------# | |
553 #------------------------------------------------------------------------------# | |
554 #MAIN FUNCTION number 6: Bed format conversion for links | |
555 sub links2SV{ | |
556 | |
557 my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; | |
558 | |
559 my @path=split(/\//,$input_file); | |
560 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
561 | |
562 my $output_file.=$input_file.".sv.txt"; | |
563 | |
564 | |
565 links2SVfile( $input_file, | |
566 $output_file); | |
567 | |
568 system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY | |
569 } | |
570 #------------------------------------------------------------------------------# | |
571 #------------------------------------------------------------------------------# | |
572 #MAIN FUNCTION number 7: copy number variations, coverage ratio calculation | |
573 sub cnv{ | |
574 | |
575 my %CHR; | |
576 my %CHRID; | |
577 my @MATEFILES; | |
578 my @MATEFILES_REF; | |
579 | |
580 my $output_prefix=$CONF{general}{mates_file}; | |
581 my $output_prefix_ref=$CONF{detection}{mates_file_ref}; | |
582 my @path=split(/\//,$output_prefix); | |
583 my @path_ref=split(/\//,$output_prefix_ref); | |
584 $output_prefix=$CONF{general}{output_dir}.$path[$#path]; | |
585 $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; | |
586 my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; | |
587 my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; | |
588 my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; | |
589 | |
590 shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters | |
591 $CONF{detection}{window_size}, | |
592 $CONF{detection}{step_length}, | |
593 $CONF{general}{cmap_file}); | |
594 | |
595 if($CONF{detection}{split_mate_file}){ | |
596 | |
597 splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, | |
598 "intra", | |
599 $CONF{general}{mates_file}, | |
600 $CONF{general}{input_format}, | |
601 $CONF{general}{read_lengths} | |
602 ); | |
603 | |
604 splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, | |
605 "intra", | |
606 $CONF{detection}{mates_file_ref}, | |
607 $CONF{general}{input_format}, | |
608 $CONF{general}{read_lengths} | |
609 ); | |
610 | |
611 | |
612 }else{ | |
613 | |
614 @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; | |
615 chomp(@MATEFILES); | |
616 @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; | |
617 chomp(@MATEFILES_REF); | |
618 print LOG "# Splitted sample and reference mate files already created.\n"; | |
619 } | |
620 | |
621 #Parallelization of the cnv per chromosome | |
622 my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); | |
623 | |
624 foreach my $file (0..$#MATEFILES){ | |
625 | |
626 my $pid = $pm->start and next; | |
627 | |
628 densityCalculation(\%CHR, \%CHRID, $file, | |
629 $CONF{general}{read_lengths}, | |
630 $CONF{detection}{window_size}, | |
631 $CONF{detection}{step_length}, | |
632 \@MATEFILES, | |
633 \@MATEFILES_REF, | |
634 $MATEFILES[$file].".density", | |
635 $CONF{general}{input_format}); | |
636 | |
637 $pm->finish; | |
638 | |
639 } | |
640 $pm->wait_all_children; | |
641 | |
642 #Merge the chromosome links file into only one | |
643 my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; | |
644 chomp(@DENSITYFILES); | |
645 catFiles( \@DENSITYFILES => "$output_prefix.density" ); | |
646 | |
647 print LOG "# cnv end procedure : output created: $output_prefix.density\n"; | |
648 | |
649 | |
650 undef %CHR; | |
651 undef %CHRID; | |
652 | |
653 } | |
654 #------------------------------------------------------------------------------# | |
655 #------------------------------------------------------------------------------# | |
656 #MAIN FUNCTION number 8: Circos format conversion for cnv ratios | |
657 sub ratio2circos{ | |
658 | |
659 my $input_file=$CONF{general}{mates_file}.".density"; | |
660 | |
661 my @path=split(/\//,$input_file); | |
662 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
663 | |
664 my $output_file.=$input_file.".segdup.txt"; | |
665 | |
666 ratio2segdup($CONF{circos}{organism_id}, | |
667 $input_file, | |
668 $output_file); | |
669 } | |
670 #------------------------------------------------------------------------------# | |
671 #MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios | |
672 sub ratio2bedgraph{ | |
673 | |
674 my $input_file=$CONF{general}{mates_file}.".density"; | |
675 | |
676 my @path=split(/\//,$input_file); | |
677 $input_file=$CONF{general}{output_dir}.$path[$#path]; | |
678 | |
679 my $output_file.=$input_file.".bedgraph.txt"; | |
680 | |
681 ratio2bedfile($input_file, | |
682 $output_file); #bed file output | |
683 } | |
684 #------------------------------------------------------------------------------# | |
685 #------------------------------------------------------------------------------# | |
686 #Creation of the fragment library | |
687 sub shearingChromosome{ | |
688 | |
689 print LOG "# Making the fragments library...\n"; | |
690 | |
691 my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters | |
692 | |
693 createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name | |
694 | |
695 foreach my $k (1..$chr->{nb_chrs}){ | |
696 | |
697 print LOG"-- $chr->{$k}->{name}\n"; | |
698 | |
699 my $frag=1; | |
700 for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ | |
701 | |
702 my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; | |
703 $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage | |
704 | |
705 if($end==($chr->{$k}->{length})-1){ | |
706 $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome | |
707 last; | |
708 } | |
709 $frag++; | |
710 } | |
711 } | |
712 } | |
713 #------------------------------------------------------------------------------# | |
714 #------------------------------------------------------------------------------# | |
715 #Creation of chromosome hash tables from the cmap file | |
716 sub createChrHashTables{ | |
717 | |
718 my ($chr,$chrID,$cmap_file)=@_; | |
719 $chr->{nb_chrs}=0; | |
720 | |
721 open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; | |
722 while(<CMAP>){ | |
723 | |
724 if(/^\s+$/){ next;} | |
725 my ($k,$name,$length) = split; | |
726 $chr->{$k}->{name}=$name; | |
727 $chr->{$k}->{length}=$length; | |
728 $chrID->{$name}=$k; | |
729 $chr->{nb_chrs}++; | |
730 | |
731 } | |
732 close CMAP; | |
733 } | |
734 #------------------------------------------------------------------------------# | |
735 #------------------------------------------------------------------------------# | |
736 #Read the mate file according the input format file (solid, eland or sam) | |
737 sub readMateFile{ | |
738 | |
739 my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; | |
740 my ($strand1,$strand2); | |
741 | |
742 if($file_type eq "solid"){ | |
743 | |
744 ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based | |
745 | |
746 }else{ | |
747 my ($tag_length1,$tag_length2); | |
748 ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based | |
749 if($file_type eq "eland"); | |
750 | |
751 if($file_type eq "sam"){ | |
752 | |
753 return 0 if ($$t[0]=~/^@/); #header sam filtered out | |
754 | |
755 ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); | |
756 | |
757 return 0 if ($$chr1 eq "*" || $$chr2 eq "*"); | |
758 | |
759 $$chr2=$$chr1 if($$chr2 eq "="); | |
760 | |
761 $strand1 = (($$t[1]&0x0010))? 'R':'F'; | |
762 $strand2 = (($$t[1]&0x0020))? 'R':'F'; | |
763 | |
764 $$order1= (($$t[1]&0x0040))? '1':'2'; | |
765 $$order2= (($$t[1]&0x0080))? '1':'2'; | |
766 $tag_length1 = $tag_length->{$$order1}; | |
767 $tag_length2 = $tag_length->{$$order2}; | |
768 } | |
769 | |
770 $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts | |
771 $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); | |
772 } | |
773 return 1; | |
774 } | |
775 #------------------------------------------------------------------------------# | |
776 #------------------------------------------------------------------------------# | |
777 #Parsing of the mates files and creation of links between 2 chromosomal fragments | |
778 sub linking{ | |
779 | |
780 my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; | |
781 my %link; | |
782 | |
783 my $record=0; | |
784 my $nb_links=0; | |
785 my $warn=10000; | |
786 | |
787 my @sfile=split(/\./,$mates_file); | |
788 my $fchr=$sfile[$#sfile]; | |
789 | |
790 my $fh = new FileHandle; | |
791 | |
792 print LOG "# $fchr : Linking procedure...\n"; | |
793 print LOG "-- file=$mates_file\n". | |
794 "-- chromosome=$fchr\n". | |
795 "-- input format=$input_format\n". | |
796 "-- type=$sv_type\n". | |
797 "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". | |
798 "-- window size=$window_dist, step length=$step\n"; | |
799 | |
800 if ($mates_file =~ /.gz$/) { | |
801 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat | |
802 }elsif($mates_file =~ /.bam$/){ | |
803 $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
804 }else{ | |
805 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; | |
806 } | |
807 | |
808 | |
809 while(<$fh>){ | |
810 | |
811 my @t=split; #for each mate-pair | |
812 my $mate=$t[0]; | |
813 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); | |
814 | |
815 next if(exists $$pair{$mate}); | |
816 | |
817 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
818 | |
819 next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); | |
820 | |
821 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
822 | |
823 if($sv_type ne "all"){ | |
824 if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || | |
825 ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ | |
826 }else{ | |
827 next; | |
828 } | |
829 } | |
830 | |
831 $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) | |
832 | |
833 $record++; | |
834 | |
835 my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read | |
836 | |
837 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); | |
838 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); | |
839 | |
840 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation | |
841 | |
842 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ | |
843 | |
844 if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ | |
845 | |
846 for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ | |
847 | |
848 if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { | |
849 | |
850 if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ | |
851 | |
852 makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link | |
853 } | |
854 | |
855 }else{ | |
856 | |
857 $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); | |
858 } | |
859 } | |
860 } | |
861 | |
862 }else{ | |
863 | |
864 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); | |
865 } | |
866 } | |
867 | |
868 if($record>=$warn){ | |
869 print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; | |
870 $warn+=10000; | |
871 } | |
872 } | |
873 $fh->close; | |
874 | |
875 if(!$nb_links){ | |
876 print LOG "-- $fchr : No mate-pairs !\n". | |
877 "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; | |
878 } | |
879 | |
880 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; | |
881 | |
882 print LOG "-- $fchr : writing...\n"; | |
883 | |
884 $fh = new FileHandle; | |
885 | |
886 $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; | |
887 | |
888 foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output | |
889 | |
890 foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ | |
891 | |
892 foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ | |
893 | |
894 foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ | |
895 | |
896 my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); | |
897 print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". | |
898 "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". | |
899 scalar @count."\t". #nb of read | |
900 $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list | |
901 } | |
902 } | |
903 } | |
904 } | |
905 | |
906 $fh->close; | |
907 | |
908 undef %link; | |
909 | |
910 } | |
911 #------------------------------------------------------------------------------# | |
912 #------------------------------------------------------------------------------# | |
913 #remove exact links doublons according to the mate list | |
914 sub getUniqueLinks{ | |
915 | |
916 my ($links_file,$nrlinks_file)=@_; | |
917 my %links; | |
918 my %pt; | |
919 my $nb_links; | |
920 my $n=1; | |
921 | |
922 my $record=0; | |
923 my $warn=300000; | |
924 | |
925 my @sfile=split(/\./,$links_file); | |
926 my $fchr=$sfile[$#sfile-2]; | |
927 | |
928 my $fh = new FileHandle; | |
929 | |
930 print LOG "# $fchr : Getting unique links...\n"; | |
931 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
932 | |
933 while(<$fh>){ | |
934 | |
935 my @t=split; | |
936 my $mates=$t[7]; | |
937 $record++; | |
938 | |
939 if(!exists $links{$mates}){ #Unique links selection | |
940 | |
941 $links{$mates}=[@t]; | |
942 $pt{$n}=$links{$mates}; | |
943 $n++; | |
944 | |
945 | |
946 }else{ #get the link coordinates from the mate-pairs list | |
947 | |
948 for my $i (1,2,4,5){ #get the shortest regions | |
949 | |
950 $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start | |
951 if($i==1 || $i==4); | |
952 $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end | |
953 if($i==2 || $i==5); | |
954 } | |
955 } | |
956 if($record>=$warn){ | |
957 print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; | |
958 $warn+=300000; | |
959 } | |
960 } | |
961 $fh->close; | |
962 | |
963 $nb_links=$n-1; | |
964 print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; | |
965 | |
966 $fh = new FileHandle; | |
967 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; | |
968 print LOG "-- $fchr : writing...\n"; | |
969 for my $i (1..$nb_links){ | |
970 | |
971 print $fh join("\t",@{$pt{$i}})."\n"; #all links output | |
972 } | |
973 | |
974 $fh->close; | |
975 | |
976 undef %links; | |
977 undef %pt; | |
978 | |
979 } | |
980 #------------------------------------------------------------------------------# | |
981 #------------------------------------------------------------------------------# | |
982 #get the new coordinates of each link from the mate list | |
983 sub defineCoordsLinks{ | |
984 | |
985 my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; | |
986 | |
987 my @sfile=split(/\./,$links_file); | |
988 my $fchr=$sfile[$#sfile-2]; | |
989 | |
990 my $fh = new FileHandle; | |
991 my $fh2 = new FileHandle; | |
992 | |
993 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
994 $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; | |
995 | |
996 print LOG "# $fchr : Defining precise link coordinates...\n"; | |
997 | |
998 my $record=0; | |
999 my $warn=100000; | |
1000 | |
1001 my %coords; | |
1002 my %strands; | |
1003 my %order; | |
1004 my %ends_order; | |
1005 | |
1006 while(<$fh>){ | |
1007 | |
1008 | |
1009 my ($col1,$col2)=(1,2); #for an intrachromosomal link | |
1010 my $diffchr=0; #difference between chr1 and chr2 | |
1011 my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; | |
1012 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); | |
1013 if ($chr1 != $chr2){ #for an interchromosomal link | |
1014 $col1=$col2=0; #no distinction | |
1015 $diffchr=1; | |
1016 } | |
1017 | |
1018 my @pairs=split(",",$mates_list); | |
1019 | |
1020 $coords{$col1}{$chr1}->{start}=undef; | |
1021 $coords{$col1}{$chr1}->{end}=undef; | |
1022 $coords{$col2}{$chr2}->{start}=undef; | |
1023 $coords{$col2}{$chr2}->{end}=undef; | |
1024 $strands{$col1}{$chr1}=undef; | |
1025 $strands{$col2}{$chr2}=undef; | |
1026 $ends_order{$col1}{$chr1}=undef; | |
1027 $ends_order{$col2}{$chr2}=undef; | |
1028 | |
1029 | |
1030 $order{$col1}{$chr1}->{index}->{1}=undef; | |
1031 $order{$col1}{$chr1}->{index}->{2}=undef; | |
1032 $order{$col2}{$chr2}->{index}->{1}=undef; | |
1033 $order{$col2}{$chr2}->{index}->{2}=undef; | |
1034 $order{$col1}{$chr1}->{order}=undef; | |
1035 $order{$col2}{$chr2}->{order}=undef; | |
1036 | |
1037 $record++; | |
1038 | |
1039 for my $p (0..$#pairs){ #for each pair | |
1040 | |
1041 my ($coord_start_read1,$coord_end_read1); | |
1042 my ($coord_start_read2,$coord_end_read2); | |
1043 my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); | |
1044 my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); | |
1045 | |
1046 if(!$diffchr){ #for a intrachromosomal link | |
1047 if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column | |
1048 ($col1,$col2)=(2,1); | |
1049 }else{ | |
1050 ($col1,$col2)=(1,2); | |
1051 } | |
1052 } | |
1053 | |
1054 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads | |
1055 push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); | |
1056 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); | |
1057 push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); | |
1058 push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); | |
1059 push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); | |
1060 push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); | |
1061 push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); | |
1062 } | |
1063 | |
1064 ($col1,$col2)=(1,2) if(!$diffchr); | |
1065 | |
1066 my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region | |
1067 my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); | |
1068 my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); | |
1069 my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); | |
1070 | |
1071 @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; | |
1072 @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; | |
1073 | |
1074 foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) | |
1075 foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ | |
1076 | |
1077 if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ | |
1078 ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; | |
1079 ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; | |
1080 last; | |
1081 } | |
1082 } | |
1083 } | |
1084 | |
1085 foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID | |
1086 foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ | |
1087 | |
1088 if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ | |
1089 ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; | |
1090 ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; | |
1091 last; | |
1092 } | |
1093 } | |
1094 } | |
1095 | |
1096 @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads | |
1097 @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); | |
1098 @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); | |
1099 @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); | |
1100 @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); | |
1101 @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); | |
1102 @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); | |
1103 | |
1104 | |
1105 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output | |
1106 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, | |
1107 scalar @pairs, | |
1108 join(",",@pairs), | |
1109 join(",",@{$strands{$col1}{$chr1}}), | |
1110 join(",",@{$strands{$col2}{$chr2}}), | |
1111 join(",",@{$ends_order{$col1}{$chr1}}), | |
1112 join(",",@{$ends_order{$col2}{$chr2}}), | |
1113 join(",",@{$order{$col1}{$chr1}->{order}}), | |
1114 join(",",@{$order{$col2}{$chr2}->{order}}), | |
1115 join(",",@{$coords{$col1}{$chr1}->{start}}), | |
1116 join(",",@{$coords{$col2}{$chr2}->{start}})); | |
1117 | |
1118 print $fh2 join("\t",@link)."\n"; | |
1119 | |
1120 if($record>=$warn){ | |
1121 print LOG "-- $fchr : $warn links processed\n"; | |
1122 $warn+=100000; | |
1123 } | |
1124 } | |
1125 $fh->close; | |
1126 $fh2->close; | |
1127 | |
1128 print LOG "-- $fchr : Total : $record links processed\n"; | |
1129 | |
1130 } | |
1131 #------------------------------------------------------------------------------# | |
1132 #------------------------------------------------------------------------------# | |
1133 #Sort links according the concerned chromosomes and their coordinates | |
1134 sub sortLinks{ | |
1135 | |
1136 my ($links_file,$sortedlinks_file,$unique)=@_; | |
1137 | |
1138 my @sfile=split(/\./,$links_file); | |
1139 my $fchr=$sfile[$#sfile-2]; | |
1140 | |
1141 | |
1142 print LOG "# $fchr : Sorting links...\n"; | |
1143 | |
1144 my $pipe=($unique)? "| sort -u":""; | |
1145 system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; | |
1146 | |
1147 } | |
1148 #------------------------------------------------------------------------------# | |
1149 #------------------------------------------------------------------------------# | |
1150 #removal of fully overlapped links | |
1151 sub removeFullyOverlappedLinks{ | |
1152 | |
1153 my ($links_file,$nrlinks_file,$warn_out)=@_; | |
1154 | |
1155 my %pt; | |
1156 my $n=1; | |
1157 | |
1158 my @sfile=split(/\./,$links_file); | |
1159 my $fchr=$sfile[$#sfile-2]; | |
1160 | |
1161 my $fh = new FileHandle; | |
1162 | |
1163 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
1164 while(<$fh>){ | |
1165 | |
1166 my @t=split("\t",$_); | |
1167 $pt{$n}=[@t]; | |
1168 $n++; | |
1169 } | |
1170 $fh->close; | |
1171 | |
1172 my $nb_links=$n-1; | |
1173 my $nb=$nb_links; | |
1174 | |
1175 my %pt2; | |
1176 my $nb2=1; | |
1177 my $record=0; | |
1178 my $warn=10000; | |
1179 | |
1180 print LOG "# $fchr : Removing fully overlapped links...\n"; | |
1181 | |
1182 LINK: | |
1183 | |
1184 for my $i (1..$nb){ | |
1185 | |
1186 my @link=(); | |
1187 my @next_link=(); | |
1188 my $ind1=$i; | |
1189 | |
1190 $record++; | |
1191 if($record>=$warn){ | |
1192 print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; | |
1193 $warn+=10000; | |
1194 } | |
1195 | |
1196 if(exists $pt{$ind1}){ | |
1197 @link=@{$pt{$ind1}}; #link1 | |
1198 }else{ | |
1199 next LINK; | |
1200 } | |
1201 | |
1202 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 | |
1203 my @mates=deleteBadOrderSensePairs(split(",",$link[7])); | |
1204 | |
1205 my $ind2=$ind1+1; | |
1206 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link | |
1207 | |
1208 if($ind2<=$nb){ | |
1209 | |
1210 @next_link=@{$pt{$ind2}}; #link2 | |
1211 my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 | |
1212 my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
1213 | |
1214 while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter | |
1215 | |
1216 if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 | |
1217 | |
1218 $ind2++; | |
1219 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); | |
1220 | |
1221 if($ind2>$nb){ #if no more link in the file -> save link1 | |
1222 | |
1223 $pt2{$nb2}=\@link; | |
1224 $nb2++; | |
1225 next LINK; | |
1226 } | |
1227 | |
1228 @next_link=@{$pt{$ind2}}; | |
1229 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); | |
1230 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
1231 next; | |
1232 } | |
1233 | |
1234 my %mates=map{$_ =>1} @mates; #get the equal number of mates | |
1235 my @same_mates = grep( $mates{$_}, @next_mates ); | |
1236 my $nb_mates= scalar @same_mates; | |
1237 | |
1238 if($nb_mates == scalar @mates){ | |
1239 | |
1240 delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 | |
1241 next LINK; #go to link2, link2 becomes link1 | |
1242 | |
1243 }else{ | |
1244 delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 | |
1245 $ind2++; #we continue by checking the next link2 | |
1246 $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); | |
1247 | |
1248 if($ind2>$nb){ #if no more link in the file -> save link1 | |
1249 | |
1250 $pt2{$nb2}=\@link; | |
1251 $nb2++; | |
1252 next LINK; | |
1253 } | |
1254 | |
1255 @next_link=@{$pt{$ind2}}; #get info of link2 | |
1256 ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); | |
1257 @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); | |
1258 | |
1259 } | |
1260 } | |
1261 } | |
1262 $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 | |
1263 $nb2++; | |
1264 } | |
1265 | |
1266 print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; | |
1267 | |
1268 #OUTPUT | |
1269 | |
1270 $fh = new FileHandle; | |
1271 $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; | |
1272 print LOG "-- $fchr : writing...\n"; | |
1273 for my $i (1..$nb2-1){ | |
1274 | |
1275 print $fh join("\t",@{$pt2{$i}}); #all links output | |
1276 } | |
1277 | |
1278 close $fh; | |
1279 | |
1280 print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); | |
1281 | |
1282 undef %pt; | |
1283 undef %pt2; | |
1284 } | |
1285 #------------------------------------------------------------------------------# | |
1286 sub postFiltering { | |
1287 | |
1288 my ($links_file,$pflinks_file, $finalScore_thres)=@_; | |
1289 | |
1290 my @sfile=split(/\./,$links_file); | |
1291 my $fchr=$sfile[$#sfile-2]; | |
1292 | |
1293 | |
1294 my ($nb,$nb2)=(0,0); | |
1295 | |
1296 print LOG "# $fchr : Post-filtering links...\n"; | |
1297 print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; | |
1298 | |
1299 my $fh = new FileHandle; | |
1300 my $fh2 = new FileHandle; | |
1301 | |
1302 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
1303 $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; | |
1304 | |
1305 | |
1306 while(<$fh>){ | |
1307 | |
1308 my @t=split("\t",$_); | |
1309 my $score=$t[$#t-1]; | |
1310 | |
1311 if($score >= $finalScore_thres){ | |
1312 print $fh2 join("\t", @t); | |
1313 $nb2++; | |
1314 } | |
1315 $nb++; | |
1316 } | |
1317 $fh->close; | |
1318 $fh2->close; | |
1319 | |
1320 print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; | |
1321 print LOG "-- $fchr : output created: $pflinks_file\n"; | |
1322 } | |
1323 | |
1324 | |
1325 | |
1326 #------------------------------------------------------------------------------# | |
1327 #------------------------------------------------------------------------------# | |
1328 #Filtering of the links | |
1329 sub strandFiltering{ | |
1330 | |
1331 my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, | |
1332 $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; | |
1333 | |
1334 my @sfile=split(/\./,$links_file); | |
1335 my $fchr=$sfile[$#sfile-1]; | |
1336 | |
1337 | |
1338 my %chrs; | |
1339 my %chrs1; | |
1340 my %chrs2; | |
1341 my $nb_chrs; | |
1342 my $exclude; | |
1343 | |
1344 if($chromosomes){ | |
1345 my @chrs=split(",",$chromosomes); | |
1346 $nb_chrs=scalar @chrs; | |
1347 $exclude=($chrs[0]=~/^\-/)? 1:0; | |
1348 for my $chrName (@chrs){ | |
1349 $chrName=~s/^(\-)//; | |
1350 my $col=($chrName=~s/_(1|2)$//); | |
1351 | |
1352 if(!$col){ | |
1353 $chrs{$chrID->{$chrName}}=undef | |
1354 }else{ | |
1355 $chrs1{$chrID->{$chrName}}=undef if($1==1); | |
1356 $chrs2{$chrID->{$chrName}}=undef if($1==2); | |
1357 } | |
1358 } | |
1359 } | |
1360 | |
1361 my $record=0; | |
1362 my $nb_links=0; | |
1363 my $warn=10000; | |
1364 | |
1365 my $sens_ratio_threshold=0.6; | |
1366 | |
1367 print LOG "\# Filtering procedure...\n"; | |
1368 print LOG "\# Number of pairs and strand filtering...\n"; | |
1369 print LOG "-- file=$links_file\n"; | |
1370 print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). | |
1371 ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; | |
1372 | |
1373 | |
1374 | |
1375 my $fh = new FileHandle; | |
1376 my $fh2 = new FileHandle; | |
1377 | |
1378 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
1379 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
1380 | |
1381 while(<$fh>){ | |
1382 | |
1383 my @t=split; #for each link | |
1384 my $is_good=1; | |
1385 $record++; | |
1386 | |
1387 | |
1388 if($chromosomes){ | |
1389 | |
1390 my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); | |
1391 | |
1392 if(!$exclude){ | |
1393 $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; | |
1394 $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); | |
1395 $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); | |
1396 }else{ | |
1397 $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; | |
1398 $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); | |
1399 } | |
1400 } | |
1401 | |
1402 $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs | |
1403 if($is_good && $strand_filtering){ #if filtering according the strand sense | |
1404 | |
1405 my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense | |
1406 my @strands1=split(/,/,$t[8]); | |
1407 my @strands2=split(/,/,$t[9]); | |
1408 | |
1409 my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); | |
1410 | |
1411 my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR | |
1412 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF | |
1413 | |
1414 my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", | |
1415 substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", | |
1416 inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); | |
1417 | |
1418 if($t[6] == 1){ | |
1419 | |
1420 push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); | |
1421 | |
1422 }else{ | |
1423 | |
1424 tie (my %class,'Tie::IxHash'); | |
1425 my $split; | |
1426 | |
1427 foreach my $i (0..$#mates){ | |
1428 $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group | |
1429 } | |
1430 | |
1431 my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; | |
1432 my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; | |
1433 my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); | |
1434 | |
1435 if($sens_ratio < $sens_ratio_threshold){ | |
1436 %class=(1=>'FF', 2=>'FR'); | |
1437 $split=1; | |
1438 }else{ | |
1439 $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class | |
1440 $split=0; | |
1441 } | |
1442 | |
1443 $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); | |
1444 } | |
1445 } | |
1446 | |
1447 if($is_good){ #PRINT | |
1448 | |
1449 my $nb=scalar @t; | |
1450 if($nb > 20){ | |
1451 my @t2=splice(@t,0,20); | |
1452 print $fh2 join("\t",@t2)."\n"; | |
1453 $nb_links++; | |
1454 } | |
1455 $nb_links++; | |
1456 print $fh2 join("\t",@t)."\n"; | |
1457 } | |
1458 | |
1459 if($record>=$warn){ | |
1460 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; | |
1461 $warn+=10000; | |
1462 } | |
1463 } | |
1464 $fh->close; | |
1465 $fh2->close; | |
1466 | |
1467 print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); | |
1468 | |
1469 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
1470 | |
1471 | |
1472 } | |
1473 #------------------------------------------------------------------------------# | |
1474 #------------------------------------------------------------------------------# | |
1475 sub getConsistentSenseLinks{ | |
1476 | |
1477 my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; | |
1478 | |
1479 my $npairs=scalar @$mates; | |
1480 | |
1481 my @ends_order1 = split (/,/,$$t[10]); | |
1482 my @ends_order2 = split (/,/,$$t[11]); | |
1483 my @order1 = split (/,/,$$t[12]); | |
1484 my @order2 = split (/,/,$$t[13]); | |
1485 my @positions1 = split (/,/,$$t[14]); | |
1486 my @positions2 = split (/,/,$$t[15]); | |
1487 | |
1488 my @newlink; | |
1489 | |
1490 foreach my $ind (keys %{$class} ){ | |
1491 | |
1492 tie (my %flink,'Tie::IxHash'); | |
1493 my @orders2remove=(); | |
1494 | |
1495 foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group | |
1496 | |
1497 if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ | |
1498 push(@{$flink{mates}},$$mates[$i]); | |
1499 push(@{$flink{strands1}},$$strands1[$i]); | |
1500 push(@{$flink{strands2}},$$strands2[$i]); | |
1501 push(@{$flink{ends_order1}},$ends_order1[$i]); | |
1502 push(@{$flink{ends_order2}},$ends_order2[$i]); | |
1503 push(@{$flink{positions1}},$positions1[$i]); | |
1504 push(@{$flink{positions2}},$positions2[$i]); | |
1505 | |
1506 }else{ | |
1507 push(@orders2remove,$order1[$i]); | |
1508 } | |
1509 } | |
1510 | |
1511 @{$flink{order1}}=(); | |
1512 @{$flink{order2}}=(); | |
1513 if(scalar @orders2remove > 0){ | |
1514 getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) | |
1515 }else{ | |
1516 @{$flink{order1}}=@order1; | |
1517 @{$flink{order2}}=@order2; | |
1518 } | |
1519 | |
1520 my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); | |
1521 my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); | |
1522 | |
1523 my $fnpairs=scalar @{$flink{mates}}; | |
1524 my $strand_filtering_ratio=$fnpairs."/".$npairs; | |
1525 my $real_ratio=$fnpairs/$npairs; | |
1526 | |
1527 if($fnpairs>=$thres){ #filtering according the number of pairs | |
1528 | |
1529 push(@newlink, | |
1530 $$t[0], | |
1531 min(min(@{$flink{positions1}}),min(@ends1)), | |
1532 max(max(@{$flink{positions1}}),max(@ends1)), | |
1533 $$t[3], | |
1534 min(min(@{$flink{positions2}}),min(@ends2)), | |
1535 max(max(@{$flink{positions2}}),max(@ends2)), | |
1536 $fnpairs, | |
1537 join(",",@{$flink{mates}}), | |
1538 join(",",@{$flink{strands1}}), | |
1539 join(",",@{$flink{strands2}}), | |
1540 join(",",@{$flink{ends_order1}}), | |
1541 join(",",@{$flink{ends_order2}}), | |
1542 join(",",@{$flink{order1}}), | |
1543 join(",",@{$flink{order2}}), | |
1544 join(",",@{$flink{positions1}}), | |
1545 join(",",@{$flink{positions2}}), | |
1546 $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, | |
1547 $strand_filtering_ratio, | |
1548 $real_ratio, | |
1549 $npairs | |
1550 ); | |
1551 } | |
1552 } | |
1553 | |
1554 if (grep {defined($_)} @newlink) { | |
1555 @$t=@newlink; | |
1556 return 1 | |
1557 } | |
1558 return 0; | |
1559 | |
1560 } | |
1561 #------------------------------------------------------------------------------# | |
1562 #------------------------------------------------------------------------------# | |
1563 sub getNewOrders{ | |
1564 | |
1565 my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; | |
1566 my $j=1; | |
1567 my $k=1; | |
1568 for my $i (0..$#{$tab2}){ | |
1569 my $c=0; | |
1570 for my $j (0..$#{$list}){ | |
1571 $c++ if(${$list}[$j] < ${$tab2}[$i]); | |
1572 if(${$list}[$j] == ${$tab2}[$i]){ | |
1573 $c=-1; last; | |
1574 } | |
1575 } | |
1576 if($c!=-1){ | |
1577 push(@{$newtab2}, ${$tab2}[$i]-$c); | |
1578 push(@{$newtab1}, $k); | |
1579 $k++; | |
1580 } | |
1581 } | |
1582 } | |
1583 | |
1584 #------------------------------------------------------------------------------# | |
1585 #------------------------------------------------------------------------------# | |
1586 #Filtering of the links using their order | |
1587 sub orderFiltering { | |
1588 | |
1589 my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; | |
1590 | |
1591 my @sfile=split(/\./,$links_file); | |
1592 my $fchr=$sfile[$#sfile-2]; | |
1593 | |
1594 | |
1595 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; | |
1596 | |
1597 my $record=0; | |
1598 my $warn=10000; | |
1599 my $nb_links=0; | |
1600 | |
1601 my $quant05 = 1.644854; | |
1602 my $quant001 = 3.090232; | |
1603 my $alphaDist = $quant05 * 2 * $sigma; | |
1604 my $maxFragmentLength = &floor($quant001 * $sigma + $mu); | |
1605 | |
1606 print LOG "\# Filtering by order...\n"; | |
1607 print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; | |
1608 print LOG "-- distance between comparable pairs was set to $alphaDist\n"; | |
1609 print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; | |
1610 | |
1611 | |
1612 my $fh = new FileHandle; | |
1613 my $fh2 = new FileHandle; | |
1614 | |
1615 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
1616 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
1617 | |
1618 while(<$fh>){ | |
1619 | |
1620 $record++; | |
1621 my @t = split; | |
1622 my ($chr1,$chr2,$mates_list)=@t[0,3,7]; | |
1623 my @pairs=split(",",$mates_list); | |
1624 ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); | |
1625 my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; | |
1626 my $numberOfPairs = $t[6]; | |
1627 my @strand1 = split (/,/,$t[8]); | |
1628 my @strand2 = split (/,/,$t[9]); | |
1629 my @ends_order1 = split (/,/,$t[10]); | |
1630 my @ends_order2 = split (/,/,$t[11]); | |
1631 my @order1 = split (/,/,$t[12]); | |
1632 my @order2 = split (/,/,$t[13]); | |
1633 my @positions1 = split (/,/,$t[14]); | |
1634 my @positions2 = split (/,/,$t[15]); | |
1635 my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); | |
1636 my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); | |
1637 my $clusterCoordinates_chr1; | |
1638 my $clusterCoordinates_chr2; | |
1639 my $reads_left = 0; | |
1640 | |
1641 my $ifRenv = $t[16]; | |
1642 my $strand_ratio_filtering=$t[17]; | |
1643 | |
1644 #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct | |
1645 my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); | |
1646 #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct | |
1647 #These pairs are annotated as BED pairs forever! They won't be recycled! | |
1648 my $table; | |
1649 for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs | |
1650 my $nonAdeq = 0; | |
1651 for my $j (0..$i-1) { | |
1652 if (exists($table->{$j}->{$i})) { | |
1653 $nonAdeq++; | |
1654 $table->{$i}->{$j} = 1; | |
1655 } | |
1656 } | |
1657 for my $j ($i+1..$numberOfPairs-1) { | |
1658 if ($positions1[$j]-$positions1[$i]>$alphaDist) { | |
1659 if (&reversed ($i,$j,$ifRenv,\@positions2)) { | |
1660 $nonAdeq++; | |
1661 $table->{$i}->{$j} = 1; | |
1662 } | |
1663 } | |
1664 } | |
1665 $table->{$i}->{nonAdeq} = $nonAdeq; | |
1666 } | |
1667 | |
1668 for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation | |
1669 &remove($bad,$table); | |
1670 } | |
1671 | |
1672 my @falseReads; | |
1673 #RRRR-F -> RRRR or R-FFFF -> FFFF | |
1674 @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); | |
1675 #these pairs will be recycled later as $secondTable | |
1676 for my $bad (@falseReads) { | |
1677 &remove($bad,$table); | |
1678 } | |
1679 | |
1680 my $bad = &check($table); | |
1681 while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER | |
1682 # push (@falseReads, $bad); remove completely!!! | |
1683 &remove($bad,$table); | |
1684 $bad = &check($table); | |
1685 } | |
1686 | |
1687 $reads_left = scalar keys %{$table}; | |
1688 my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); | |
1689 my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); | |
1690 my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); | |
1691 my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); | |
1692 | |
1693 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
1694 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
1695 | |
1696 my $ifBalanced = 'UNBAL'; | |
1697 my $secondTable; | |
1698 my $clusterCoordinates; | |
1699 my ($break_pont_chr1,$break_pont_chr2); | |
1700 | |
1701 my $signatureType=""; | |
1702 | |
1703 my $maxCoord1 =$chr->{$chr1}->{length}; | |
1704 my $maxCoord2 =$chr->{$chr2}->{length}; | |
1705 | |
1706 if (scalar @falseReads) { | |
1707 @falseReads = sort @falseReads; | |
1708 #now delete FRFR choosing the majority | |
1709 my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF | |
1710 @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled | |
1711 my %hashTmp; | |
1712 for my $count1 (0..scalar(@falseReads)-1) { | |
1713 my $i = $falseReads[$count1]; | |
1714 $hashTmp{$i} = 1; | |
1715 for my $bad (@newfalseReads) { | |
1716 if ($bad == $i) { | |
1717 delete $hashTmp{$i}; | |
1718 next; | |
1719 } | |
1720 } | |
1721 } | |
1722 @falseReads = sort keys %hashTmp; #what is left | |
1723 for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected | |
1724 my $nonAdeq = 0; | |
1725 my $i = $falseReads[$count1]; | |
1726 | |
1727 for my $count2 (0..$count1-1) { | |
1728 my $j = $falseReads[$count2]; | |
1729 if (exists($secondTable->{$j}->{$i})) { | |
1730 $nonAdeq++; | |
1731 $secondTable->{$i}->{$j} = 1; | |
1732 } | |
1733 } | |
1734 for my $count2 ($count1+1..scalar(@falseReads)-1) { | |
1735 my $j = $falseReads[$count2]; | |
1736 if ($positions1[$j]-$positions1[$i]>$alphaDist) { | |
1737 if (&reversed ($i,$j,$ifRenv,\@positions2)) { | |
1738 $nonAdeq++; | |
1739 $secondTable->{$i}->{$j} = 1; | |
1740 } | |
1741 } | |
1742 } | |
1743 $secondTable->{$i}->{nonAdeq} = $nonAdeq; | |
1744 } | |
1745 | |
1746 my @falseReads2; | |
1747 my $bad = &check($secondTable); | |
1748 while ($bad ne "OK") { #clear the table to reject non adequate pairs | |
1749 push (@falseReads2, $bad); | |
1750 &remove($bad,$secondTable); | |
1751 $bad = &check($secondTable); | |
1752 } | |
1753 if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { | |
1754 my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); | |
1755 my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); | |
1756 my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); | |
1757 my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); | |
1758 | |
1759 $ifBalanced = 'BAL'; | |
1760 | |
1761 if ($ifBalanced eq 'BAL') { | |
1762 | |
1763 if (scalar keys %{$table} < $nb_pairs_order_threshold) { | |
1764 $ifBalanced = 'UNBAL'; #kill cluster 1! | |
1765 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
1766 $reads_left = scalar keys %{$table}; | |
1767 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; | |
1768 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; | |
1769 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; | |
1770 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; | |
1771 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
1772 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
1773 | |
1774 } else { | |
1775 | |
1776 $reads_left += scalar keys %{$secondTable}; | |
1777 next if ($reads_left < $nb_pairs_threshold); | |
1778 | |
1779 if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { | |
1780 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
1781 | |
1782 ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); | |
1783 ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); | |
1784 ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); | |
1785 ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); | |
1786 | |
1787 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; | |
1788 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; | |
1789 }else { | |
1790 $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; | |
1791 $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; | |
1792 } | |
1793 $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); | |
1794 $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); | |
1795 $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); | |
1796 $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); | |
1797 #to calculate breakpoints one need to take into account read orientation in claster.. | |
1798 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
1799 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
1800 | |
1801 | |
1802 my @index1 = keys %{$table}; | |
1803 my @index2 = keys %{$secondTable}; | |
1804 | |
1805 my (@generalStrand1,@generalStrand2) = 0; | |
1806 | |
1807 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
1808 $leftLetterOk = 'R'; | |
1809 $rightLetterOk = 'F'; | |
1810 @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); | |
1811 @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); | |
1812 } else { | |
1813 @generalStrand1 = @strand1; | |
1814 @generalStrand2 = @strand2; # TODO check if it is correct | |
1815 } | |
1816 if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) | |
1817 $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; | |
1818 | |
1819 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { | |
1820 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
1821 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; | |
1822 $signatureType = "TRANSLOC"; | |
1823 } else { | |
1824 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; | |
1825 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; | |
1826 $signatureType = "INS_FRAGMT"; | |
1827 } | |
1828 | |
1829 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { | |
1830 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
1831 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; | |
1832 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; | |
1833 $signatureType = "INV_INS_FRAGMT"; | |
1834 } else { | |
1835 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; | |
1836 $signatureType = "INV_TRANSLOC"; | |
1837 } | |
1838 } else { | |
1839 #should not occur | |
1840 print STDERR "\nError in orderFiltering\n\n"; | |
1841 } | |
1842 } | |
1843 | |
1844 elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) | |
1845 $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; | |
1846 $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; | |
1847 if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { | |
1848 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
1849 $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; | |
1850 $signatureType = "INV_INS_FRAGMT"; | |
1851 } else { | |
1852 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; | |
1853 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; | |
1854 $signatureType = "INV_COAMPLICON"; | |
1855 } | |
1856 | |
1857 } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { | |
1858 if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { | |
1859 $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; | |
1860 $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; | |
1861 $signatureType = "COAMPLICON"; | |
1862 } else { | |
1863 $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; | |
1864 $signatureType = "INS_FRAGMT"; | |
1865 } | |
1866 } else { | |
1867 #should not occur | |
1868 $signatureType = "UNDEFINED"; | |
1869 } | |
1870 } | |
1871 else { # (F,F) or (R,R) something strange. We will discard the smallest cluster | |
1872 $ifBalanced = 'UNBAL'; | |
1873 if (scalar keys %{$secondTable} > scalar keys %{$table}) { | |
1874 ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 | |
1875 | |
1876 $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; | |
1877 $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; | |
1878 $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; | |
1879 $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; | |
1880 $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; | |
1881 $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; | |
1882 } | |
1883 $reads_left = scalar keys %{$table}; | |
1884 } | |
1885 if ($ifBalanced eq 'BAL') { | |
1886 $ifRenv = $signatureType; | |
1887 } | |
1888 } | |
1889 } | |
1890 } | |
1891 } | |
1892 if ($ifBalanced ne 'BAL') { | |
1893 #define possible break point | |
1894 $coord_start_chr1 = $coord_start_chr1_cluster1; | |
1895 $coord_end_chr1 = $coord_end_chr1_cluster1; | |
1896 $coord_start_chr2 = $coord_start_chr2_cluster1; | |
1897 $coord_end_chr2 = $coord_end_chr2_cluster1; | |
1898 | |
1899 my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; | |
1900 my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; | |
1901 | |
1902 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
1903 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
1904 | |
1905 my @index = keys %{$table}; | |
1906 unless ($diff_sense_ends) { | |
1907 my $firstEndOrder1 = $ends_order1[$index[0]]; | |
1908 my $firstEndOrder2 = $ends_order2[$index[0]]; | |
1909 $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; | |
1910 $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; | |
1911 } else { | |
1912 $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; | |
1913 $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; | |
1914 } | |
1915 | |
1916 if ($chr1 ne $chr2){ | |
1917 $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); | |
1918 $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); | |
1919 } | |
1920 } | |
1921 | |
1922 if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { | |
1923 next; #discard the link | |
1924 } | |
1925 if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { | |
1926 next; #discard the link | |
1927 } | |
1928 my $ratioTxt = "$reads_left/".(scalar @pairs); | |
1929 my ($n1,$nTot) = split ("/",$strand_ratio_filtering); | |
1930 my $ratioReal = $reads_left/$nTot; | |
1931 | |
1932 if ($coord_start_chr1<=0) { | |
1933 $coord_start_chr1=1; | |
1934 } | |
1935 if ($coord_start_chr2<=0) { | |
1936 $coord_start_chr2=1; | |
1937 } | |
1938 #create output | |
1939 my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output | |
1940 $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, | |
1941 $reads_left, | |
1942 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), | |
1943 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), | |
1944 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), | |
1945 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), | |
1946 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), | |
1947 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), | |
1948 &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), | |
1949 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), | |
1950 &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), | |
1951 $ifRenv, | |
1952 $strand_ratio_filtering, | |
1953 $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, | |
1954 $ratioReal, $nTot); | |
1955 | |
1956 $nb_links++; | |
1957 print $fh2 join("\t",@link)."\n"; | |
1958 | |
1959 if($record>=$warn){ | |
1960 print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; | |
1961 $warn+=10000; | |
1962 } | |
1963 | |
1964 } | |
1965 $fh->close; | |
1966 $fh2->close; | |
1967 | |
1968 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
1969 | |
1970 } | |
1971 #------------------------------------------------------------------------------# | |
1972 #------------------------------------------------------------------------------# | |
1973 #gets information about ends positions given start, direction and order | |
1974 sub getEnds { | |
1975 my ($ends,$starts,$strand,$end_order,$tag_length) = @_; | |
1976 for my $i (0..scalar(@{$starts})-1) { | |
1977 $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); | |
1978 } | |
1979 } | |
1980 sub getEnd { | |
1981 my ($start,$strand, $end_order,$tag_length) = @_; | |
1982 return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; | |
1983 } | |
1984 #------------------------------------------------------------------------------# | |
1985 #------------------------------------------------------------------------------# | |
1986 #gets starts and ends Coords when start=leftmost given positions, directions and orders | |
1987 sub getCoordswithLeftMost { | |
1988 | |
1989 my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; | |
1990 | |
1991 for my $i (0..scalar(@{$positions})-1) { | |
1992 | |
1993 if($strand->[$i] eq 'F'){ | |
1994 $starts->[$i]=$positions->[$i]; | |
1995 $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; | |
1996 }else{ | |
1997 $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; | |
1998 $ends->[$i]=$positions->[$i]; | |
1999 } | |
2000 } | |
2001 } | |
2002 #------------------------------------------------------------------------------# | |
2003 #------------------------------------------------------------------------------# | |
2004 sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering | |
2005 | |
2006 my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; | |
2007 | |
2008 my @sfile=split(/\./,$links_file); | |
2009 my $fchr=$sfile[$#sfile-2]; | |
2010 | |
2011 | |
2012 my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; | |
2013 | |
2014 my $record=0; | |
2015 my $nb_links=0; | |
2016 my $warn=10000; | |
2017 | |
2018 print LOG "\# Filtering out normal pairs using insert size...\n"; | |
2019 print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; | |
2020 print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". | |
2021 ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; | |
2022 print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". | |
2023 ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; | |
2024 print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); | |
2025 | |
2026 my $fh = new FileHandle; | |
2027 my $fh2 = new FileHandle; | |
2028 | |
2029 $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; | |
2030 $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; | |
2031 | |
2032 while(<$fh>){ | |
2033 | |
2034 $record++; | |
2035 my @t = split; | |
2036 my ($chr1,$chr2,$mates_list)=@t[0,3,7]; | |
2037 | |
2038 if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), | |
2039 $nb_links++; | |
2040 | |
2041 $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); | |
2042 $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); | |
2043 | |
2044 $t[16].= "\t"; | |
2045 $t[19].= "\t"; | |
2046 | |
2047 print $fh2 join("\t",@t)."\n"; | |
2048 | |
2049 if($record>=$warn){ | |
2050 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; | |
2051 $warn+=10000; | |
2052 } | |
2053 next; | |
2054 } | |
2055 | |
2056 my $ifRenv = $t[16]; | |
2057 my $ifBalanced = "UNBAL"; | |
2058 $ifBalanced = $t[18] if ($order_filtering); | |
2059 | |
2060 my $numberOfPairs = $t[6]; | |
2061 my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); | |
2062 my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); | |
2063 | |
2064 if ($ifBalanced eq "BAL") { | |
2065 | |
2066 if ($ifRenv eq "INV_TRANSLOC") { | |
2067 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment | |
2068 } | |
2069 if ($ifRenv eq "NORMAL_SENSE") { | |
2070 $ifRenv = "TRANSLOC"; | |
2071 } | |
2072 if ($ifRenv eq "REVERSE_SENSE") { | |
2073 $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment | |
2074 } | |
2075 $t[19].= "\t"; | |
2076 | |
2077 my $meanDistance = 0; | |
2078 | |
2079 for my $i (0..$numberOfPairs-1) { | |
2080 $meanDistance += $positions2[$i]-$positions1[$i]; | |
2081 } | |
2082 $meanDistance /= $numberOfPairs; | |
2083 | |
2084 $t[16] = $ifRenv."\t".$meanDistance; | |
2085 #dont touch the annotation. It should be already OK. | |
2086 | |
2087 } else { | |
2088 #only for unbalanced | |
2089 | |
2090 my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); | |
2091 | |
2092 my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. | |
2093 (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; | |
2094 my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. | |
2095 (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; | |
2096 | |
2097 my $indel_type = $ifRenv; | |
2098 | |
2099 my $meanDistance = "N/A"; | |
2100 | |
2101 ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster | |
2102 \@positions1, #assign N/A to $indel_type if unknown | |
2103 \@positions2, | |
2104 $ifRenv, | |
2105 $ifoverlap, | |
2106 $indel_sigma_threshold, | |
2107 $dup_sigma_threshold, | |
2108 $singleton_sigma_threshold, | |
2109 $mu, | |
2110 $sigma, | |
2111 $ifBalanced, | |
2112 $ends_sense_class, | |
2113 $ends_order_class, | |
2114 $mate_sense, | |
2115 $diff_sense_ends, | |
2116 ); | |
2117 | |
2118 #filtering of pairs with distance inconsistant with the SV | |
2119 if ($ifRenv ne "REVERSE_SENSE") { | |
2120 my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; | |
2121 my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; | |
2122 $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, | |
2123 $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); | |
2124 next if ($t[6] < $nb_pairs_threshold); | |
2125 }else{ | |
2126 $t[19].= "\t"; | |
2127 } | |
2128 $t[16] = $indel_type."\t".$meanDistance; | |
2129 } | |
2130 | |
2131 $nb_links++; | |
2132 | |
2133 print $fh2 join("\t",@t)."\n"; | |
2134 if($record>=$warn){ | |
2135 print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; | |
2136 $warn+=10000; | |
2137 } | |
2138 } | |
2139 $fh->close; | |
2140 $fh2->close; | |
2141 | |
2142 print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; | |
2143 | |
2144 } | |
2145 #------------------------------------------------------------------------------# | |
2146 #------------------------------------------------------------------------------# | |
2147 sub checkIndel { | |
2148 | |
2149 my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, | |
2150 $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; | |
2151 | |
2152 my $meanDistance = 0; | |
2153 | |
2154 for my $i (0..$numberOfPairs-1) { | |
2155 $meanDistance += $positions2->[$i]-$positions1->[$i]; | |
2156 } | |
2157 $meanDistance /= $numberOfPairs; | |
2158 | |
2159 return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); | |
2160 | |
2161 return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); | |
2162 | |
2163 if($diff_sense_ends){ | |
2164 return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; | |
2165 return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); | |
2166 }else{ | |
2167 return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && | |
2168 ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; | |
2169 } | |
2170 | |
2171 return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); | |
2172 | |
2173 return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; | |
2174 | |
2175 return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); | |
2176 return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); | |
2177 | |
2178 return ($meanDistance, "UNDEFINED"); | |
2179 } | |
2180 #------------------------------------------------------------------------------# | |
2181 #------------------------------------------------------------------------------# | |
2182 #sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) | |
2183 sub recalc_t_usingInsertSizeInfo { | |
2184 my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; | |
2185 | |
2186 my @badPairs; | |
2187 | |
2188 my @positions1 = getAllEntries($t->[14]); | |
2189 my @positions2 = getAllEntries($t->[15]); | |
2190 | |
2191 if ($meanDistance < $mu) { | |
2192 for my $i (0..scalar(@positions1)-1) { | |
2193 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { | |
2194 push(@badPairs,$i); | |
2195 } | |
2196 } | |
2197 } else { | |
2198 for my $i (0..scalar(@positions1)-1) { | |
2199 if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { | |
2200 push(@badPairs,$i); | |
2201 } | |
2202 } | |
2203 } | |
2204 | |
2205 if (scalar (@badPairs)>0) { | |
2206 #print join("\t",@badPairs).": ".join("\t",@t)."\n"; | |
2207 #remove these inconsistant links | |
2208 $t->[6] -= scalar(@badPairs); #numberOfPairs | |
2209 return if ($t->[6] < $nb_pairs_threshold); | |
2210 | |
2211 $t->[7] = mark_values(\@badPairs, $t->[7]); | |
2212 $t->[8] = mark_values(\@badPairs, $t->[8]); | |
2213 $t->[9] = mark_values(\@badPairs, $t->[9]); | |
2214 $t->[10] = mark_values(\@badPairs, $t->[10]); | |
2215 $t->[11] = mark_values(\@badPairs, $t->[11]); | |
2216 | |
2217 $t->[12] = mark_indexes(\@badPairs, $t->[12]); | |
2218 $t->[13] = mark_indexes(\@badPairs, $t->[13]); | |
2219 | |
2220 $t->[14] = mark_values(\@badPairs, $t->[14]); | |
2221 $t->[15] = mark_values(\@badPairs, $t->[15]); | |
2222 $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio | |
2223 $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); | |
2224 ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); | |
2225 ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); | |
2226 | |
2227 #recalc breakpoints: | |
2228 my $quant001 = 3.090232; | |
2229 my $maxFragmentLength = &floor($quant001 * $sigma + $mu); | |
2230 $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); | |
2231 $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); | |
2232 #recalc total ratio | |
2233 $t->[22] = $t->[6] / $t->[23] if ($order_filtering); | |
2234 $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); | |
2235 | |
2236 @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); | |
2237 @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); | |
2238 | |
2239 $meanDistance = 0; | |
2240 | |
2241 for my $i (0..scalar(@positions1)-1) { | |
2242 $meanDistance += $positions2[$i]-$positions1[$i]; | |
2243 } | |
2244 $meanDistance /= scalar(@positions1); | |
2245 | |
2246 } else { | |
2247 $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); | |
2248 $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); | |
2249 | |
2250 } #nothing has been filtered | |
2251 return $meanDistance; | |
2252 } | |
2253 | |
2254 sub recalculate_ratio { | |
2255 my ($left, $ratio) = @_; | |
2256 my @elements = split (/\//,$ratio); | |
2257 $elements[1]= $elements[0]; | |
2258 $elements[0]=$left; | |
2259 return $ratio."\t".join("/",@elements); | |
2260 } | |
2261 | |
2262 sub recalc_breakpoints { | |
2263 my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; | |
2264 my $break_pont_chr; | |
2265 | |
2266 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
2267 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
2268 | |
2269 | |
2270 my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); | |
2271 | |
2272 unless ($diff_sense_ends) { | |
2273 $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; | |
2274 } else { | |
2275 $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; | |
2276 } | |
2277 return $break_pont_chr; | |
2278 } | |
2279 sub recalculate_boundaries { | |
2280 my ($startString,$senseString,$endsOrderString,$tag_length) = @_; | |
2281 my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); | |
2282 my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); | |
2283 my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); | |
2284 my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); | |
2285 my $coord_start_cluster = min(min(@positions),min(@ends)); | |
2286 my $coord_end_cluster = max(max(@positions),max(@ends)); | |
2287 return ($coord_start_cluster,$coord_end_cluster); | |
2288 } | |
2289 | |
2290 sub remove_indexes { | |
2291 my ($bads, $string) = @_; | |
2292 my @elements = deleteBadOrderSensePairs(split (/,/,$string)); | |
2293 for my $i (reverse sort %{$bads}) { | |
2294 delete $elements[$i]; | |
2295 } | |
2296 return "(".join(",",@elements).")"; | |
2297 } | |
2298 ##add @ to to elements | |
2299 sub mark_values { | |
2300 my ($bads, $string) = @_; | |
2301 my @elements = getAllEntries($string); | |
2302 for my $i (@{$bads}) { | |
2303 $elements[$i] .= "@"; | |
2304 } | |
2305 return "(".join(",",@elements).")"; | |
2306 } | |
2307 ##add @ to to indexes | |
2308 sub mark_indexes { | |
2309 my ($bads, $string) = @_; | |
2310 my @elements = getAllEntries($string); | |
2311 for my $i ((0..scalar(@elements)-1)) { | |
2312 for my $j (@{$bads}) { | |
2313 $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); | |
2314 } | |
2315 } | |
2316 | |
2317 return "(".join(",",@elements).")"; | |
2318 } | |
2319 | |
2320 #------------------------------------------------------------------------------# | |
2321 #------------------------------------------------------------------------------# | |
2322 sub redraw { | |
2323 | |
2324 my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; | |
2325 | |
2326 my $out; | |
2327 my @first_arr; | |
2328 if ($ifBalanced eq 'BAL') { | |
2329 my @second_arr; | |
2330 my $lastPushed = 1; | |
2331 if ($type == 1) { | |
2332 for my $i (0 .. scalar(@{$arr})-1) { | |
2333 if (exists ($table->{$i})) { | |
2334 push(@first_arr,$arr->[$i]); | |
2335 $lastPushed = 1; | |
2336 }elsif (exists ($secondTable->{$i})) { | |
2337 push(@second_arr,$arr->[$i]); | |
2338 $lastPushed = 2; | |
2339 } elsif ($lastPushed == 1) { | |
2340 if (exists ($badInFRSense->{$i})) { | |
2341 push(@first_arr,$arr->[$i]."\$"); | |
2342 }else { | |
2343 push(@first_arr,$arr->[$i]."*"); | |
2344 } | |
2345 } elsif ($lastPushed == 2) { | |
2346 if (exists ($badInFRSense->{$i})) { | |
2347 push(@second_arr,$arr->[$i]."\$"); | |
2348 }else { | |
2349 push(@second_arr,$arr->[$i]."*"); | |
2350 } | |
2351 } else {print "Error!";exit;} | |
2352 } | |
2353 } else { | |
2354 for my $i (@{$arr}) { | |
2355 if (exists ($table->{$i-1})) { | |
2356 push(@first_arr,$i); | |
2357 $lastPushed = 1; | |
2358 }elsif (exists ($secondTable->{$i-1})) { | |
2359 push(@second_arr,$i); | |
2360 $lastPushed = 2; | |
2361 } elsif ($lastPushed == 1) { | |
2362 if (exists ($badInFRSense->{$i-1})) { | |
2363 push(@first_arr,$i."\$"); | |
2364 }else { | |
2365 push(@first_arr,$i."*"); | |
2366 } | |
2367 } elsif ($lastPushed == 2) { | |
2368 if (exists ($badInFRSense->{$i-1})) { | |
2369 push(@second_arr,$i."\$"); | |
2370 }else { | |
2371 push(@second_arr,$i."*"); | |
2372 } | |
2373 } else {print "Error!";exit;} | |
2374 } | |
2375 } | |
2376 $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; | |
2377 } | |
2378 else { | |
2379 if ($type == 1) { | |
2380 for my $i (0 .. scalar(@{$arr})-1) { | |
2381 if (exists ($table->{$i})) { | |
2382 push(@first_arr,$arr->[$i]); | |
2383 } else { | |
2384 if (exists ($badInFRSense->{$i})) { | |
2385 push(@first_arr,$arr->[$i]."\$"); | |
2386 }else { | |
2387 push(@first_arr,$arr->[$i]."*"); | |
2388 } | |
2389 } | |
2390 } | |
2391 } else { | |
2392 for my $i (@{$arr}) { | |
2393 if (exists ($table->{$i-1})) { | |
2394 push(@first_arr,$i); | |
2395 } else { | |
2396 if (exists ($badInFRSense->{$i-1})) { | |
2397 push(@first_arr,$i."\$"); | |
2398 }else { | |
2399 push(@first_arr,$i."*"); | |
2400 } | |
2401 } | |
2402 } | |
2403 } | |
2404 $out = '('.join(",",@first_arr).')'; | |
2405 } | |
2406 return $out; | |
2407 } | |
2408 #------------------------------------------------------------------------------# | |
2409 #------------------------------------------------------------------------------# | |
2410 sub check { | |
2411 | |
2412 my $table = $_[0]; | |
2413 my $bad = 'OK'; | |
2414 my $max = 0; | |
2415 for my $i (sort {$a<=>$b} keys %{$table}) { | |
2416 unless ($table->{$i}->{nonAdeq} == 0) { | |
2417 if ($max<$table->{$i}->{nonAdeq}) { | |
2418 $max=$table->{$i}->{nonAdeq}; | |
2419 $bad = $i; | |
2420 } | |
2421 } | |
2422 } | |
2423 return $bad; | |
2424 } | |
2425 #------------------------------------------------------------------------------# | |
2426 #------------------------------------------------------------------------------# | |
2427 sub reversed { | |
2428 | |
2429 my ($i,$j,$ifRenv,$positions) = @_; | |
2430 if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ | |
2431 return 1; | |
2432 } | |
2433 return 0; | |
2434 } | |
2435 #------------------------------------------------------------------------------# | |
2436 #------------------------------------------------------------------------------# | |
2437 sub remove { | |
2438 | |
2439 my ($bad,$table) = @_; | |
2440 for my $i (sort {$a<=>$b} keys %{$table}) { | |
2441 if ($bad == $i) { | |
2442 delete($table->{$i});; | |
2443 } else { | |
2444 if (exists($table->{$i}->{$bad})) { | |
2445 delete($table->{$i}->{$bad}); | |
2446 $table->{$i}->{nonAdeq}--; | |
2447 } | |
2448 } | |
2449 } | |
2450 } | |
2451 #------------------------------------------------------------------------------# | |
2452 #------------------------------------------------------------------------------# | |
2453 sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs | |
2454 | |
2455 my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; | |
2456 | |
2457 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
2458 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
2459 | |
2460 my (@standardArray); | |
2461 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
2462 $leftLetterOk = 'R'; | |
2463 $rightLetterOk = 'F'; | |
2464 @standardArray = translateSolidToRF($strand,$ends_order); | |
2465 } else { | |
2466 @standardArray = @{$strand}; | |
2467 } | |
2468 | |
2469 my $ifR = 0; | |
2470 my @Rs; | |
2471 | |
2472 for my $i (@keysLeft) { | |
2473 if ($standardArray[$i] eq $leftLetterOk) { | |
2474 $ifR++; | |
2475 push(@Rs,$i); | |
2476 } | |
2477 } | |
2478 | |
2479 | |
2480 my $ifF = 0; | |
2481 my @Fs; | |
2482 | |
2483 for my $i (@keysLeft) { | |
2484 if ($standardArray[$i] eq $rightLetterOk) { | |
2485 $ifF++; | |
2486 push(@Fs,$i); | |
2487 } | |
2488 } | |
2489 | |
2490 if($ifR>=$ifF) { | |
2491 return @Fs; | |
2492 } | |
2493 return @Rs; | |
2494 } | |
2495 | |
2496 #------------------------------------------------------------------------------# | |
2497 #------------------------------------------------------------------------------# | |
2498 sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa | |
2499 | |
2500 my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); | |
2501 my $mate_sense = $_[6]; | |
2502 | |
2503 my $leftLetterOk = substr($mate_sense, 0, 1); #R | |
2504 my $rightLetterOk = substr($mate_sense, 1, 1); #F | |
2505 | |
2506 my (@standardArray1,@standardArray2); | |
2507 | |
2508 if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs | |
2509 $leftLetterOk = 'R'; | |
2510 $rightLetterOk = 'F'; | |
2511 @standardArray1 = translateSolidToRF($strand1,$ends_order1); | |
2512 my @arr = getOrderedStrands($strand2,$order2); | |
2513 my @ends2 = getOrderedStrands($ends_order2,$order2); | |
2514 @standardArray2 = translateSolidToRF(\@arr,\@ends2); | |
2515 | |
2516 } else { | |
2517 @standardArray1 = @{$strand1}; | |
2518 @standardArray2 = getOrderedStrands($strand2,$order2); | |
2519 } | |
2520 | |
2521 #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF | |
2522 | |
2523 #for the first end: | |
2524 | |
2525 my @array = @standardArray1; | |
2526 my %badInFRSense1; | |
2527 for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF | |
2528 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
2529 $badInFRSense1{$i}=1; | |
2530 $array[$i] = $rightLetterOk; | |
2531 } | |
2532 } | |
2533 my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); | |
2534 @array = @standardArray1; | |
2535 my %badInFRSense0; | |
2536 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR | |
2537 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
2538 $badInFRSense0{$i-1}=1; | |
2539 $array[$i-1] = $leftLetterOk; | |
2540 | |
2541 } | |
2542 } | |
2543 my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); | |
2544 | |
2545 #for the second end: | |
2546 @array = @standardArray2; | |
2547 | |
2548 my %badInFRSense3; | |
2549 for my $i (1..scalar(@array)-1){ | |
2550 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
2551 $badInFRSense3{$order2->[$i]}=1; | |
2552 $array[$i] = $rightLetterOk; | |
2553 } | |
2554 } | |
2555 my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); | |
2556 | |
2557 @array = @standardArray2; | |
2558 my %badInFRSense5; | |
2559 for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF | |
2560 if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { | |
2561 $badInFRSense5{$i-1}=1; | |
2562 $array[$i-1] = $leftLetterOk; | |
2563 } | |
2564 } | |
2565 my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); | |
2566 | |
2567 if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { | |
2568 return (1,%badInFRSense0); | |
2569 } | |
2570 | |
2571 if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { | |
2572 return (1,%badInFRSense1); | |
2573 } | |
2574 | |
2575 if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { | |
2576 return (2,%badInFRSense3); | |
2577 } | |
2578 | |
2579 if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { | |
2580 return (2,%badInFRSense5); | |
2581 } | |
2582 | |
2583 #should not get here: | |
2584 print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; | |
2585 return (1,%badInFRSense1); | |
2586 } | |
2587 | |
2588 sub getOrderedStrands { | |
2589 my ($strand,$order) = ($_[0],$_[1]); | |
2590 my @arr; | |
2591 for my $i (0..scalar(@{$strand})-1) { | |
2592 push(@arr,$strand->[$order->[$i]-1]); | |
2593 } | |
2594 return @arr; | |
2595 } | |
2596 #------------------------------------------------------------------------------# | |
2597 #------------------------------------------------------------------------------# | |
2598 sub checkClusters { | |
2599 | |
2600 my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; | |
2601 if ($ifRenv eq 'REVERSE_SENSE') { | |
2602 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { | |
2603 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; | |
2604 } | |
2605 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; | |
2606 } | |
2607 #if NORM | |
2608 if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { | |
2609 return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; | |
2610 } | |
2611 return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; | |
2612 } | |
2613 | |
2614 #------------------------------------------------------------------------------# | |
2615 #------------------------------------------------------------------------------# | |
2616 sub translateSolidToRF { | |
2617 my ($strandArr,$ends_orderArr)=@_; | |
2618 my @array; | |
2619 for my $i (0..scalar(@{$strandArr})-1) { | |
2620 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { | |
2621 push(@array,'F'); | |
2622 } | |
2623 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { | |
2624 push(@array,'R'); | |
2625 } | |
2626 if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { | |
2627 push(@array,'R'); | |
2628 } | |
2629 if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { | |
2630 push(@array,'F'); | |
2631 } | |
2632 } | |
2633 return @array; | |
2634 } | |
2635 | |
2636 #------------------------------------------------------------------------------# | |
2637 #------------------------------------------------------------------------------# | |
2638 #convert the links file to the circos format | |
2639 sub links2segdup{ | |
2640 | |
2641 my($id,$color_code,$links_file,$segdup_file)=@_; | |
2642 | |
2643 print LOG "# Converting to the circos format...\n"; | |
2644 | |
2645 tie (my %hcolor,'Tie::IxHash'); #color-code hash table | |
2646 foreach my $col (keys %{$color_code}){ | |
2647 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
2648 $hcolor{$col}=[$min_links,$max_links]; | |
2649 } | |
2650 | |
2651 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
2652 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; | |
2653 | |
2654 my $index=1; | |
2655 while(<LINKS>){ | |
2656 | |
2657 my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; | |
2658 | |
2659 my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links | |
2660 | |
2661 print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output | |
2662 "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; | |
2663 $index++; | |
2664 } | |
2665 | |
2666 close LINKS; | |
2667 close SEGDUP; | |
2668 print LOG "-- output created: $segdup_file\n"; | |
2669 } | |
2670 #------------------------------------------------------------------------------# | |
2671 #------------------------------------------------------------------------------# | |
2672 #convert the links file to the bedPE format for BEDTools usage | |
2673 sub links2bedPElinksfile{ | |
2674 | |
2675 my ($sample,$links_file,$bedpe_file)=@_; | |
2676 | |
2677 open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; | |
2678 open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; | |
2679 | |
2680 my $nb_links=1; | |
2681 | |
2682 while(<LINKS>){ | |
2683 | |
2684 chomp; | |
2685 my @t=split("\t",$_); | |
2686 my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); | |
2687 my $type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
2688 $type.="_".$t[10]; | |
2689 | |
2690 $start1--; $start2--; | |
2691 | |
2692 print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". | |
2693 "\t$sample"."_link$nb_links\t$type\t.\t.". | |
2694 "\t".join("|",@t)."\n"; | |
2695 | |
2696 $nb_links++; | |
2697 } | |
2698 | |
2699 close LINKS; | |
2700 close BEDPE; | |
2701 | |
2702 } | |
2703 #------------------------------------------------------------------------------# | |
2704 #------------------------------------------------------------------------------# | |
2705 sub bedPElinks2linksfile{ | |
2706 | |
2707 my ($bedpe_file,$links_file)=@_; | |
2708 | |
2709 open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; | |
2710 open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; | |
2711 | |
2712 while(<BEDPE>){ | |
2713 | |
2714 chomp; | |
2715 my $sample=(split("_",(split("\t",$_))[6]))[0]; | |
2716 my @t1=(split("\t",$_))[0,1,2,3,4,5]; | |
2717 my @t2=split(/\|/,(split("\t",$_))[10]); | |
2718 push(@t2,$sample); | |
2719 | |
2720 print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; | |
2721 | |
2722 } | |
2723 close BEDPE; | |
2724 close LINKS; | |
2725 | |
2726 } | |
2727 #------------------------------------------------------------------------------# | |
2728 #------------------------------------------------------------------------------# | |
2729 #convert the links file to the bed format | |
2730 sub links2bedfile{ | |
2731 | |
2732 my ($tag_length,$color_code,$links_file,$bed_file)=@_; | |
2733 | |
2734 print LOG "# Converting to the bed format...\n"; | |
2735 | |
2736 my $compare=1; | |
2737 if($links_file!~/compared$/){ | |
2738 $compare=0; | |
2739 $tag_length->{none}->{1}=$tag_length->{1}; | |
2740 $tag_length->{none}->{2}=$tag_length->{2}; | |
2741 } | |
2742 | |
2743 #color-code hash table | |
2744 tie (my %hcolor,'Tie::IxHash'); | |
2745 my %color_order; | |
2746 my $n=1; | |
2747 foreach my $col (keys %{$color_code}){ | |
2748 my ($min_links,$max_links)=split(",",$color_code->{$col}); | |
2749 $hcolor{$col}=[$min_links,$max_links]; | |
2750 $color_order{$col}=$n; | |
2751 $n++; | |
2752 } | |
2753 | |
2754 my %pair; | |
2755 my %pt; | |
2756 $n=1; | |
2757 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
2758 | |
2759 my %str=( "F"=>"+", "R"=>"-" ); | |
2760 | |
2761 while(<LINKS>){ | |
2762 | |
2763 my @t=split; | |
2764 my $sample=($compare)? pop(@t):"none"; | |
2765 | |
2766 my $chr1=$t[0]; | |
2767 my $chr2=$t[3]; | |
2768 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
2769 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
2770 my $same_chr=($chr1 eq $chr2)? 1:0; | |
2771 | |
2772 my $count=$t[6]; | |
2773 my $color=getColor($count,\%hcolor,"bed"); | |
2774 | |
2775 my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); | |
2776 my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); | |
2777 my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); | |
2778 my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); | |
2779 my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); | |
2780 my @position1=deleteBadOrderSensePairs(split(",",$t[14])); | |
2781 my @position2=deleteBadOrderSensePairs(split(",",$t[15])); | |
2782 my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); | |
2783 my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); | |
2784 | |
2785 | |
2786 for my $p (0..$#pairs){ | |
2787 | |
2788 if (!exists $pair{$pairs[$p]}){ | |
2789 | |
2790 if($same_chr){ | |
2791 | |
2792 $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, | |
2793 $start1[$p]-1, $end2[$p], $color, | |
2794 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; | |
2795 $pt{$n}=$pair{$pairs[$p]}->{0}; | |
2796 $n++; | |
2797 | |
2798 }else{ | |
2799 | |
2800 $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, | |
2801 $start1[$p]-1, $end1[$p], $color, | |
2802 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; | |
2803 $pt{$n}=$pair{$pairs[$p]}->{1}; | |
2804 $n++; | |
2805 | |
2806 | |
2807 $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, | |
2808 $start2[$p]-1, $end2[$p], $color, | |
2809 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; | |
2810 $pt{$n}=$pair{$pairs[$p]}->{2}; | |
2811 $n++; | |
2812 } | |
2813 }else{ | |
2814 | |
2815 if($same_chr){ | |
2816 ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); | |
2817 }else{ | |
2818 ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); | |
2819 ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); | |
2820 } | |
2821 } | |
2822 } | |
2823 } | |
2824 close LINKS; | |
2825 | |
2826 my $nb_pairs=$n-1; | |
2827 | |
2828 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; | |
2829 print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". | |
2830 "visibility=2 itemRgb=\"On\"\n"; | |
2831 | |
2832 for my $i (1..$nb_pairs){ | |
2833 print BED join("\t",@{$pt{$i}})."\n"; | |
2834 } | |
2835 | |
2836 close BED; | |
2837 | |
2838 print LOG "-- output created: $bed_file\n"; | |
2839 | |
2840 undef %pair; | |
2841 undef %pt; | |
2842 | |
2843 } | |
2844 #------------------------------------------------------------------------------# | |
2845 #------------------------------------------------------------------------------# | |
2846 sub deleteBadOrderSensePairs{ | |
2847 | |
2848 my (@tab)=@_; | |
2849 my @tab2; | |
2850 | |
2851 foreach my $v (@tab){ | |
2852 | |
2853 $v=~s/[\(\)]//g; | |
2854 push(@tab2,$v) if($v!~/[\$\*\@]$/); | |
2855 } | |
2856 return @tab2; | |
2857 } | |
2858 #------------------------------------------------------------------------------# | |
2859 #------------------------------------------------------------------------------# | |
2860 sub getAllEntries{ | |
2861 my (@tab)=split (/,/,$_[0]); | |
2862 my @tab2; | |
2863 | |
2864 foreach my $v (@tab){ | |
2865 | |
2866 $v=~s/[\(\)]//g; | |
2867 push(@tab2,$v); | |
2868 } | |
2869 return @tab2; | |
2870 }#------------------------------------------------------------------------------# | |
2871 #------------------------------------------------------------------------------# | |
2872 sub getAllEntriesWOspecialChar{ | |
2873 my (@tab)=split (/,/,$_[0]); | |
2874 my @tab2; | |
2875 | |
2876 foreach my $v (@tab){ | |
2877 | |
2878 $v=~s/[\(\)\$\*\@]//g; | |
2879 push(@tab2,$v); | |
2880 } | |
2881 return @tab2; | |
2882 } | |
2883 #------------------------------------------------------------------------------# | |
2884 #------------------------------------------------------------------------------# | |
2885 sub links2SVfile{ | |
2886 | |
2887 my($links_file,$sv_file)=@_; | |
2888 | |
2889 print LOG "# Converting to the sv output table...\n"; | |
2890 open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; | |
2891 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
2892 | |
2893 my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist | |
2894 chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering | |
2895 final_score breakpoint1_start1-end1 breakpoint2_start2-end2); | |
2896 | |
2897 my $nb_links=0; | |
2898 | |
2899 while (<LINKS>){ | |
2900 | |
2901 my @t=split; | |
2902 my @sv=(); | |
2903 my $sv_type="-"; | |
2904 my $strand_ratio="-"; | |
2905 my $eq_ratio="-"; | |
2906 my $eq_type="-"; | |
2907 my $insert_ratio="-"; | |
2908 my $link="-"; | |
2909 my ($bk1, $bk2)=("-","-"); | |
2910 my $score="-"; | |
2911 | |
2912 my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); | |
2913 my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); | |
2914 my $nb_pairs=$t[6]; | |
2915 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); | |
2916 $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); | |
2917 my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; | |
2918 | |
2919 #if strand filtering | |
2920 if (defined $t[16]){ | |
2921 #if inter-chr link | |
2922 $sv_type=$t[16]; | |
2923 if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ | |
2924 $strand_ratio=floor($1/$2*100)."%"; | |
2925 $score=$t[18]; | |
2926 } | |
2927 if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ | |
2928 #if intra-chr link with insert size filtering | |
2929 $strand_ratio=floor($1/$2*100)."%"; | |
2930 $link=floor($t[17]); | |
2931 if($sv_type!~/^INV/){ | |
2932 $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
2933 $score=$t[20]; | |
2934 }else{ | |
2935 $score=$t[19]; | |
2936 } | |
2937 } | |
2938 } | |
2939 | |
2940 if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ | |
2941 | |
2942 #if strand and order filtering only and/or interchr link | |
2943 $eq_type=$t[18]; | |
2944 $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); | |
2945 ($bk1, $bk2)=($t[20],$t[21]); | |
2946 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
2947 $score=$t[22]; | |
2948 | |
2949 }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ | |
2950 | |
2951 #if all three filtering | |
2952 $link=floor($t[17]); | |
2953 $eq_type=$t[19]; | |
2954 $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); | |
2955 | |
2956 if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ | |
2957 $insert_ratio=floor($1/$2*100)."%"; | |
2958 ($bk1, $bk2)=($t[22],$t[23]); | |
2959 $score=$t[24]; | |
2960 | |
2961 }else{ | |
2962 ($bk1, $bk2)=($t[21],$t[22]); | |
2963 $score=$t[23]; | |
2964 } | |
2965 foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} | |
2966 | |
2967 } | |
2968 | |
2969 | |
2970 push(@sv, $chr_type, $sv_type,$eq_type); | |
2971 push(@sv,"$chr1\t$start1-$end1"); | |
2972 push(@sv, $link); | |
2973 push(@sv,"$chr2\t$start2-$end2", | |
2974 $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); | |
2975 | |
2976 | |
2977 print SV join("\t",@sv)."\n"; | |
2978 } | |
2979 | |
2980 close LINKS; | |
2981 close SV; | |
2982 | |
2983 system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; | |
2984 | |
2985 open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; | |
2986 my @links=<SV>; | |
2987 close SV; | |
2988 | |
2989 open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; | |
2990 | |
2991 print SV join("\t",@header)."\n"; | |
2992 print SV @links; | |
2993 close SV; | |
2994 | |
2995 unlink($sv_file.".sorted"); | |
2996 | |
2997 print LOG "-- output created: $sv_file\n"; | |
2998 | |
2999 } | |
3000 #------------------------------------------------------------------------------# | |
3001 sub densityCalculation{ | |
3002 | |
3003 my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; | |
3004 | |
3005 my @sfile=split(/\./,$$mates_file[$file]); | |
3006 my $fchr=$sfile[$#sfile]; | |
3007 | |
3008 my $fh = new FileHandle; | |
3009 | |
3010 my %density; | |
3011 my %density_ref; | |
3012 my @ratio; | |
3013 my ($cov,$cov_ref); | |
3014 | |
3015 #FREQUENCY CALCULATION PROCEDURE | |
3016 print LOG "# $fchr : Frequency calculation procedure...\n"; | |
3017 &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); | |
3018 &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); | |
3019 | |
3020 #MAKING RATIO AND OUTPUT | |
3021 print LOG "\# Ratio calculation procedure...\n"; | |
3022 $density_file=~s/\/mates\//\/density\//; | |
3023 $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; | |
3024 | |
3025 foreach my $k (1..$chr->{nb_chrs}){ | |
3026 foreach my $frag (1..$chr->{$k}->{nb_frag}){ | |
3027 | |
3028 @ratio= ($chr->{$k}->{name}, | |
3029 (${$chr->{$k}->{$frag}}[0]+1), | |
3030 (${$chr->{$k}->{$frag}}[1]+1)); | |
3031 | |
3032 $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; | |
3033 $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; | |
3034 | |
3035 push(@ratio,$cov,$cov_ref); | |
3036 push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); | |
3037 push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); | |
3038 push(@ratio,log($cov+1)) if($cov && !$cov_ref); | |
3039 next if(!$cov && !$cov_ref); | |
3040 | |
3041 print $fh join("\t",@ratio)."\n"; | |
3042 } | |
3043 } | |
3044 | |
3045 $fh->close; | |
3046 print LOG "-- output created: $density_file\n"; | |
3047 | |
3048 undef %density; | |
3049 undef %density_ref; | |
3050 } | |
3051 #------------------------------------------------------------------------------# | |
3052 #------------------------------------------------------------------------------# | |
3053 sub FreqCalculation{ | |
3054 | |
3055 my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; | |
3056 | |
3057 my @sfile=split(/\./,$mates_file); | |
3058 my $fchr=$sfile[$#sfile]; | |
3059 my $fh = new FileHandle; | |
3060 | |
3061 my $nb_windows=0; | |
3062 my $warn=100000; | |
3063 my $record=0; | |
3064 my %pair; | |
3065 | |
3066 my ($sumX,$sumX2) = (0,0); | |
3067 | |
3068 print LOG "\# Frequency calculation for $mates_file...\n"; | |
3069 | |
3070 if ($mates_file =~ /.gz$/) { | |
3071 $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; | |
3072 }elsif($mates_file =~ /.bam$/){ | |
3073 o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY | |
3074 }else{ | |
3075 $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; | |
3076 } | |
3077 | |
3078 while(<$fh>){ | |
3079 | |
3080 my @t=split; | |
3081 my $mate=$t[0]; | |
3082 | |
3083 my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); | |
3084 | |
3085 next if(exists $pair{$mate}); | |
3086 | |
3087 next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); | |
3088 | |
3089 next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); | |
3090 ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); | |
3091 | |
3092 $pair{$mate}=undef; | |
3093 $record++; | |
3094 | |
3095 my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); | |
3096 | |
3097 recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); | |
3098 recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); | |
3099 | |
3100 my $length = abs($coord_start_read1-$coord_start_read2); | |
3101 $sumX += $length; #add to sum and sum^2 for mean and variance calculation | |
3102 $sumX2 += $length*$length; | |
3103 | |
3104 for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ | |
3105 | |
3106 if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ | |
3107 | |
3108 &addToDensity($density,$chr_read1,$i,\$nb_windows) | |
3109 if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); | |
3110 | |
3111 }else{ | |
3112 | |
3113 $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); | |
3114 } | |
3115 } | |
3116 | |
3117 if($record>=$warn){ | |
3118 print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; | |
3119 $warn+=100000; | |
3120 } | |
3121 } | |
3122 $fh->close; | |
3123 | |
3124 print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; | |
3125 | |
3126 if($record>0){ | |
3127 | |
3128 my $mu = $sumX/$record; | |
3129 my $sigma = sqrt($sumX2/$record - $mu*$mu); | |
3130 print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; | |
3131 } | |
3132 | |
3133 } | |
3134 #------------------------------------------------------------------------------# | |
3135 #------------------------------------------------------------------------------# | |
3136 sub ratio2segdup{ | |
3137 | |
3138 my($id,$density_file,$segdup_file)=@_; | |
3139 | |
3140 print LOG "# Converting to circos format...\n"; | |
3141 | |
3142 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; | |
3143 open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; | |
3144 | |
3145 while(<RATIO>){ | |
3146 chomp; | |
3147 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; | |
3148 print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; | |
3149 } | |
3150 | |
3151 close RATIO; | |
3152 close SEGDUP; | |
3153 print LOG "-- output created: $segdup_file\n"; | |
3154 } | |
3155 #------------------------------------------------------------------------------# | |
3156 #------------------------------------------------------------------------------# | |
3157 sub ratio2bedfile{ | |
3158 | |
3159 my($density_file,$bed_file)=@_; | |
3160 | |
3161 print LOG "# Converting to bedGraph format...\n"; | |
3162 | |
3163 open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; | |
3164 open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; | |
3165 print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". | |
3166 "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; | |
3167 | |
3168 while(<RATIO>){ | |
3169 chomp; | |
3170 my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; | |
3171 $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); | |
3172 print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; | |
3173 } | |
3174 | |
3175 close RATIO; | |
3176 close BED; | |
3177 print LOG "-- output created: $bed_file\n"; | |
3178 } | |
3179 #------------------------------------------------------------------------------# | |
3180 #------------------------------------------------------------------------------# | |
3181 sub inverseSense{ | |
3182 | |
3183 my $mate_sense=$_[0]; | |
3184 my %reverse=( 'F' => 'R' , 'R' => 'F' , | |
3185 'FF' => 'RR', 'RR' => 'FF', | |
3186 'FR' => 'RF', 'RF' => 'FR'); | |
3187 return $reverse{$mate_sense}; | |
3188 } | |
3189 | |
3190 #------------------------------------------------------------------------------# | |
3191 #------------------------------------------------------------------------------# | |
3192 sub getNextFrag{ | |
3193 | |
3194 my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; | |
3195 | |
3196 my $how_far = $read_start-$frag_start; | |
3197 my $nb_windows_toskip; | |
3198 | |
3199 if($how_far>0){ | |
3200 | |
3201 $nb_windows_toskip=($how_far/$step)-($window_dist/$step); | |
3202 $nb_windows_toskip=~ s/\..*//; | |
3203 $nb_windows_toskip=0 if($nb_windows_toskip<0); | |
3204 return ($frag_num + $nb_windows_toskip); | |
3205 } | |
3206 return $frag_last; | |
3207 } | |
3208 #------------------------------------------------------------------------------# | |
3209 #------------------------------------------------------------------------------# | |
3210 sub getColor{ | |
3211 | |
3212 my($count,$hcolor,$format)=@_; | |
3213 for my $col ( keys % { $hcolor} ) { | |
3214 return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); | |
3215 } | |
3216 return "white" if($format eq "circos"); | |
3217 return "255,255,255" if($format eq "bed"); | |
3218 } | |
3219 #------------------------------------------------------------------------------# | |
3220 #------------------------------------------------------------------------------# | |
3221 sub recupCoords{ | |
3222 | |
3223 my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; | |
3224 my $strand = 'F'; | |
3225 | |
3226 if ($c_hit=~s/^\-//) { | |
3227 $strand='R'; | |
3228 $$cs_hit=$c_hit; | |
3229 $$ce_hit=$c_hit-($tag_length-1); | |
3230 }else{ | |
3231 $$cs_hit=$c_hit; | |
3232 $$ce_hit=$c_hit+($tag_length-1); | |
3233 } | |
3234 return $strand; | |
3235 | |
3236 } | |
3237 #------------------------------------------------------------------------------# | |
3238 #------------------------------------------------------------------------------# | |
3239 sub overlap { | |
3240 my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; | |
3241 if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { | |
3242 return 0; | |
3243 } | |
3244 return 1; | |
3245 } | |
3246 #------------------------------------------------------------------------------# | |
3247 #------------------------------------------------------------------------------# | |
3248 sub makeLink { | |
3249 | |
3250 my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; | |
3251 | |
3252 if($chr1>$chr2){ | |
3253 ($chr1,$chr2)= ($chr2,$chr1); | |
3254 ($frag1,$frag2)= ($frag2,$frag1); | |
3255 } | |
3256 | |
3257 if($chr1 == $chr2){ | |
3258 if($frag1>$frag2){ | |
3259 ($frag1,$frag2)= ($frag2,$frag1); | |
3260 } | |
3261 } | |
3262 | |
3263 if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ | |
3264 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; | |
3265 $$nb++; | |
3266 }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ | |
3267 $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; | |
3268 } | |
3269 } | |
3270 #------------------------------------------------------------------------------# | |
3271 #------------------------------------------------------------------------------# | |
3272 #fonction of adding the read to the density profile | |
3273 sub addToDensity { | |
3274 | |
3275 my ($density,$chr1,$frag1,$nb)=@_; | |
3276 | |
3277 if(!exists $density->{$chr1}->{$frag1}->{count}){ | |
3278 $density->{$chr1}->{$frag1}->{count}=1; | |
3279 $$nb++; | |
3280 }else{ | |
3281 $density->{$chr1}->{$frag1}->{count}++; | |
3282 } | |
3283 } | |
3284 #------------------------------------------------------------------------------# | |
3285 #------------------------------------------------------------------------------# | |
3286 sub floor { | |
3287 my $nb = $_[0]; | |
3288 $nb=~ s/\..*//; | |
3289 return $nb; | |
3290 } | |
3291 #------------------------------------------------------------------------------# | |
3292 sub decimal{ | |
3293 | |
3294 my $num=shift; | |
3295 my $digs_to_cut=shift; | |
3296 | |
3297 $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); | |
3298 | |
3299 return $num; | |
3300 } | |
3301 | |
3302 #------------------------------------------------------------------------------# | |
3303 sub max { | |
3304 | |
3305 my($max) = shift(@_); | |
3306 foreach my $temp (@_) { | |
3307 $max = $temp if $temp > $max; | |
3308 } | |
3309 return($max); | |
3310 } | |
3311 #------------------------------------------------------------------------------# | |
3312 #------------------------------------------------------------------------------# | |
3313 sub min { | |
3314 | |
3315 my($min) = shift(@_); | |
3316 foreach my $temp (@_) { | |
3317 $min = $temp if $temp < $min; | |
3318 } | |
3319 return($min); | |
3320 } | |
3321 #------------------------------------------------------------------------------# | |
3322 #------------------------------------------------------------------------------# | |
3323 sub sortTablebyIndex{ | |
3324 my ($tab1,$tab2)=@_; | |
3325 my @tab3; | |
3326 | |
3327 foreach my $i (@$tab1){ | |
3328 $tab3[$i]=$$tab2[$$tab1[$i]]; | |
3329 } | |
3330 return @tab3; | |
3331 } | |
3332 #------------------------------------------------------------------------------# | |
3333 #------------------------------------------------------------------------------# | |
3334 sub round { | |
3335 my $number = shift || 0; | |
3336 my $dec = 10 ** (shift || 0); | |
3337 return int( $dec * $number + .5 * ($number <=> 0)) / $dec; | |
3338 } | |
3339 #------------------------------------------------------------------------------# | |
3340 #------------------------------------------------------------------------------# | |
3341 sub getUniqueTable{ | |
3342 | |
3343 my (@tab)=@_; | |
3344 my (%saw,@out)=(); | |
3345 undef %saw; | |
3346 return sort(grep(!$saw{$_}++, @tab)); | |
3347 } | |
3348 #------------------------------------------------------------------------------# | |
3349 #------------------------------------------------------------------------------# | |
3350 sub catFiles { | |
3351 | |
3352 unlink("$_[1]") if(exists $_[1]); | |
3353 system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; | |
3354 } | |
3355 #------------------------------------------------------------------------------# | |
3356 #------------------------------------------------------------------------------# | |
3357 #check if the configuration file is correct | |
3358 sub validateconfiguration{ | |
3359 | |
3360 my %conf=%{$_[0]}; | |
3361 my $list_prgs="@ARGV"; | |
3362 | |
3363 my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); | |
3364 my @detection_params=qw(split_mate_file window_size step_length split_mate_file); | |
3365 my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); | |
3366 my @circos_params=qw(organism_id colorcode); | |
3367 my @bed_params=qw(colorcode); | |
3368 my @compare_params=qw(list_samples file_suffix); | |
3369 | |
3370 foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ | |
3371 | |
3372 unless (defined($dir)) { | |
3373 $dir = "."; | |
3374 } | |
3375 unless (-d $dir){ | |
3376 mkdir $dir or die; | |
3377 } | |
3378 $dir.="/" if($dir!~/\/$/); | |
3379 } | |
3380 | |
3381 unless (defined($conf{general}{num_threads})) { | |
3382 $conf{general}{num_threads} = 1; | |
3383 } | |
3384 $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); | |
3385 | |
3386 if($list_prgs!~/links2compare/){ | |
3387 | |
3388 foreach my $p (@general_params){ | |
3389 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); | |
3390 } | |
3391 | |
3392 $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); | |
3393 | |
3394 unless (defined($conf{general}{sv_type})) { | |
3395 $conf{general}{sv_type} = "all"; | |
3396 } | |
3397 | |
3398 $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; | |
3399 } | |
3400 | |
3401 if($list_prgs=~/(linking|cnv)/){ | |
3402 | |
3403 foreach my $p (@detection_params){ | |
3404 die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); | |
3405 } | |
3406 | |
3407 die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); | |
3408 | |
3409 if($conf{detection}{step_length}>$conf{detection}{window_size}){ | |
3410 die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); | |
3411 } | |
3412 | |
3413 unless (-d $conf{general}{tmp_dir}."/mates"){ | |
3414 mkdir $conf{general}{tmp_dir}."/mates" or die; | |
3415 } | |
3416 | |
3417 if($list_prgs=~/linking/){ | |
3418 unless (-d $conf{general}{tmp_dir}."/links"){ | |
3419 mkdir $conf{general}{tmp_dir}."/links" or die; | |
3420 } | |
3421 } | |
3422 if($list_prgs=~/cnv/){ | |
3423 unless (-d $conf{general}{tmp_dir}."/density"){ | |
3424 mkdir $conf{general}{tmp_dir}."/density" or die; | |
3425 } | |
3426 } | |
3427 | |
3428 } | |
3429 | |
3430 if($list_prgs=~/filtering/){ | |
3431 | |
3432 foreach my $p (@filtering_params) { | |
3433 die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); | |
3434 | |
3435 } | |
3436 | |
3437 if(defined($conf{filtering}{chromosomes})) { | |
3438 my @chrs=split(",",$conf{filtering}{chromosomes}); | |
3439 my $exclude=($chrs[0]=~/^\-/)? 1:0; | |
3440 for my $chrName (@chrs){ | |
3441 | |
3442 die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") | |
3443 if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); | |
3444 | |
3445 } | |
3446 } | |
3447 | |
3448 if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { | |
3449 die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". | |
3450 "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); | |
3451 } | |
3452 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { | |
3453 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); | |
3454 } | |
3455 if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { | |
3456 die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". | |
3457 "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); | |
3458 } | |
3459 if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { | |
3460 die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); | |
3461 } | |
3462 | |
3463 if (!defined($conf{filtering}{indel_sigma_threshold})) { | |
3464 $conf{filtering}{indel_sigma_threshold} = 2; | |
3465 } | |
3466 if (!defined($conf{filtering}{dup_sigma_threshold})) { | |
3467 $conf{filtering}{dup_sigma_threshold} = 2; | |
3468 } | |
3469 if (!defined($conf{filtering}{singleton_sigma_threshold})) { | |
3470 $conf{filtering}{singleton_sigma_threshold} = 4; | |
3471 } | |
3472 | |
3473 if (!defined($conf{filtering}{nb_pairs_order_threshold})) { | |
3474 $conf{filtering}{nb_pairs_order_threshold} = 1; | |
3475 } | |
3476 | |
3477 if (!defined($conf{filtering}{final_score_threshold})) { | |
3478 $conf{filtering}{final_score_threshold} = 0.8; | |
3479 } | |
3480 | |
3481 if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { | |
3482 die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); | |
3483 } | |
3484 | |
3485 } | |
3486 | |
3487 if($list_prgs=~/2circos$/){ | |
3488 foreach my $p (@circos_params) { | |
3489 next if($list_prgs=~/^ratio/ && $p eq "colorcode"); | |
3490 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); | |
3491 } | |
3492 } | |
3493 | |
3494 if($list_prgs=~/2bed$/){ | |
3495 foreach my $p (@bed_params) { | |
3496 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); | |
3497 } | |
3498 } | |
3499 | |
3500 if($list_prgs=~/links2compare/){ | |
3501 foreach my $p (@compare_params) { | |
3502 die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); | |
3503 } | |
3504 | |
3505 unless (defined($conf{compare}{same_sv_type})) { | |
3506 $conf{compare}{same_sv_type} = 0; | |
3507 } | |
3508 | |
3509 unless (defined($conf{compare}{min_overlap})) { | |
3510 $conf{compare}{min_overlap} = 1E-9; | |
3511 } | |
3512 | |
3513 if($conf{compare}{circos_output}){ | |
3514 foreach my $p (@circos_params) { | |
3515 next if($list_prgs=~/^ratio/ && $p eq "colorcode"); | |
3516 die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); | |
3517 } | |
3518 } | |
3519 if($conf{compare}{bed_output}){ | |
3520 foreach my $p (@bed_params) { | |
3521 die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); | |
3522 } | |
3523 die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); | |
3524 | |
3525 my @samples=split(",",$conf{compare}{list_samples}); | |
3526 my @read_lengths=split(",",$conf{compare}{list_read_lengths}); | |
3527 for my $i (0..$#samples){ | |
3528 my @l=split("-",$read_lengths[$i]); | |
3529 $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; | |
3530 } | |
3531 } | |
3532 } | |
3533 | |
3534 | |
3535 } | |
3536 #------------------------------------------------------------------------------# | |
3537 #::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# |