Mercurial > repos > bzeitouni > svdetect
changeset 5:ba8c5e544948 draft
Uploaded
author | bzeitouni |
---|---|
date | Mon, 11 Jun 2012 12:31:07 -0400 |
parents | f7a84d31bd83 |
children | f6ccaaed3654 |
files | SVDetect_run_parallel.pl |
diffstat | 1 files changed, 3537 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/SVDetect_run_parallel.pl Mon Jun 11 12:31:07 2012 -0400 @@ -0,0 +1,3537 @@ +#!/usr/bin/perl -w + +=pod + +=head1 NAME + +SVDetect - Program designed to the detection of structural variations +from paired-end/mate-pair sequencing data, compatible with SOLiD and Illumina (>=1.3) reads + +Version: 0.8 for Galaxy + +=head1 SYNOPSIS + +SVDetect <command> -conf <configuration_file> [-help] [-man] + + Command: + + linking detection and isolation of links + filtering filtering of links according different parameters + links2circos links conversion to circos format + links2bed paired-ends of links converted to bed format (UCSC) + links2SV formatted output to show most significant SVs + cnv calculate copy-number profiles + ratio2circos ratio conversion to circos density format + ratio2bedgraph ratio conversion to bedGraph density format (UCSC) + +=head1 DESCRIPTION + +This is a command-line interface to SVDetect. + + +=head1 AUTHORS + +Bruno Zeitouni E<lt>bruno.zeitouni@curie.frE<gt>, +Valentina Boeva E<lt>valentina.boeva@curie.frE<gt> + +=cut + +# ------------------------------------------------------------------- + +use strict; +use warnings; + +use Pod::Usage; +use Getopt::Long; + +use Config::General; +use Tie::IxHash; +use FileHandle; +use Parallel::ForkManager; + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#PARSE THE COMMAND LINE +my %OPT; +GetOptions(\%OPT, + 'conf=s', + 'out1=s', #GALAXY + 'out2=s', #GALAXY + 'out3=s', #GALAXY + 'out4=s', #GALAXY + 'out5=s', #GALAXY + 'l=s', #GALAXY + 'N=s',#GALAXY + 'help',#GALAXY + 'man' + ); + +pod2usage() if $OPT{help}; +pod2usage(-verbose=>2) if $OPT{man}; +pod2usage(-message=> "$!", -exitval => 2) if (!defined $OPT{conf}); + +pod2usage() if(@ARGV<1); + +tie (my %func, 'Tie::IxHash',linking=>\&createlinks, + filtering=>\&filterlinks, + links2circos=>\&links2circos, + links2bed=>\&links2bed, + links2compare=>\&links2compare, + links2SV=>\&links2SV, + cnv=>\&cnv, + ratio2circos=>\&ratio2circos, + ratio2bedgraph=>\&ratio2bedgraph); + +foreach my $command (@ARGV){ + pod2usage(-message=> "Unknown command \"$command\"", -exitval => 2) if (!defined($func{$command})); +} +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#READ THE CONFIGURATION FILE +my $conf=Config::General->new( -ConfigFile => $OPT{conf}, + -Tie => "Tie::IxHash", + -AllowMultiOptions => 1, + -LowerCaseNames => 1, + -AutoTrue => 1); +my %CONF= $conf->getall; +validateconfiguration(\%CONF); #validation of the configuration parameters + +my $SAMTOOLS_BIN_DIR="/bioinfo/local/samtools"; #GALAXY + +my $pt_log_file=$OPT{l}; #GALAXY +my $pt_links_file=$OPT{out1} if($OPT{out1}); #GALAXY +my $pt_flinks_file=$OPT{out2} if($OPT{out2}); #GALAXY +my $pt_sv_file=$OPT{out3} if($OPT{out3}); #GALAXY +my $pt_circos_file=$OPT{out4} if($OPT{out4}); #GALAXY +my $pt_bed_file=$OPT{out5} if($OPT{out5}); #GALAXY + +$CONF{general}{mates_file}=readlink($CONF{general}{mates_file});#GALAXY +$CONF{general}{cmap_file}=readlink($CONF{general}{cmap_file});#GALAXY + +my $log_file=$CONF{general}{output_dir}.$OPT{N}.".svdetect_run.log"; #GALAXY +open LOG,">$log_file" or die "$0: can't open ".$log_file.":$!\n";#GALAXY +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#COMMAND EXECUTION +foreach my $command (@ARGV){ + &{$func{$command}}(); +} +print LOG "-- end\n";#GALAXY + +close LOG;#GALAXY +system "rm $pt_log_file ; ln -s $log_file $pt_log_file"; #GALAXY +exit(0); +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# + + +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::# +#FUNCTIONS +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 1: Detection of links from mate-pairs data +sub createlinks{ + + my %CHR; #main hash table 1: fragments, links + my %CHRID; + my @MATEFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + $CONF{general}{sv_type}, + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted mate files already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + print LOG "# Splitted mate files already created.\n"; + } + + + #Parallelization of the linking per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $matefile (@MATEFILES){ + + my $pid = $pm->start and next; + getlinks(\%CHR, \%CHRID, $matefile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @LINKFILES= qx{ls $tmp_links_prefix*links} or die "# Error: No links files created at $CONF{general}{tmp_dir} :$!"; + chomp(@LINKFILES); + catFiles( \@LINKFILES => "$output_prefix.links" ); + + system "rm $pt_links_file; ln -s $output_prefix.links $pt_links_file" if (defined $pt_links_file); #GALAXY + print LOG "# Linking end procedure : output created: $output_prefix.links\n"; + #unlink(@LINKFILES); + #unlink(@MATEFILES); + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getlinks { + + my ($chr,$chrID,$tmp_mates_prefix)=@_; + + my $tmp_links_prefix=$tmp_mates_prefix; + $tmp_links_prefix=~s/\/mates\//\/links\//; + + my %PAIR; #main hash table 2: pairs + + linking($chr,$chrID, \%PAIR, #creation of all links from chromosome coordinates of pairs + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $tmp_mates_prefix, + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + "$tmp_links_prefix.links.mapped" + ); + + getUniqueLinks("$tmp_links_prefix.links.mapped", #remove the doublons + "$tmp_links_prefix.links.unique"); + + defineCoordsLinks($chr,$chrID, \%PAIR, #definition of the precise coordinates of links + $CONF{general}{input_format}, + $CONF{general}{sv_type}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined"); + + sortLinks("$tmp_links_prefix.links.unique_defined", #sorting links from coordinates + "$tmp_links_prefix.links.sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.links.sorted", #remove redundant links + "$tmp_links_prefix.links",1); #file output + + + undef %PAIR; + + unlink("$tmp_links_prefix.links.mapped", + "$tmp_links_prefix.links.unique", + "$tmp_links_prefix.links.unique_defined", + "$tmp_links_prefix.links.sorted"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitMateFile{ + + my ($chr,$chrID,$files_list,$output_prefix,$sv_type,$mates_file,$input_format,$tag_length)=@_; + + print LOG "# Splitting the mate file \"$mates_file\" for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$output_prefix.interchrs"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$output_prefix.".".$chr->{$k}->{name}; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + if ($mates_file =~ /.gz$/) { + open(MATES, "gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + open(MATES, "$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + open MATES, "<".$mates_file or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(<MATES>){ + + my @t=split; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted mate files of \"$mates_file\" created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub filterlinks{ + + my %CHR; + my %CHRID; + my @LINKFILES; + my @FLINKFILES; + + my $output_prefix=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}; + my @path=split(/\//,$output_prefix); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + my $tmp_links_prefix=$CONF{general}{tmp_dir}."links/".$path[$#path]; + + createChrHashTables(\%CHR,\%CHRID, + $CONF{general}{cmap_file}); + + if($CONF{filtering}{split_link_file}){ + + splitLinkFile(\%CHR, \%CHRID, \@LINKFILES, + $tmp_links_prefix, + $CONF{general}{sv_type}, + "$output_prefix.links", + ); + }else{ + + @LINKFILES=qx{ls $tmp_links_prefix*links} or die "# Error: No splitted link files already created\n"; + chomp(@LINKFILES); + print LOG "# Splitted link files already created.\n"; + } + + #Parallelization of the filtering per chromosome for intra + interchrs + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $linkfile (@LINKFILES){ + + my $pid = $pm->start and next; + getFilteredlinks(\%CHR, \%CHRID, $linkfile); + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + @FLINKFILES= qx{ls $tmp_links_prefix*filtered} or die "# Error: No links files created\n"; + chomp(@FLINKFILES); + catFiles( \@FLINKFILES => "$output_prefix.links.filtered" ); + + system "rm $pt_flinks_file; ln -s $output_prefix.links.filtered $pt_flinks_file" if (defined $pt_flinks_file); #GALAXY + print LOG"# Filtering end procedure : output created: $output_prefix.links.filtered\n"; + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub splitLinkFile{ + + my ($chr,$chrID,$files_list,$input_prefix,$sv_type,$link_file)=@_; + + print LOG "# Splitting the link file for parallel processing...\n"; + + my %filesHandle; + + #fichier matefile inter + if($sv_type=~/^(all|inter)$/){ + my $newFileName="$input_prefix.interchrs.links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{inter}=$fh; + } + + #fichiers matefiles intra + if($sv_type=~/^(all|intra)$/){ + foreach my $k (1..$chr->{nb_chrs}){ + my $newFileName=$input_prefix.".".$chr->{$k}->{name}.".links"; + push(@{$files_list},$newFileName); + my $fh = new FileHandle; + $fh->open(">$newFileName"); + $filesHandle{$k}=$fh; + } + } + + open LINKS, "<".$link_file or die "$0: can't open ".$link_file.":$!\n"; + while(<LINKS>){ + + my @t=split; + my ($chr_read1,$chr_read2)=($t[0],$t[3]); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if( ($sv_type=~/^(all|inter)$/) && ($chr_read1 ne $chr_read2) ){ + my $fh2print=$filesHandle{inter}; + print $fh2print join("\t",@t)."\n"; + } + + if( ($sv_type=~/^(all|intra)$/) && ($chr_read1 eq $chr_read2) ){ + my $fh2print=$filesHandle{$chr_read1}; + print $fh2print join("\t",@t)."\n"; + + } + } + + foreach my $name (keys %filesHandle){ + my $fh=$filesHandle{$name}; + $fh->close; + } + + print LOG "# Splitted link files created.\n"; +} + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 2: Filtering processing +sub getFilteredlinks { + + my ($chr,$chrID,$tmp_links_prefix)=@_; + my %PAIR; + + strandFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, #filtering of links + $CONF{filtering}{strand_filtering}, + $CONF{filtering}{chromosomes}, + $CONF{general}{input_format}, + $CONF{general}{cmap_file}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + $tmp_links_prefix, + "$tmp_links_prefix.filtered", + ); + + if($CONF{filtering}{strand_filtering}){ #re-definition of links coordinates with strand filtering + + my @tmpfiles; + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_unique"); + + getUniqueLinks("$tmp_links_prefix.filtered_unique", + "$tmp_links_prefix.filtered"); + + push(@tmpfiles,"$tmp_links_prefix.filtered_unique"); + + if($CONF{filtering}{order_filtering}){ #filtering using the order + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_ordered"); + + orderFiltering($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{nb_pairs_order_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_ordered", + "$tmp_links_prefix.filtered", + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_ordered"); + } + + if (($CONF{filtering}{insert_size_filtering})&& + ($CONF{general}{sv_type} ne 'inter')){ + + rename("$tmp_links_prefix.filtered","$tmp_links_prefix.filtered_withoutIndelSize"); + + addInsertionInfo($chr,$chrID, + $CONF{filtering}{nb_pairs_threshold}, + $CONF{filtering}{order_filtering}, + $CONF{filtering}{indel_sigma_threshold}, + $CONF{filtering}{dup_sigma_threshold}, + $CONF{filtering}{singleton_sigma_threshold}, + $CONF{filtering}{mu_length}, + $CONF{filtering}{sigma_length}, + $CONF{general}{mates_orientation}, + $CONF{general}{read_lengths}, + "$tmp_links_prefix.filtered_withoutIndelSize", + "$tmp_links_prefix.filtered" + ); + + push(@tmpfiles,"$tmp_links_prefix.filtered_withoutIndelSize"); + } + + sortLinks("$tmp_links_prefix.filtered", + "$tmp_links_prefix.filtered_sorted"); + + removeFullyOverlappedLinks("$tmp_links_prefix.filtered_sorted", + "$tmp_links_prefix.filtered_nodup", + ); + + postFiltering("$tmp_links_prefix.filtered_nodup", + "$tmp_links_prefix.filtered", + $CONF{filtering}{final_score_threshold}); + + push(@tmpfiles,"$tmp_links_prefix.filtered_sorted","$tmp_links_prefix.filtered_nodup"); + + unlink(@tmpfiles); + + + } + undef %PAIR; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 3: Circos format conversion for links +sub links2circos{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + links2segdup($CONF{circos}{organism_id}, + $CONF{circos}{colorcode}, + $input_file, + $output_file); #circos file output + + system "rm $pt_circos_file; ln -s $output_file $pt_circos_file" if (defined $pt_circos_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 4: Bed format conversion for links +sub links2bed{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bed.txt"; + + links2bedfile($CONF{general}{read_lengths}, + $CONF{bed}{colorcode}, + $input_file, + $output_file); #bed file output + + system "rm $pt_bed_file; ln -s $output_file $pt_bed_file" if (defined $pt_bed_file); #GALAXY + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 6: Bed format conversion for links +sub links2SV{ + + my $input_file=$CONF{general}{mates_file}.".".$CONF{general}{sv_type}.".links.filtered"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".sv.txt"; + + + links2SVfile( $input_file, + $output_file); + + system "rm $pt_sv_file; ln -s $output_file $pt_sv_file" if (defined $pt_sv_file); #GALAXY +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 7: copy number variations, coverage ratio calculation +sub cnv{ + + my %CHR; + my %CHRID; + my @MATEFILES; + my @MATEFILES_REF; + + my $output_prefix=$CONF{general}{mates_file}; + my $output_prefix_ref=$CONF{detection}{mates_file_ref}; + my @path=split(/\//,$output_prefix); + my @path_ref=split(/\//,$output_prefix_ref); + $output_prefix=$CONF{general}{output_dir}.$path[$#path]; + $output_prefix_ref=$CONF{general}{output_dir}.$path_ref[$#path_ref]; + my $tmp_mates_prefix=$CONF{general}{tmp_dir}."mates/".$path[$#path]; + my $tmp_mates_prefix_ref=$CONF{general}{tmp_dir}."mates/".$path_ref[$#path_ref]; + my $tmp_density_prefix=$CONF{general}{tmp_dir}."density/".$path[$#path]; + + shearingChromosome(\%CHR, \%CHRID, #making the genomic fragment library with the detection parameters + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + $CONF{general}{cmap_file}); + + if($CONF{detection}{split_mate_file}){ + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES, $tmp_mates_prefix, + "intra", + $CONF{general}{mates_file}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + splitMateFile(\%CHR, \%CHRID, \@MATEFILES_REF, $tmp_mates_prefix_ref, + "intra", + $CONF{detection}{mates_file_ref}, + $CONF{general}{input_format}, + $CONF{general}{read_lengths} + ); + + + }else{ + + @MATEFILES=qx{ls $tmp_mates_prefix*} or die "# Error: No splitted sample mate files of \"$CONF{general}{mates_file}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES); + @MATEFILES_REF=qx{ls $tmp_mates_prefix_ref*} or die "# Error: No splitted reference mate files of \"$CONF{detection}{mates_file_ref}\" already created at $CONF{general}{tmp_dir} :$!"; + chomp(@MATEFILES_REF); + print LOG "# Splitted sample and reference mate files already created.\n"; + } + + #Parallelization of the cnv per chromosome + my $pm = new Parallel::ForkManager($CONF{general}{num_threads}); + + foreach my $file (0..$#MATEFILES){ + + my $pid = $pm->start and next; + + densityCalculation(\%CHR, \%CHRID, $file, + $CONF{general}{read_lengths}, + $CONF{detection}{window_size}, + $CONF{detection}{step_length}, + \@MATEFILES, + \@MATEFILES_REF, + $MATEFILES[$file].".density", + $CONF{general}{input_format}); + + $pm->finish; + + } + $pm->wait_all_children; + + #Merge the chromosome links file into only one + my @DENSITYFILES= qx{ls $tmp_density_prefix*density} or die "# Error: No density files created at $CONF{general}{tmp_dir} :$!"; + chomp(@DENSITYFILES); + catFiles( \@DENSITYFILES => "$output_prefix.density" ); + + print LOG "# cnv end procedure : output created: $output_prefix.density\n"; + + + undef %CHR; + undef %CHRID; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 8: Circos format conversion for cnv ratios +sub ratio2circos{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".segdup.txt"; + + ratio2segdup($CONF{circos}{organism_id}, + $input_file, + $output_file); +} +#------------------------------------------------------------------------------# +#MAIN FUNCTION number 9: BedGraph format conversion for cnv ratios +sub ratio2bedgraph{ + + my $input_file=$CONF{general}{mates_file}.".density"; + + my @path=split(/\//,$input_file); + $input_file=$CONF{general}{output_dir}.$path[$#path]; + + my $output_file.=$input_file.".bedgraph.txt"; + + ratio2bedfile($input_file, + $output_file); #bed file output +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of the fragment library +sub shearingChromosome{ + + print LOG "# Making the fragments library...\n"; + + my ($chr,$chrID,$window,$step,$cmap_file)=@_; #window and step sizes parameters + + createChrHashTables($chr,$chrID,$cmap_file); #hash tables: chromosome ID <=> chromsomes Name + + foreach my $k (1..$chr->{nb_chrs}){ + + print LOG"-- $chr->{$k}->{name}\n"; + + my $frag=1; + for (my $start=0; $start<$chr->{$k}->{length}; $start+=$step){ + + my $end=($start<($chr->{$k}->{length})-$window)? $start+$window-1:($chr->{$k}->{length})-1; + $chr->{$k}->{$frag}=[$start,$end]; #creation of fragments, coordinates storage + + if($end==($chr->{$k}->{length})-1){ + $chr->{$k}->{nb_frag}=$frag; #nb of fragments per chromosome + last; + } + $frag++; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Creation of chromosome hash tables from the cmap file +sub createChrHashTables{ + + my ($chr,$chrID,$cmap_file)=@_; + $chr->{nb_chrs}=0; + + open CMAP, "<".$cmap_file or die "$0: can't open ".$cmap_file.":$!\n"; + while(<CMAP>){ + + if(/^\s+$/){ next;} + my ($k,$name,$length) = split; + $chr->{$k}->{name}=$name; + $chr->{$k}->{length}=$length; + $chrID->{$name}=$k; + $chr->{nb_chrs}++; + + } + close CMAP; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Read the mate file according the input format file (solid, eland or sam) +sub readMateFile{ + + my ($chr1,$chr2,$pos1,$pos2,$order1,$order2,$t,$file_type,$tag_length)=@_; + my ($strand1,$strand2); + + if($file_type eq "solid"){ + + ($$chr1,$$chr2,$$pos1,$$pos2,$$order1,$$order2)=($$t[6],$$t[7],$$t[8]+1,$$t[9]+1,1,2); #0-based + + }else{ + my ($tag_length1,$tag_length2); + ($$chr1,$$chr2,$$pos1,$strand1,$$pos2,$strand2,$$order1,$$order2,$tag_length1,$tag_length2)=($$t[11],$$t[12],$$t[7],$$t[8],$$t[9],$$t[10],1,2,length($$t[1]),length($$t[2])) #1-based + if($file_type eq "eland"); + + if($file_type eq "sam"){ + + return 0 if ($$t[0]=~/^@/); #header sam filtered out + + ($$chr1,$$chr2,$$pos1,$$pos2)=($$t[2],$$t[6],$$t[3],$$t[7]); + + return 0 if ($$chr1 eq "*" || $$chr2 eq "*"); + + $$chr2=$$chr1 if($$chr2 eq "="); + + $strand1 = (($$t[1]&0x0010))? 'R':'F'; + $strand2 = (($$t[1]&0x0020))? 'R':'F'; + + $$order1= (($$t[1]&0x0040))? '1':'2'; + $$order2= (($$t[1]&0x0080))? '1':'2'; + $tag_length1 = $tag_length->{$$order1}; + $tag_length2 = $tag_length->{$$order2}; + } + + $$pos1 = -($$pos1+$tag_length1) if ($strand1 eq "R"); #get sequencing starts + $$pos2 = -($$pos2+$tag_length2) if ($strand2 eq "R"); + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Parsing of the mates files and creation of links between 2 chromosomal fragments +sub linking{ + + my ($chr,$chrID,$pair,$tag_length,$window_dist,$step,$mates_file,$input_format,$sv_type,$links_file)=@_; + my %link; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Linking procedure...\n"; + print LOG "-- file=$mates_file\n". + "-- chromosome=$fchr\n". + "-- input format=$input_format\n". + "-- type=$sv_type\n". + "-- read1 length=$tag_length->{1}, read2 length=$tag_length->{2}\n". + "-- window size=$window_dist, step length=$step\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; #gzcat + }elsif($mates_file =~ /.bam$/){ + $fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + + while(<$fh>){ + + my @t=split; #for each mate-pair + my $mate=$t[0]; + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1,$end_order_read2); + + next if(exists $$pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2, \$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} && exists $chrID->{$chr_read2}); + + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + if($sv_type ne "all"){ + if( ($sv_type eq "inter") && ($chr_read1 ne $chr_read2) || + ($sv_type eq "intra") && ($chr_read1 eq $chr_read2) ){ + }else{ + next; + } + } + + $$pair{$mate}=[$chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2 ]; #fill out the hash pair table (ready for the defineCoordsLinks function) + + $record++; + + my ($coord_start_read1,$coord_end_read1,$coord_start_read2,$coord_end_read2); #get the coordinates of each read + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ #fast genome parsing for link creation + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + if(overlap($coord_start_read1,$coord_end_read1,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])){ + + for(my $j=1;$j<=$chr->{$chr_read2}->{'nb_frag'};$j++){ + + if (abs ($coord_start_read2-${$chr->{$chr_read2}->{$j}}[0]) <= $window_dist) { + + if(overlap($coord_start_read2,$coord_end_read2,${$chr->{$chr_read2}->{$j}}[0],${$chr->{$chr_read2}->{$j}}[1])){ + + makeLink(\%link,$chr_read1,$i,$chr_read2,$j,$mate,\$nb_links); #make the link + } + + }else{ + + $j=getNextFrag($coord_start_read2,$j,${$chr->{$chr_read2}->{$j}}[0],$chr->{$chr_read2}->{nb_frag},$window_dist,$step); + } + } + } + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn mate-pairs analysed - $nb_links links done\n"; + $warn+=10000; + } + } + $fh->close; + + if(!$nb_links){ + print LOG "-- $fchr : No mate-pairs !\n". + "-- $fchr : No links have been found with the selected type of structural variations \($sv_type\)\n"; + } + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_links links done\n"; + + print LOG "-- $fchr : writing...\n"; + + $fh = new FileHandle; + + $fh->open(">".$links_file) or die "$0: can't write in the output ".$links_file." :$!\n"; + + foreach my $chr1 ( sort { $a <=> $b} keys %link){ #Sorted links output + + foreach my $chr2 ( sort { $a <=> $b} keys %{$link{$chr1}}){ + + foreach my $frag1 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}}){ + + foreach my $frag2 ( sort { $a <=> $b} keys %{$link{$chr1}{$chr2}{$frag1}}){ + + my @count=split(",",$link{$chr1}{$chr2}{$frag1}{$frag2}); + print $fh "$chr->{$chr1}->{name}\t".(${$chr->{$chr1}->{$frag1}}[0]+1)."\t".(${$chr->{$chr1}->{$frag1}}[1]+1)."\t". + "$chr->{$chr2}->{name}\t".(${$chr->{$chr2}->{$frag2}}[0]+1)."\t".(${$chr->{$chr2}->{$frag2}}[1]+1)."\t". + scalar @count."\t". #nb of read + $link{$chr1}{$chr2}{$frag1}{$frag2}."\n"; #mate list + } + } + } + } + + $fh->close; + + undef %link; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#remove exact links doublons according to the mate list +sub getUniqueLinks{ + + my ($links_file,$nrlinks_file)=@_; + my %links; + my %pt; + my $nb_links; + my $n=1; + + my $record=0; + my $warn=300000; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + print LOG "# $fchr : Getting unique links...\n"; + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + + while(<$fh>){ + + my @t=split; + my $mates=$t[7]; + $record++; + + if(!exists $links{$mates}){ #Unique links selection + + $links{$mates}=[@t]; + $pt{$n}=$links{$mates}; + $n++; + + + }else{ #get the link coordinates from the mate-pairs list + + for my $i (1,2,4,5){ #get the shortest regions + + $links{$mates}->[$i]=($t[$i]>$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #maximum start + if($i==1 || $i==4); + $links{$mates}->[$i]=($t[$i]<$links{$mates}->[$i])? $t[$i]:$links{$mates}->[$i] #minimum end + if($i==2 || $i==5); + } + } + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - ".($n-1)." unique links done\n"; + $warn+=300000; + } + } + $fh->close; + + $nb_links=$n-1; + print LOG "-- $fchr : Total : $record links analysed - $nb_links unique links done\n"; + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb_links){ + + print $fh join("\t",@{$pt{$i}})."\n"; #all links output + } + + $fh->close; + + undef %links; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#get the new coordinates of each link from the mate list +sub defineCoordsLinks{ + + my ($chr,$chrID,$pair,$input_format,$sv_type,$tag_length,$links_file,$clinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$clinks_file") or die "$0: can't write in the output: $clinks_file :$!\n"; + + print LOG "# $fchr : Defining precise link coordinates...\n"; + + my $record=0; + my $warn=100000; + + my %coords; + my %strands; + my %order; + my %ends_order; + + while(<$fh>){ + + + my ($col1,$col2)=(1,2); #for an intrachromosomal link + my $diffchr=0; #difference between chr1 and chr2 + my ($chr1,$chr2,$mates_list,$npairs)=(split)[0,3,7,8]; + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + if ($chr1 != $chr2){ #for an interchromosomal link + $col1=$col2=0; #no distinction + $diffchr=1; + } + + my @pairs=split(",",$mates_list); + + $coords{$col1}{$chr1}->{start}=undef; + $coords{$col1}{$chr1}->{end}=undef; + $coords{$col2}{$chr2}->{start}=undef; + $coords{$col2}{$chr2}->{end}=undef; + $strands{$col1}{$chr1}=undef; + $strands{$col2}{$chr2}=undef; + $ends_order{$col1}{$chr1}=undef; + $ends_order{$col2}{$chr2}=undef; + + + $order{$col1}{$chr1}->{index}->{1}=undef; + $order{$col1}{$chr1}->{index}->{2}=undef; + $order{$col2}{$chr2}->{index}->{1}=undef; + $order{$col2}{$chr2}->{index}->{2}=undef; + $order{$col1}{$chr1}->{order}=undef; + $order{$col2}{$chr2}->{order}=undef; + + $record++; + + for my $p (0..$#pairs){ #for each pair + + my ($coord_start_read1,$coord_end_read1); + my ($coord_start_read2,$coord_end_read2); + my $strand_read1=recupCoords(${$$pair{$pairs[$p]}}[2],\$coord_start_read1,\$coord_end_read1,$tag_length->{${$$pair{$pairs[$p]}}[4]},$input_format); + my $strand_read2=recupCoords(${$$pair{$pairs[$p]}}[3],\$coord_start_read2,\$coord_end_read2,$tag_length->{${$$pair{$pairs[$p]}}[5]},$input_format); + + if(!$diffchr){ #for a intrachromosomal link + if($coord_start_read2<$coord_start_read1){ #get the closer start coordinate for each column + ($col1,$col2)=(2,1); + }else{ + ($col1,$col2)=(1,2); + } + } + + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{start}},$coord_start_read1); #get coords and strands of f3 and r3 reads + push(@{$coords{$col1}{${$$pair{$pairs[$p]}}[0]}->{end}},$coord_end_read1); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{start}},$coord_start_read2); + push(@{$coords{$col2}{${$$pair{$pairs[$p]}}[1]}->{end}},$coord_end_read2); + push(@{$strands{$col1}{${$$pair{$pairs[$p]}}[0]}},$strand_read1); + push(@{$strands{$col2}{${$$pair{$pairs[$p]}}[1]}},$strand_read2); + push(@{$ends_order{$col1}{${$$pair{$pairs[$p]}}[0]}},${$$pair{$pairs[$p]}}[4]); + push(@{$ends_order{$col2}{${$$pair{$pairs[$p]}}[1]}},${$$pair{$pairs[$p]}}[5]); + } + + ($col1,$col2)=(1,2) if(!$diffchr); + + my $coord_start_chr1=min(min(@{$coords{$col1}{$chr1}->{start}}),min(@{$coords{$col1}{$chr1}->{end}})); #get the biggest region + my $coord_end_chr1=max(max(@{$coords{$col1}{$chr1}->{start}}),max(@{$coords{$col1}{$chr1}->{end}})); + my $coord_start_chr2=min(min(@{$coords{$col2}{$chr2}->{start}}),min(@{$coords{$col2}{$chr2}->{end}})); + my $coord_end_chr2=max(max(@{$coords{$col2}{$chr2}->{start}}),max(@{$coords{$col2}{$chr2}->{end}})); + + @{$order{$col1}{$chr1}->{index}->{1}}= sort {${$coords{$col1}{$chr1}->{start}}[$a] <=> ${$coords{$col1}{$chr1}->{start}}[$b]} 0 .. $#{$coords{$col1}{$chr1}->{start}}; + @{$order{$col2}{$chr2}->{index}->{1}}= sort {${$coords{$col2}{$chr2}->{start}}[$a] <=> ${$coords{$col2}{$chr2}->{start}}[$b]} 0 .. $#{$coords{$col2}{$chr2}->{start}}; + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{1}}){ #get the rank of the chr2 reads according to the sorted chr1 reads (start coordinate sorting) + foreach my $j (@{$order{$col2}{$chr2}->{index}->{1}}){ + + if(${$order{$col1}{$chr1}->{index}->{1}}[$i] == ${$order{$col2}{$chr2}->{index}->{1}}[$j]){ + ${$order{$col1}{$chr1}->{index}->{2}}[$i]=$i; + ${$order{$col2}{$chr2}->{index}->{2}}[$i]=$j; + last; + } + } + } + + foreach my $i (@{$order{$col1}{$chr1}->{index}->{2}}){ #use rank chr1 as an ID + foreach my $j (@{$order{$col2}{$chr2}->{index}->{2}}){ + + if(${$order{$col1}{$chr1}->{index}->{2}}[$i] == ${$order{$col2}{$chr2}->{index}->{2}}[$j]){ + ${$order{$col1}{$chr1}->{order}}[$i]=$i+1; + ${$order{$col2}{$chr2}->{order}}[$i]=$j+1; + last; + } + } + } + + @pairs=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},\@pairs);#sorting of the pairs, strands, and start coords from the sorted chr2 reads + @{$strands{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col1}{$chr1}); + @{$strands{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$strands{$col2}{$chr2}); + @{$ends_order{$col1}{$chr1}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col1}{$chr1}); + @{$ends_order{$col2}{$chr2}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$ends_order{$col2}{$chr2}); + @{$coords{$col1}{$chr1}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col1}{$chr1}->{start}); + @{$coords{$col2}{$chr2}->{start}}=sortTablebyIndex(\@{$order{$col1}{$chr1}->{index}->{1}},$coords{$col2}{$chr2}->{start}); + + + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + scalar @pairs, + join(",",@pairs), + join(",",@{$strands{$col1}{$chr1}}), + join(",",@{$strands{$col2}{$chr2}}), + join(",",@{$ends_order{$col1}{$chr1}}), + join(",",@{$ends_order{$col2}{$chr2}}), + join(",",@{$order{$col1}{$chr1}->{order}}), + join(",",@{$order{$col2}{$chr2}->{order}}), + join(",",@{$coords{$col1}{$chr1}->{start}}), + join(",",@{$coords{$col2}{$chr2}->{start}})); + + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed\n"; + $warn+=100000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links processed\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Sort links according the concerned chromosomes and their coordinates +sub sortLinks{ + + my ($links_file,$sortedlinks_file,$unique)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + print LOG "# $fchr : Sorting links...\n"; + + my $pipe=($unique)? "| sort -u":""; + system "sort -k 1,1 -k 4,4 -k 2,2n -k 5,5n -k 8,8n $links_file $pipe > $sortedlinks_file"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#removal of fully overlapped links +sub removeFullyOverlappedLinks{ + + my ($links_file,$nrlinks_file,$warn_out)=@_; + + my %pt; + my $n=1; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + my $fh = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + while(<$fh>){ + + my @t=split("\t",$_); + $pt{$n}=[@t]; + $n++; + } + $fh->close; + + my $nb_links=$n-1; + my $nb=$nb_links; + + my %pt2; + my $nb2=1; + my $record=0; + my $warn=10000; + + print LOG "# $fchr : Removing fully overlapped links...\n"; + + LINK: + + for my $i (1..$nb){ + + my @link=(); + my @next_link=(); + my $ind1=$i; + + $record++; + if($record>=$warn){ + print LOG "-- $fchr : $warn unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + $warn+=10000; + } + + if(exists $pt{$ind1}){ + @link=@{$pt{$ind1}}; #link1 + }else{ + next LINK; + } + + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=($link[0],$link[1],$link[2],$link[3],$link[4],$link[5]); #get info of link1 + my @mates=deleteBadOrderSensePairs(split(",",$link[7])); + + my $ind2=$ind1+1; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); #get the next found link + + if($ind2<=$nb){ + + @next_link=@{$pt{$ind2}}; #link2 + my ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); #get info of link2 + my @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + while(($chr1 eq $chr3 && $chr2 eq $chr4) && overlap($start1,$end1,$start3,$end3)){ #loop here according to the chr1 coordinates, need an overlap between links to enter + + if(!overlap($start2,$end2,$start4,$end4)){ #if no overlap with chr2 coordinates ->next link2 + + $ind2++; + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + next; + } + + my %mates=map{$_ =>1} @mates; #get the equal number of mates + my @same_mates = grep( $mates{$_}, @next_mates ); + my $nb_mates= scalar @same_mates; + + if($nb_mates == scalar @mates){ + + delete $pt{$ind1}; #if pairs of link 1 are all included in link 2 -> delete link1 + next LINK; #go to link2, link2 becomes link1 + + }else{ + delete $pt{$ind2} if($nb_mates == scalar @next_mates); #if pairs of link2 are all included in link 1 -> delete link2 + $ind2++; #we continue by checking the next link2 + $ind2++ while (!exists $pt{$ind2}&& $ind2<=$nb); + + if($ind2>$nb){ #if no more link in the file -> save link1 + + $pt2{$nb2}=\@link; + $nb2++; + next LINK; + } + + @next_link=@{$pt{$ind2}}; #get info of link2 + ($chr3,$start3,$end3,$chr4,$start4,$end4)=($next_link[0],$next_link[1],$next_link[2],$next_link[3],$next_link[4],$next_link[5]); + @next_mates=deleteBadOrderSensePairs(split(",",$next_link[7])); + + } + } + } + $pt2{$nb2}=\@link; #if no (more) link with chr1 coordinates overlap -> save link1 + $nb2++; + } + + print LOG "-- $fchr : Total : $nb_links unique links analysed - ".($nb2-1)." non-overlapped links done\n"; + + #OUTPUT + + $fh = new FileHandle; + $fh->open(">$nrlinks_file") or die "$0: can't write in the output: $nrlinks_file :$!\n"; + print LOG "-- $fchr : writing...\n"; + for my $i (1..$nb2-1){ + + print $fh join("\t",@{$pt2{$i}}); #all links output + } + + close $fh; + + print LOG "-- $fchr : output created: $nrlinks_file\n" if($warn_out); + + undef %pt; + undef %pt2; +} +#------------------------------------------------------------------------------# +sub postFiltering { + + my ($links_file,$pflinks_file, $finalScore_thres)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my ($nb,$nb2)=(0,0); + + print LOG "# $fchr : Post-filtering links...\n"; + print LOG "-- $fchr : final score threshold = $finalScore_thres\n"; + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$pflinks_file") or die "$0: can't write in the output: $pflinks_file :$!\n"; + + + while(<$fh>){ + + my @t=split("\t",$_); + my $score=$t[$#t-1]; + + if($score >= $finalScore_thres){ + print $fh2 join("\t", @t); + $nb2++; + } + $nb++; + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $nb unique links analysed - $nb2 links kept\n"; + print LOG "-- $fchr : output created: $pflinks_file\n"; +} + + + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links +sub strandFiltering{ + + my($chr,$chrID,$pairs_threshold,$strand_filtering,$chromosomes, + $input_format,$cmap_file,$mate_sense, $tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-1]; + + + my %chrs; + my %chrs1; + my %chrs2; + my $nb_chrs; + my $exclude; + + if($chromosomes){ + my @chrs=split(",",$chromosomes); + $nb_chrs=scalar @chrs; + $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + $chrName=~s/^(\-)//; + my $col=($chrName=~s/_(1|2)$//); + + if(!$col){ + $chrs{$chrID->{$chrName}}=undef + }else{ + $chrs1{$chrID->{$chrName}}=undef if($1==1); + $chrs2{$chrID->{$chrName}}=undef if($1==2); + } + } + } + + my $record=0; + my $nb_links=0; + my $warn=10000; + + my $sens_ratio_threshold=0.6; + + print LOG "\# Filtering procedure...\n"; + print LOG "\# Number of pairs and strand filtering...\n"; + print LOG "-- file=$links_file\n"; + print LOG "-- nb_pairs_threshold=$pairs_threshold, strand_filtering=".(($strand_filtering)? "yes":"no"). + ", chromosomes=".(($chromosomes)? "$chromosomes":"all")."\n"; + + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + my @t=split; #for each link + my $is_good=1; + $record++; + + + if($chromosomes){ + + my ($chr1,$chr2)=($chrID->{$t[0]},$chrID->{$t[3]}); + + if(!$exclude){ + $is_good=(exists $chrs{$chr1} && exists $chrs{$chr2})? 1:0; + $is_good=(exists $chrs1{$chr1} && exists $chrs2{$chr2})? 1:0 if(!$is_good); + $is_good=($nb_chrs==1 && (exists $chrs1{$chr1} || exists $chrs2{$chr2}))? 1:0 if(!$is_good); + }else{ + $is_good=(exists $chrs{$chr1} || exists $chrs{$chr2})? 0:1; + $is_good=(exists $chrs1{$chr1} || exists $chrs2{$chr2})? 0:1 if($is_good); + } + } + + $is_good = ($is_good && $t[6] >= $pairs_threshold)? 1 :0; #filtering according the number of pairs + if($is_good && $strand_filtering){ #if filtering according the strand sense + + my @mates=split(/,/,$t[7]); #get the concordant pairs in the strand sense + my @strands1=split(/,/,$t[8]); + my @strands2=split(/,/,$t[9]); + + my %mate_class=( 'FF' => 0, 'RR' => 0, 'FR' => 0, 'RF' => 0); + + my %mate_reverse=( 'FF' => 'RR', 'RR' => 'FF', #group1: FF,RR + 'FR' => 'RF', 'RF' => 'FR'); #group2: FR,RF + + my %mate_class2=( $mate_sense=>"NORMAL_SENSE", inverseSense($mate_sense)=>"NORMAL_SENSE", + substr($mate_sense,0,1).inverseSense(substr($mate_sense,1,1))=>"REVERSE_SENSE", + inverseSense(substr($mate_sense,0,1)).substr($mate_sense,1,1)=>"REVERSE_SENSE"); + + if($t[6] == 1){ + + push(@t,$mate_class2{$strands1[0].$strands2[0]},"1/1",1,1); + + }else{ + + tie (my %class,'Tie::IxHash'); + my $split; + + foreach my $i (0..$#mates){ + $mate_class{$strands1[$i].$strands2[$i]}++; #get the over-represented group + } + + my $nb_same_sens_class=$mate_class{FF}+$mate_class{RR}; + my $nb_diff_sens_class=$mate_class{FR}+$mate_class{RF}; + my $sens_ratio=max($nb_same_sens_class,$nb_diff_sens_class)/($nb_same_sens_class+$nb_diff_sens_class); + + if($sens_ratio < $sens_ratio_threshold){ + %class=(1=>'FF', 2=>'FR'); + $split=1; + }else{ + $class{1}=($nb_same_sens_class > $nb_diff_sens_class)? 'FF':'FR'; #if yes get the concerned class + $split=0; + } + + $is_good=getConsistentSenseLinks(\@t,\@mates,\@strands1,\@strands2,$tag_length,$mate_sense,\%mate_reverse,\%mate_class2,\%class,$split,$pairs_threshold); + } + } + + if($is_good){ #PRINT + + my $nb=scalar @t; + if($nb > 20){ + my @t2=splice(@t,0,20); + print $fh2 join("\t",@t2)."\n"; + $nb_links++; + } + $nb_links++; + print $fh2 join("\t",@t)."\n"; + } + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : No links have been found with the selected filtering parameters\n" if(!$nb_links); + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getConsistentSenseLinks{ + + my ($t,$mates,$strands1,$strands2,$tag_length,$mate_sense, $mate_reverse,$mate_class2, $class, $split,$thres)=@_; + + my $npairs=scalar @$mates; + + my @ends_order1 = split (/,/,$$t[10]); + my @ends_order2 = split (/,/,$$t[11]); + my @order1 = split (/,/,$$t[12]); + my @order2 = split (/,/,$$t[13]); + my @positions1 = split (/,/,$$t[14]); + my @positions2 = split (/,/,$$t[15]); + + my @newlink; + + foreach my $ind (keys %{$class} ){ + + tie (my %flink,'Tie::IxHash'); + my @orders2remove=(); + + foreach my $i (0..$#{$mates}){ #get the pairs belonging the over-represented group + + if((($$strands1[$i].$$strands2[$i]) eq $$class{$ind}) || (($$strands1[$i].$$strands2[$i]) eq $$mate_reverse{$$class{$ind}})){ + push(@{$flink{mates}},$$mates[$i]); + push(@{$flink{strands1}},$$strands1[$i]); + push(@{$flink{strands2}},$$strands2[$i]); + push(@{$flink{ends_order1}},$ends_order1[$i]); + push(@{$flink{ends_order2}},$ends_order2[$i]); + push(@{$flink{positions1}},$positions1[$i]); + push(@{$flink{positions2}},$positions2[$i]); + + }else{ + push(@orders2remove,$order1[$i]); + } + } + + @{$flink{order1}}=(); + @{$flink{order2}}=(); + if(scalar @orders2remove > 0){ + getNewOrders(\@order1,\@order2,\@orders2remove,$flink{order1},$flink{order2}) + }else{ + @{$flink{order1}}=@order1; + @{$flink{order2}}=@order2; + } + + my @ends1; getEnds(\@ends1,$flink{positions1},$flink{strands1},$flink{ends_order1},$tag_length); + my @ends2; getEnds(\@ends2,$flink{positions2},$flink{strands2},$flink{ends_order2},$tag_length); + + my $fnpairs=scalar @{$flink{mates}}; + my $strand_filtering_ratio=$fnpairs."/".$npairs; + my $real_ratio=$fnpairs/$npairs; + + if($fnpairs>=$thres){ #filtering according the number of pairs + + push(@newlink, + $$t[0], + min(min(@{$flink{positions1}}),min(@ends1)), + max(max(@{$flink{positions1}}),max(@ends1)), + $$t[3], + min(min(@{$flink{positions2}}),min(@ends2)), + max(max(@{$flink{positions2}}),max(@ends2)), + $fnpairs, + join(",",@{$flink{mates}}), + join(",",@{$flink{strands1}}), + join(",",@{$flink{strands2}}), + join(",",@{$flink{ends_order1}}), + join(",",@{$flink{ends_order2}}), + join(",",@{$flink{order1}}), + join(",",@{$flink{order2}}), + join(",",@{$flink{positions1}}), + join(",",@{$flink{positions2}}), + $$mate_class2{${$flink{strands1}}[0].${$flink{strands2}}[0]}, + $strand_filtering_ratio, + $real_ratio, + $npairs + ); + } + } + + if (grep {defined($_)} @newlink) { + @$t=@newlink; + return 1 + } + return 0; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNewOrders{ + + my($tab1,$tab2,$list,$newtab1,$newtab2)=@_; + my $j=1; + my $k=1; + for my $i (0..$#{$tab2}){ + my $c=0; + for my $j (0..$#{$list}){ + $c++ if(${$list}[$j] < ${$tab2}[$i]); + if(${$list}[$j] == ${$tab2}[$i]){ + $c=-1; last; + } + } + if($c!=-1){ + push(@{$newtab2}, ${$tab2}[$i]-$c); + push(@{$newtab1}, $k); + $k++; + } + } +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#Filtering of the links using their order +sub orderFiltering { + + my ($chr,$chrID,$nb_pairs_threshold,$nb_pairs_order_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $warn=10000; + my $nb_links=0; + + my $quant05 = 1.644854; + my $quant001 = 3.090232; + my $alphaDist = $quant05 * 2 * $sigma; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + + print LOG "\# Filtering by order...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, nb pairs order threshold=$nb_pairs_order_threshold\n"; + print LOG "-- distance between comparable pairs was set to $alphaDist\n"; + print LOG "-- maximal fragment length was set to $maxFragmentLength\n"; + + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + my @pairs=split(",",$mates_list); + ($chr1,$chr2) = ($chrID->{$chr1},$chrID->{$chr2}); + my ($coord_start_chr1,$coord_end_chr1,$coord_start_chr2,$coord_end_chr2) = @t[1,2,4,5]; + my $numberOfPairs = $t[6]; + my @strand1 = split (/,/,$t[8]); + my @strand2 = split (/,/,$t[9]); + my @ends_order1 = split (/,/,$t[10]); + my @ends_order2 = split (/,/,$t[11]); + my @order1 = split (/,/,$t[12]); + my @order2 = split (/,/,$t[13]); + my @positions1 = split (/,/,$t[14]); + my @positions2 = split (/,/,$t[15]); + my @ends1; getEnds(\@ends1,\@positions1,\@strand1,\@ends_order1,$tag_length); + my @ends2; getEnds(\@ends2,\@positions2,\@strand2,\@ends_order2,$tag_length); + my $clusterCoordinates_chr1; + my $clusterCoordinates_chr2; + my $reads_left = 0; + + my $ifRenv = $t[16]; + my $strand_ratio_filtering=$t[17]; + + #kind of strand filtering. For example, will keep only FFF-RRR from a link FFRF-RRRF if <F-R> orientation is correct + my ($singleBreakpoint, %badInFRSense) = findBadInFRSenseSOLiDSolexa(\@strand1,\@strand2,\@ends_order1,\@ends_order2,\@order1,\@order2,$mate_sense); + #find pairs type F-RRRR or FFFF-R in the case if <R-F> orientation is correct + #These pairs are annotated as BED pairs forever! They won't be recycled! + my $table; + for my $i (0..$numberOfPairs-1) { #fill the table with non adequate pairs: pairID numberOfNonAdPairs nonAdPairIDs + my $nonAdeq = 0; + for my $j (0..$i-1) { + if (exists($table->{$j}->{$i})) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + for my $j ($i+1..$numberOfPairs-1) { + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $table->{$i}->{$j} = 1; + } + } + } + $table->{$i}->{nonAdeq} = $nonAdeq; + } + + for my $bad (keys %badInFRSense) { #remove pairs type F-RRRR or FFFF-R in the case of <R-F> orientation + &remove($bad,$table); + } + + my @falseReads; + #RRRR-F -> RRRR or R-FFFF -> FFFF + @falseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense, keys %{$table}); + #these pairs will be recycled later as $secondTable + for my $bad (@falseReads) { + &remove($bad,$table); + } + + my $bad = &check($table); + while ($bad ne "OK") { #clear the table to reject non adequate pairs in the sense of ORDER + # push (@falseReads, $bad); remove completely!!! + &remove($bad,$table); + $bad = &check($table); + } + + $reads_left = scalar keys %{$table}; + my $coord_start_chr1_cluster1 = min(min(@positions1[sort {$a<=>$b} keys %{$table}]),min(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr1_cluster1 = max(max(@positions1[sort {$a<=>$b} keys %{$table}]),max(@ends1[sort {$a<=>$b} keys %{$table}])); + my $coord_start_chr2_cluster1 = min(min(@positions2[sort {$a<=>$b} keys %{$table}]),min(@ends2[sort {$a<=>$b} keys %{$table}])); + my $coord_end_chr2_cluster1 = max(max(@positions2[sort {$a<=>$b} keys %{$table}]),max(@ends2[sort {$a<=>$b} keys %{$table}])); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + my $ifBalanced = 'UNBAL'; + my $secondTable; + my $clusterCoordinates; + my ($break_pont_chr1,$break_pont_chr2); + + my $signatureType=""; + + my $maxCoord1 =$chr->{$chr1}->{length}; + my $maxCoord2 =$chr->{$chr2}->{length}; + + if (scalar @falseReads) { + @falseReads = sort @falseReads; + #now delete FRFR choosing the majority + my @newfalseReads; #find and remove pairs type RRRR-F or R-FFFF + @newfalseReads = findBadInRFSenseSOLiDSolexa(\@strand1,\@ends_order1,$mate_sense,@falseReads); #these @newfalseReads won't be recycled + my %hashTmp; + for my $count1 (0..scalar(@falseReads)-1) { + my $i = $falseReads[$count1]; + $hashTmp{$i} = 1; + for my $bad (@newfalseReads) { + if ($bad == $i) { + delete $hashTmp{$i}; + next; + } + } + } + @falseReads = sort keys %hashTmp; #what is left + for my $count1 (0..scalar(@falseReads)-1) { #fill the table for reads which were previously rejected + my $nonAdeq = 0; + my $i = $falseReads[$count1]; + + for my $count2 (0..$count1-1) { + my $j = $falseReads[$count2]; + if (exists($secondTable->{$j}->{$i})) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + for my $count2 ($count1+1..scalar(@falseReads)-1) { + my $j = $falseReads[$count2]; + if ($positions1[$j]-$positions1[$i]>$alphaDist) { + if (&reversed ($i,$j,$ifRenv,\@positions2)) { + $nonAdeq++; + $secondTable->{$i}->{$j} = 1; + } + } + } + $secondTable->{$i}->{nonAdeq} = $nonAdeq; + } + + my @falseReads2; + my $bad = &check($secondTable); + while ($bad ne "OK") { #clear the table to reject non adequate pairs + push (@falseReads2, $bad); + &remove($bad,$secondTable); + $bad = &check($secondTable); + } + if (scalar keys %{$secondTable} >= $nb_pairs_order_threshold) { + my $coord_start_chr1_cluster2 = min(min(@positions1[sort {$a<=>$b} keys %{$secondTable}]),min(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr1_cluster2 = max(max(@positions1[sort {$a<=>$b} keys %{$secondTable}]),max(@ends1[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_start_chr2_cluster2 = min(min(@positions2[sort {$a<=>$b} keys %{$secondTable}]),min(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + my $coord_end_chr2_cluster2 = max(max(@positions2[sort {$a<=>$b} keys %{$secondTable}]),max(@ends2[sort {$a<=>$b} keys %{$secondTable}])); + + $ifBalanced = 'BAL'; + + if ($ifBalanced eq 'BAL') { + + if (scalar keys %{$table} < $nb_pairs_order_threshold) { + $ifBalanced = 'UNBAL'; #kill cluster 1! + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + $reads_left = scalar keys %{$table}; + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + + } else { + + $reads_left += scalar keys %{$secondTable}; + next if ($reads_left < $nb_pairs_threshold); + + if ($coord_end_chr1_cluster2 < $coord_start_chr1_cluster1) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + ($coord_start_chr1_cluster1,$coord_start_chr1_cluster2) = ($coord_start_chr1_cluster2,$coord_start_chr1_cluster1); + ($coord_end_chr1_cluster1,$coord_end_chr1_cluster2)=($coord_end_chr1_cluster2,$coord_end_chr1_cluster1); + ($coord_start_chr2_cluster1,$coord_start_chr2_cluster2)=($coord_start_chr2_cluster2,$coord_start_chr2_cluster1); + ($coord_end_chr2_cluster1 , $coord_end_chr2_cluster2)=($coord_end_chr2_cluster2 , $coord_end_chr2_cluster1); + + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.'),'.$clusterCoordinates_chr1; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.'),'.$clusterCoordinates_chr2; + }else { + $clusterCoordinates_chr1 .= ',('.$coord_start_chr1_cluster2.','.$coord_end_chr1_cluster2.')'; + $clusterCoordinates_chr2 .= ',('.$coord_start_chr2_cluster2.','.$coord_end_chr2_cluster2.')'; + } + $coord_start_chr1 = min($coord_start_chr1_cluster1,$coord_start_chr1_cluster2); + $coord_end_chr1 = max($coord_end_chr1_cluster1,$coord_end_chr1_cluster2); + $coord_start_chr2 = min($coord_start_chr2_cluster1,$coord_start_chr2_cluster2); + $coord_end_chr2 = max($coord_end_chr2_cluster1,$coord_end_chr2_cluster2); + #to calculate breakpoints one need to take into account read orientation in claster.. + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @index1 = keys %{$table}; + my @index2 = keys %{$secondTable}; + + my (@generalStrand1,@generalStrand2) = 0; + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @generalStrand1 = translateSolidToRF(\@strand1,\@ends_order1); + @generalStrand2 = translateSolidToRF(\@strand2,\@ends_order2); + } else { + @generalStrand1 = @strand1; + @generalStrand2 = @strand2; # TODO check if it is correct + } + if ($generalStrand1[$index1[0]] eq $leftLetterOk && $generalStrand1[$index2[0]] eq $rightLetterOk) { #(R,F) + $break_pont_chr1 = '('.$coord_end_chr1_cluster1.','.$coord_start_chr1_cluster2.')'; + + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "TRANSLOC"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INS_FRAGMT"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INV_TRANSLOC"; + } + } else { + #should not occur + print STDERR "\nError in orderFiltering\n\n"; + } + } + + elsif ($generalStrand1[$index1[0]] eq $rightLetterOk && $generalStrand1[$index2[0]] eq $leftLetterOk) { #(F,R) + $break_pont_chr1 = '('.max(($coord_end_chr1_cluster1-$maxFragmentLength),1).','.$coord_start_chr1_cluster1.')'; + $break_pont_chr1 .= ',('.$coord_end_chr1_cluster2.','.min(($coord_start_chr1_cluster2+$maxFragmentLength),$maxCoord1).')'; + if ($generalStrand2[$index1[0]] eq $rightLetterOk && $generalStrand2[$index2[0]] eq $leftLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.$coord_end_chr2_cluster2.','.$coord_start_chr2_cluster1.')'; + $signatureType = "INV_INS_FRAGMT"; + } else { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster1-$maxFragmentLength),1).','.$coord_start_chr2_cluster1.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster2.','.min(($coord_start_chr2_cluster2+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "INV_COAMPLICON"; + } + + } elsif ($generalStrand2[$index1[0]] eq $leftLetterOk && $generalStrand2[$index2[0]] eq $rightLetterOk) { + if ($coord_end_chr2_cluster1 >= $coord_end_chr2_cluster2) { + $break_pont_chr2 = '('.max(($coord_end_chr2_cluster2-$maxFragmentLength),1).','.$coord_start_chr2_cluster2.')'; + $break_pont_chr2 .= ',('.$coord_end_chr2_cluster1.','.min(($coord_start_chr2_cluster1+$maxFragmentLength),$maxCoord2).')'; + $signatureType = "COAMPLICON"; + } else { + $break_pont_chr2 = '('.$coord_end_chr2_cluster1.','.$coord_start_chr2_cluster2.')'; + $signatureType = "INS_FRAGMT"; + } + } else { + #should not occur + $signatureType = "UNDEFINED"; + } + } + else { # (F,F) or (R,R) something strange. We will discard the smallest cluster + $ifBalanced = 'UNBAL'; + if (scalar keys %{$secondTable} > scalar keys %{$table}) { + ($table,$secondTable)=($secondTable,$table); #this means that one needs to exchange cluster1 with cluster2 + + $coord_start_chr1_cluster1 = $coord_start_chr1_cluster2; + $coord_end_chr1_cluster1 = $coord_end_chr1_cluster2; + $coord_start_chr2_cluster1 = $coord_start_chr2_cluster2; + $coord_end_chr2_cluster1 = $coord_end_chr2_cluster2; + $clusterCoordinates_chr1 = '('.$coord_start_chr1_cluster1.','.$coord_end_chr1_cluster1.')'; + $clusterCoordinates_chr2 = '('.$coord_start_chr2_cluster1.','.$coord_end_chr2_cluster1.')'; + } + $reads_left = scalar keys %{$table}; + } + if ($ifBalanced eq 'BAL') { + $ifRenv = $signatureType; + } + } + } + } + } + if ($ifBalanced ne 'BAL') { + #define possible break point + $coord_start_chr1 = $coord_start_chr1_cluster1; + $coord_end_chr1 = $coord_end_chr1_cluster1; + $coord_start_chr2 = $coord_start_chr2_cluster1; + $coord_end_chr2 = $coord_end_chr2_cluster1; + + my $region_length_chr1 = $coord_end_chr1-$coord_start_chr1; + my $region_length_chr2 = $coord_end_chr2-$coord_start_chr2; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my @index = keys %{$table}; + unless ($diff_sense_ends) { + my $firstEndOrder1 = $ends_order1[$index[0]]; + my $firstEndOrder2 = $ends_order2[$index[0]]; + $break_pont_chr1 = (($strand1[$index[0]] eq 'R' && $firstEndOrder1 == 2) || ($strand1[$index[0]] eq 'F' && $firstEndOrder1 == 1))?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = (($strand2[$index[0]] eq 'R' && $firstEndOrder2 == 2) || ($strand2[$index[0]] eq 'F' && $firstEndOrder2 == 1))?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } else { + $break_pont_chr1 = ($strand1[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr1.','.min(($coord_start_chr1+$maxFragmentLength),$maxCoord1).')':'('.max(($coord_end_chr1-$maxFragmentLength),1).','.$coord_start_chr1.')'; + $break_pont_chr2 = ($strand2[$index[0]] eq $leftLetterOk )?'('.$coord_end_chr2.','.min(($coord_start_chr2+$maxFragmentLength),$maxCoord2).')':'('.max(($coord_end_chr2-$maxFragmentLength),1).','.$coord_start_chr2.')'; + } + + if ($chr1 ne $chr2){ + $ifRenv="INV_TRANSLOC" if($ifRenv eq "REVERSE_SENSE"); + $ifRenv="TRANSLOC" if($ifRenv eq "NORMAL_SENSE"); + } + } + + if (($ifBalanced eq 'BAL')&&( (scalar keys %{$table}) + (scalar keys %{$secondTable}) < $nb_pairs_threshold)) { + next; #discard the link + } + if (($ifBalanced eq 'UNBAL')&&(scalar keys %{$table} < $nb_pairs_threshold)) { + next; #discard the link + } + my $ratioTxt = "$reads_left/".(scalar @pairs); + my ($n1,$nTot) = split ("/",$strand_ratio_filtering); + my $ratioReal = $reads_left/$nTot; + + if ($coord_start_chr1<=0) { + $coord_start_chr1=1; + } + if ($coord_start_chr2<=0) { + $coord_start_chr2=1; + } + #create output + my @link=($chr->{$chr1}->{name}, $coord_start_chr1 , $coord_end_chr1, #all information output + $chr->{$chr2}->{name}, $coord_start_chr2 , $coord_end_chr2, + $reads_left, + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@pairs), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@strand2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@ends_order2), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order1), + &redraw(2,$table,$secondTable,\%badInFRSense,$ifBalanced,\@order2), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions1), + &redraw(1,$table,$secondTable,\%badInFRSense,$ifBalanced,\@positions2), + $ifRenv, + $strand_ratio_filtering, + $ifBalanced, $ratioTxt, $break_pont_chr1, $break_pont_chr2, + $ratioReal, $nTot); + + $nb_links++; + print $fh2 join("\t",@link)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links analysed - $nb_links links kept\n"; + $warn+=10000; + } + + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets information about ends positions given start, direction and order +sub getEnds { + my ($ends,$starts,$strand,$end_order,$tag_length) = @_; + for my $i (0..scalar(@{$starts})-1) { + $ends->[$i] = getEnd($starts->[$i],$strand->[$i],$end_order->[$i],$tag_length); + } +} +sub getEnd { + my ($start,$strand, $end_order,$tag_length) = @_; + return ($strand eq 'F')? $start+$tag_length->{$end_order}-1:$start-$tag_length->{$end_order}+1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#gets starts and ends Coords when start=leftmost given positions, directions and orders +sub getCoordswithLeftMost { + + my ($starts,$ends,$positions,$strand,$end_order,$tag_length) = @_; + + for my $i (0..scalar(@{$positions})-1) { + + if($strand->[$i] eq 'F'){ + $starts->[$i]=$positions->[$i]; + $ends->[$i]=$positions->[$i]+$tag_length->{$end_order->[$i]}-1; + }else{ + $starts->[$i]=$positions->[$i]-$tag_length->{$end_order->[$i]}+1; + $ends->[$i]=$positions->[$i]; + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub addInsertionInfo { #add field with INS,DEL,NA and distance between clusters and performs filtering + + my ($chr,$chrID,$nb_pairs_threshold,$order_filtering,$indel_sigma_threshold,$dup_sigma_threshold,$singleton_sigma_threshold,$mu,$sigma,$mate_sense,$tag_length,$links_file,$flinks_file)=@_; + + my @sfile=split(/\./,$links_file); + my $fchr=$sfile[$#sfile-2]; + + + my $diff_sense_ends=(($mate_sense eq "FR") || ($mate_sense eq "RF"))? 1:0; + + my $record=0; + my $nb_links=0; + my $warn=10000; + + print LOG "\# Filtering out normal pairs using insert size...\n"; + print LOG "-- mu length=$mu, sigma length=$sigma, indel sigma threshold=$indel_sigma_threshold, dup sigma threshold=$dup_sigma_threshold\n"; + print LOG "-- using ".($mu-$indel_sigma_threshold*$sigma)."-". + ($mu+$indel_sigma_threshold*$sigma)." as normal range of insert size for indels\n"; + print LOG "-- using ".($mu-$dup_sigma_threshold*$sigma)."-". + ($mu+$dup_sigma_threshold*$sigma)." as normal range of insert size for duplications\n"; + print LOG "-- using ".($mu-$singleton_sigma_threshold*$sigma)." as the upper limit of insert size for singletons\n" if($mate_sense eq "RF"); + + my $fh = new FileHandle; + my $fh2 = new FileHandle; + + $fh->open("<$links_file") or die "$0: can't open $links_file :$!\n"; + $fh2->open(">$flinks_file") or die "$0: can't write in the output: $flinks_file :$!\n"; + + while(<$fh>){ + + $record++; + my @t = split; + my ($chr1,$chr2,$mates_list)=@t[0,3,7]; + + if($chrID->{$chr1} ne $chrID->{$chr2}) { #if inter-chromosomal link here (because sv_type=all), + $nb_links++; + + $t[16]="INV_TRANSLOC" if($t[16] eq "REVERSE_SENSE"); + $t[16]="TRANSLOC" if($t[16] eq "NORMAL_SENSE"); + + $t[16].= "\t"; + $t[19].= "\t"; + + print $fh2 join("\t",@t)."\n"; + + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + next; + } + + my $ifRenv = $t[16]; + my $ifBalanced = "UNBAL"; + $ifBalanced = $t[18] if ($order_filtering); + + my $numberOfPairs = $t[6]; + my @positions1 = deleteBadOrderSensePairs(split (/,/,$t[14])); + my @positions2 = deleteBadOrderSensePairs(split (/,/,$t[15])); + + if ($ifBalanced eq "BAL") { + + if ($ifRenv eq "INV_TRANSLOC") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + if ($ifRenv eq "NORMAL_SENSE") { + $ifRenv = "TRANSLOC"; + } + if ($ifRenv eq "REVERSE_SENSE") { + $ifRenv = "INV_FRAGMT"; #for intrachromosomal inverted translocation is the same as inverted fragment + } + $t[19].= "\t"; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= $numberOfPairs; + + $t[16] = $ifRenv."\t".$meanDistance; + #dont touch the annotation. It should be already OK. + + } else { + #only for unbalanced + + my $ifoverlap=overlap($t[1],$t[2],$t[4],$t[5]); + + my $ends_sense_class = (deleteBadOrderSensePairs(split (/,/,$t[8])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[9])))[0]; + my $ends_order_class = (deleteBadOrderSensePairs(split (/,/,$t[10])))[0]. + (deleteBadOrderSensePairs(split (/,/,$t[11])))[0]; + + my $indel_type = $ifRenv; + + my $meanDistance = "N/A"; + + ($meanDistance, $indel_type) = checkIndel ($numberOfPairs, #identify insertion type for rearrangments without inversion, calculates distance between cluster + \@positions1, #assign N/A to $indel_type if unknown + \@positions2, + $ifRenv, + $ifoverlap, + $indel_sigma_threshold, + $dup_sigma_threshold, + $singleton_sigma_threshold, + $mu, + $sigma, + $ifBalanced, + $ends_sense_class, + $ends_order_class, + $mate_sense, + $diff_sense_ends, + ); + + #filtering of pairs with distance inconsistant with the SV + if ($ifRenv ne "REVERSE_SENSE") { + my $maxCoord1 =$chr->{$chrID->{$chr1}}->{length}; + my $maxCoord2 =$chr->{$chrID->{$chr2}}->{length}; + $meanDistance = recalc_t_usingInsertSizeInfo(\@t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense, + $maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering); + next if ($t[6] < $nb_pairs_threshold); + }else{ + $t[19].= "\t"; + } + $t[16] = $indel_type."\t".$meanDistance; + } + + $nb_links++; + + print $fh2 join("\t",@t)."\n"; + if($record>=$warn){ + print LOG "-- $fchr : $warn links processed - $nb_links links kept\n"; + $warn+=10000; + } + } + $fh->close; + $fh2->close; + + print LOG "-- $fchr : Total : $record links analysed - $nb_links links kept\n"; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkIndel { + + my ($numberOfPairs, $positions1, $positions2, $ifRenv, $ifoverlap, $indel_sigma_threshold, $dup_sigma_threshold, $singleton_sigma_threshold, + $mu, $sigma, $ifBalanced,$ends_sense_class,$ends_order_class,$mate_sense,$diff_sense_ends) = @_; + + my $meanDistance = 0; + + for my $i (0..$numberOfPairs-1) { + $meanDistance += $positions2->[$i]-$positions1->[$i]; + } + $meanDistance /= $numberOfPairs; + + return ($meanDistance,"INV_DUPLI") if (($ifRenv eq "REVERSE_SENSE") && ($meanDistance<$mu+$dup_sigma_threshold*$sigma) ); + + return ($meanDistance,"INVERSION") if ($ifRenv eq "REVERSE_SENSE"); + + if($diff_sense_ends){ + return ($meanDistance, "LARGE_DUPLI") if ($ends_sense_class ne $mate_sense) && ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + return ($meanDistance, "SINGLETON") if (($meanDistance<$mu-$singleton_sigma_threshold*$sigma) && $mate_sense eq "RF" && ($ends_sense_class eq inverseSense($mate_sense))); + }else{ + return ($meanDistance, "LARGE_DUPLI") if (($ends_sense_class eq $mate_sense) && ($ends_order_class eq "12") || ($ends_sense_class eq inverseSense($mate_sense)) && ($ends_order_class eq "21")) && + ($meanDistance>$mu+$dup_sigma_threshold*$sigma) ; + } + + return ($meanDistance, "SMALL_DUPLI") if (($meanDistance<$mu-$dup_sigma_threshold*$sigma) && $ifoverlap); + + return ($meanDistance, "DUPLICATION") if ($diff_sense_ends && ($ends_sense_class ne $mate_sense) && ($meanDistance<$mu-$dup_sigma_threshold*$sigma) ) ; + + return ($meanDistance, "INSERTION") if ($meanDistance<$mu -$indel_sigma_threshold*$sigma); + return ($meanDistance, "DELETION") if ($meanDistance>$mu+$indel_sigma_threshold*$sigma); + + return ($meanDistance, "UNDEFINED"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#sub reacalulate @t so that get rid of unconsistent pairs (unconsistent insert size ) +sub recalc_t_usingInsertSizeInfo { + my($t,$mu,$sigma,$meanDistance,$tag_length,$diff_sense_ends,$mate_sense,$maxCoord1,$maxCoord2,$ends_sense_class,$ends_order_class,$nb_pairs_threshold,$order_filtering) = @_; + + my @badPairs; + + my @positions1 = getAllEntries($t->[14]); + my @positions2 = getAllEntries($t->[15]); + + if ($meanDistance < $mu) { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]>=$mu) { + push(@badPairs,$i); + } + } + } else { + for my $i (0..scalar(@positions1)-1) { + if (substr($positions2[$i],-1,1) ne '$' && substr($positions2[$i],-1,1) ne '*' && $positions2[$i]-$positions1[$i]<=$mu) { + push(@badPairs,$i); + } + } + } + + if (scalar (@badPairs)>0) { + #print join("\t",@badPairs).": ".join("\t",@t)."\n"; + #remove these inconsistant links + $t->[6] -= scalar(@badPairs); #numberOfPairs + return if ($t->[6] < $nb_pairs_threshold); + + $t->[7] = mark_values(\@badPairs, $t->[7]); + $t->[8] = mark_values(\@badPairs, $t->[8]); + $t->[9] = mark_values(\@badPairs, $t->[9]); + $t->[10] = mark_values(\@badPairs, $t->[10]); + $t->[11] = mark_values(\@badPairs, $t->[11]); + + $t->[12] = mark_indexes(\@badPairs, $t->[12]); + $t->[13] = mark_indexes(\@badPairs, $t->[13]); + + $t->[14] = mark_values(\@badPairs, $t->[14]); + $t->[15] = mark_values(\@badPairs, $t->[15]); + $t->[19] = recalculate_ratio($t->[6],$t->[19]) if ($order_filtering); #add the second ratio + $t->[17] = recalculate_ratio($t->[6],$t->[17]) unless ($order_filtering); + ($t->[1],$t->[2]) = recalculate_boundaries($t->[14],$t->[8],$t->[10],$tag_length); + ($t->[4],$t->[5]) = recalculate_boundaries($t->[15],$t->[9],$t->[11],$tag_length); + + #recalc breakpoints: + my $quant001 = 3.090232; + my $maxFragmentLength = &floor($quant001 * $sigma + $mu); + $t->[20] = recalc_breakpoints($mate_sense,$maxCoord1,$t->[14],substr($ends_sense_class,0,1),substr($ends_order_class,0,1),$t->[1],$t->[2],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + $t->[21] = recalc_breakpoints($mate_sense,$maxCoord2,$t->[15],substr($ends_sense_class,1,1),substr($ends_order_class,1,1),$t->[4],$t->[5],$maxFragmentLength,$diff_sense_ends ) if ($order_filtering); + #recalc total ratio + $t->[22] = $t->[6] / $t->[23] if ($order_filtering); + $t->[18] = $t->[6] / $t->[19] unless ($order_filtering); + + @positions1 = deleteBadOrderSensePairs(split (/,/,$t->[14])); + @positions2 = deleteBadOrderSensePairs(split (/,/,$t->[15])); + + $meanDistance = 0; + + for my $i (0..scalar(@positions1)-1) { + $meanDistance += $positions2[$i]-$positions1[$i]; + } + $meanDistance /= scalar(@positions1); + + } else { + $t->[17] = recalculate_ratio((split(/\//,$t->[17]))[0],$t->[17]) unless ($order_filtering); + $t->[19] = recalculate_ratio((split(/\//,$t->[19]))[0],$t->[19]) if ($order_filtering); + + } #nothing has been filtered + return $meanDistance; +} + +sub recalculate_ratio { + my ($left, $ratio) = @_; + my @elements = split (/\//,$ratio); + $elements[1]= $elements[0]; + $elements[0]=$left; + return $ratio."\t".join("/",@elements); +} + +sub recalc_breakpoints { + my ($mate_sense,$maxCoord,$startString,$strand,$firstEndOrder,$coord_start_chr,$coord_end_chr,$maxFragmentLength,$diff_sense_ends ) = @_; + my $break_pont_chr; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + + unless ($diff_sense_ends) { + $break_pont_chr = (($strand eq 'R' && $firstEndOrder == 2) || ($strand eq 'F' && $firstEndOrder == 1))?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } else { + $break_pont_chr = ($strand eq $leftLetterOk)?'('.$coord_end_chr.','.min(($coord_start_chr+$maxFragmentLength),$maxCoord).')':'('.max(($coord_end_chr-$maxFragmentLength),1).','.$coord_start_chr.')'; + } + return $break_pont_chr; +} +sub recalculate_boundaries { + my ($startString,$senseString,$endsOrderString,$tag_length) = @_; + my @positions = deleteBadOrderSensePairs(split (/,/,$startString)); + my @strands = deleteBadOrderSensePairs(split (/,/,$senseString)); + my @ends_orders = deleteBadOrderSensePairs(split (/,/,$endsOrderString)); + my @ends; getEnds(\@ends,\@positions,\@strands,\@ends_orders,$tag_length); + my $coord_start_cluster = min(min(@positions),min(@ends)); + my $coord_end_cluster = max(max(@positions),max(@ends)); + return ($coord_start_cluster,$coord_end_cluster); +} + +sub remove_indexes { + my ($bads, $string) = @_; + my @elements = deleteBadOrderSensePairs(split (/,/,$string)); + for my $i (reverse sort %{$bads}) { + delete $elements[$i]; + } + return "(".join(",",@elements).")"; +} +##add @ to to elements +sub mark_values { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i (@{$bads}) { + $elements[$i] .= "@"; + } + return "(".join(",",@elements).")"; +} +##add @ to to indexes +sub mark_indexes { + my ($bads, $string) = @_; + my @elements = getAllEntries($string); + for my $i ((0..scalar(@elements)-1)) { + for my $j (@{$bads}) { + $elements[$i] .= "@" if ($elements[$i] eq ($j+1)); + } + } + + return "(".join(",",@elements).")"; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub redraw { + + my ($type,$table,$secondTable,$badInFRSense,$ifBalanced,$arr) = @_; + + my $out; + my @first_arr; + if ($ifBalanced eq 'BAL') { + my @second_arr; + my $lastPushed = 1; + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i})) { + push(@second_arr,$arr->[$i]); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i})) { + push(@second_arr,$arr->[$i]."\$"); + }else { + push(@second_arr,$arr->[$i]."*"); + } + } else {print "Error!";exit;} + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + $lastPushed = 1; + }elsif (exists ($secondTable->{$i-1})) { + push(@second_arr,$i); + $lastPushed = 2; + } elsif ($lastPushed == 1) { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } elsif ($lastPushed == 2) { + if (exists ($badInFRSense->{$i-1})) { + push(@second_arr,$i."\$"); + }else { + push(@second_arr,$i."*"); + } + } else {print "Error!";exit;} + } + } + $out = '('.join(",",@first_arr).'),('.join(",",@second_arr).')'; + } + else { + if ($type == 1) { + for my $i (0 .. scalar(@{$arr})-1) { + if (exists ($table->{$i})) { + push(@first_arr,$arr->[$i]); + } else { + if (exists ($badInFRSense->{$i})) { + push(@first_arr,$arr->[$i]."\$"); + }else { + push(@first_arr,$arr->[$i]."*"); + } + } + } + } else { + for my $i (@{$arr}) { + if (exists ($table->{$i-1})) { + push(@first_arr,$i); + } else { + if (exists ($badInFRSense->{$i-1})) { + push(@first_arr,$i."\$"); + }else { + push(@first_arr,$i."*"); + } + } + } + } + $out = '('.join(",",@first_arr).')'; + } + return $out; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub check { + + my $table = $_[0]; + my $bad = 'OK'; + my $max = 0; + for my $i (sort {$a<=>$b} keys %{$table}) { + unless ($table->{$i}->{nonAdeq} == 0) { + if ($max<$table->{$i}->{nonAdeq}) { + $max=$table->{$i}->{nonAdeq}; + $bad = $i; + } + } + } + return $bad; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub reversed { + + my ($i,$j,$ifRenv,$positions) = @_; + if (($ifRenv eq 'REVERSE_SENSE' && $positions->[$i]<$positions->[$j]) || ($ifRenv ne 'REVERSE_SENSE' && $positions->[$i]>$positions->[$j])){ + return 1; + } + return 0; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub remove { + + my ($bad,$table) = @_; + for my $i (sort {$a<=>$b} keys %{$table}) { + if ($bad == $i) { + delete($table->{$i});; + } else { + if (exists($table->{$i}->{$bad})) { + delete($table->{$i}->{$bad}); + $table->{$i}->{nonAdeq}--; + } + } + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInRFSenseSOLiDSolexa { #choose maximum: FFFFs or RRRRs + + my ($strand,$ends_order,$mate_sense,@keysLeft) = @_; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray); + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray = translateSolidToRF($strand,$ends_order); + } else { + @standardArray = @{$strand}; + } + + my $ifR = 0; + my @Rs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $leftLetterOk) { + $ifR++; + push(@Rs,$i); + } + } + + + my $ifF = 0; + my @Fs; + + for my $i (@keysLeft) { + if ($standardArray[$i] eq $rightLetterOk) { + $ifF++; + push(@Fs,$i); + } + } + + if($ifR>=$ifF) { + return @Fs; + } + return @Rs; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub findBadInFRSenseSOLiDSolexa { #should work both for SOLiD and Solexa + + my ($strand1,$strand2,$ends_order1,$ends_order2,$order1,$order2) = ($_[0],$_[1],$_[2],$_[3],$_[4],$_[5]); + my $mate_sense = $_[6]; + + my $leftLetterOk = substr($mate_sense, 0, 1); #R + my $rightLetterOk = substr($mate_sense, 1, 1); #F + + my (@standardArray1,@standardArray2); + + if ($leftLetterOk eq $rightLetterOk) { #SOLID mate-pairs + $leftLetterOk = 'R'; + $rightLetterOk = 'F'; + @standardArray1 = translateSolidToRF($strand1,$ends_order1); + my @arr = getOrderedStrands($strand2,$order2); + my @ends2 = getOrderedStrands($ends_order2,$order2); + @standardArray2 = translateSolidToRF(\@arr,\@ends2); + + } else { + @standardArray1 = @{$strand1}; + @standardArray2 = getOrderedStrands($strand2,$order2); + } + + #we will try 4 possibilities, 2 for each end of the link: RFRR-FFF->RFFFF , RFRR-FFF->RRRFFF + + #for the first end: + + my @array = @standardArray1; + my %badInFRSense1; + for my $i (1..scalar (@array)-1){ # FRFRFFFF -> FFFFFF and RRFRFRFFFF -> RRFFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense1{$i}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_1 = scalar(@array)-scalar(keys %badInFRSense1); + @array = @standardArray1; + my %badInFRSense0; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFFRR -> FFFFFFRR + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense0{$i-1}=1; + $array[$i-1] = $leftLetterOk; + + } + } + my $numberRRF1 = scalar(@array)-scalar(keys %badInFRSense0); + + #for the second end: + @array = @standardArray2; + + my %badInFRSense3; + for my $i (1..scalar(@array)-1){ + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense3{$order2->[$i]}=1; + $array[$i] = $rightLetterOk; + } + } + my $numberRRRFFF_or_FFF_2 = scalar(@array)-scalar(keys %badInFRSense3); + + @array = @standardArray2; + my %badInFRSense5; + for my $i (reverse(1..scalar (@array)-1)){ # FRFRFFFF -> FFFFFF + if ($array[$i-1] eq $rightLetterOk && $array[$i] eq $leftLetterOk) { + $badInFRSense5{$i-1}=1; + $array[$i-1] = $leftLetterOk; + } + } + my $numberRRF2 = scalar(@array)-scalar(keys %badInFRSense5); + + if ($numberRRF1>=$numberRRRFFF_or_FFF_1 && $numberRRF1 >= $numberRRRFFF_or_FFF_2 && $numberRRF1 >=$numberRRF2) { + return (1,%badInFRSense0); + } + + if ($numberRRRFFF_or_FFF_1 >=$numberRRF1 && $numberRRRFFF_or_FFF_1 >= $numberRRRFFF_or_FFF_2 && $numberRRRFFF_or_FFF_1 >= $numberRRF2) { + return (1,%badInFRSense1); + } + + if ($numberRRRFFF_or_FFF_2 >= $numberRRF1 && $numberRRRFFF_or_FFF_2 >= $numberRRRFFF_or_FFF_1 && $numberRRRFFF_or_FFF_2 >=$numberRRF2) { + return (2,%badInFRSense3); + } + + if ($numberRRF2 >= $numberRRF1 && $numberRRF2 >= $numberRRRFFF_or_FFF_1 && $numberRRF2 >= $numberRRRFFF_or_FFF_2 ) { + return (2,%badInFRSense5); + } + + #should not get here: + print STDERR "Error in findBadInFRSenseSOLiDSolexa()!\n"; + return (1,%badInFRSense1); +} + +sub getOrderedStrands { + my ($strand,$order) = ($_[0],$_[1]); + my @arr; + for my $i (0..scalar(@{$strand})-1) { + push(@arr,$strand->[$order->[$i]-1]); + } + return @arr; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub checkClusters { + + my ($ifRenv,$coord_start_chr1_cluster1,$coord_start_chr1_cluster2,$coord_start_chr2_cluster1,$coord_start_chr2_cluster2) = @_; + if ($ifRenv eq 'REVERSE_SENSE') { + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + #if NORM + if ($coord_start_chr1_cluster1 <= $coord_start_chr1_cluster2) { + return ($coord_start_chr2_cluster1 >= $coord_start_chr2_cluster2)?1:0; + } + return ($coord_start_chr2_cluster1 <= $coord_start_chr2_cluster2)?1:0; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub translateSolidToRF { + my ($strandArr,$ends_orderArr)=@_; + my @array; + for my $i (0..scalar(@{$strandArr})-1) { + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'F') { + push(@array,'F'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'F') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==1 && $strandArr->[$i] eq 'R') { + push(@array,'R'); + } + if ($ends_orderArr->[$i]==2 && $strandArr->[$i] eq 'R') { + push(@array,'F'); + } + } + return @array; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the circos format +sub links2segdup{ + + my($id,$color_code,$links_file,$segdup_file)=@_; + + print LOG "# Converting to the circos format...\n"; + + tie (my %hcolor,'Tie::IxHash'); #color-code hash table + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + } + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + my $index=1; + while(<LINKS>){ + + my ($chr1,$start1,$end1,$chr2,$start2,$end2,$count)=(split)[0,1,2,3,4,5,6]; + + my $color=getColor($count,\%hcolor,"circos"); #get the color-code according the number of links + + print SEGDUP "$index\t$id$chr1\t$start1\t$end1\tcolor=$color\n". #circos output + "$index\t$id$chr2\t$start2\t$end2\tcolor=$color\n"; + $index++; + } + + close LINKS; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bedPE format for BEDTools usage +sub links2bedPElinksfile{ + + my ($sample,$links_file,$bedpe_file)=@_; + + open LINKS, "<$links_file" or die "$0: can't open $links_file :$!\n"; + open BEDPE, ">$bedpe_file" or die "$0: can't write in the output: $bedpe_file :$!\n"; + + my $nb_links=1; + + while(<LINKS>){ + + chomp; + my @t=split("\t",$_); + my ($chr1,$start1,$end1,$chr2,$start2,$end2)=splice(@t,0,6); + my $type=($chr1 eq $chr2)? "INTRA":"INTER"; + $type.="_".$t[10]; + + $start1--; $start2--; + + print BEDPE "$chr1\t$start1\t$end1\t$chr2\t$start2\t$end2". + "\t$sample"."_link$nb_links\t$type\t.\t.". + "\t".join("|",@t)."\n"; + + $nb_links++; + } + + close LINKS; + close BEDPE; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub bedPElinks2linksfile{ + + my ($bedpe_file,$links_file)=@_; + + open BEDPE, "<$bedpe_file" or die "$0: can't open: $bedpe_file :$!\n"; + open LINKS, ">$links_file" or die "$0: can't write in the output $links_file :$!\n"; + + while(<BEDPE>){ + + chomp; + my $sample=(split("_",(split("\t",$_))[6]))[0]; + my @t1=(split("\t",$_))[0,1,2,3,4,5]; + my @t2=split(/\|/,(split("\t",$_))[10]); + push(@t2,$sample); + + print LINKS join("\t",@t1)."\t".join("\t",@t2)."\n"; + + } + close BEDPE; + close LINKS; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#convert the links file to the bed format +sub links2bedfile{ + + my ($tag_length,$color_code,$links_file,$bed_file)=@_; + + print LOG "# Converting to the bed format...\n"; + + my $compare=1; + if($links_file!~/compared$/){ + $compare=0; + $tag_length->{none}->{1}=$tag_length->{1}; + $tag_length->{none}->{2}=$tag_length->{2}; + } + + #color-code hash table + tie (my %hcolor,'Tie::IxHash'); + my %color_order; + my $n=1; + foreach my $col (keys %{$color_code}){ + my ($min_links,$max_links)=split(",",$color_code->{$col}); + $hcolor{$col}=[$min_links,$max_links]; + $color_order{$col}=$n; + $n++; + } + + my %pair; + my %pt; + $n=1; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + + my %str=( "F"=>"+", "R"=>"-" ); + + while(<LINKS>){ + + my @t=split; + my $sample=($compare)? pop(@t):"none"; + + my $chr1=$t[0]; + my $chr2=$t[3]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $same_chr=($chr1 eq $chr2)? 1:0; + + my $count=$t[6]; + my $color=getColor($count,\%hcolor,"bed"); + + my @pairs=deleteBadOrderSensePairs(split(",",$t[7])); + my @strand1=deleteBadOrderSensePairs(split(",",$t[8])); + my @strand2=deleteBadOrderSensePairs(split(",",$t[9])); + my @ends_order1=deleteBadOrderSensePairs(split(",",$t[10])); + my @ends_order2=deleteBadOrderSensePairs(split(",",$t[11])); + my @position1=deleteBadOrderSensePairs(split(",",$t[14])); + my @position2=deleteBadOrderSensePairs(split(",",$t[15])); + my @start1; my @end1; getCoordswithLeftMost(\@start1,\@end1,\@position1,\@strand1,\@ends_order1,$tag_length->{$sample}); + my @start2; my @end2; getCoordswithLeftMost(\@start2,\@end2,\@position2,\@strand2,\@ends_order2,$tag_length->{$sample}); + + + for my $p (0..$#pairs){ + + if (!exists $pair{$pairs[$p]}){ + + if($same_chr){ + + $pair{$pairs[$p]}->{0}=[ $chr1, $start1[$p]-1, $end2[$p], $pairs[$p], 0, $str{$strand1[$p]}, + $start1[$p]-1, $end2[$p], $color, + 2, $tag_length->{$sample}->{$ends_order1[$p]}.",".$tag_length->{$sample}->{$ends_order2[$p]}, "0,".($start2[$p]-$start1[$p]) ]; + $pt{$n}=$pair{$pairs[$p]}->{0}; + $n++; + + }else{ + + $pair{$pairs[$p]}->{1}=[ $chr1, $start1[$p]-1, $end1[$p] , $pairs[$p]."/1", 0, $str{$strand1[$p]}, + $start1[$p]-1, $end1[$p], $color, + 1, $tag_length->{$sample}->{$ends_order1[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{1}; + $n++; + + + $pair{$pairs[$p]}->{2}=[ $chr2, $start2[$p]-1, $end2[$p], $pairs[$p]."/2", 0, $str{$strand2[$p]}, + $start2[$p]-1, $end2[$p], $color, + 1, $tag_length->{$sample}->{$ends_order2[$p]}, 0]; + $pt{$n}=$pair{$pairs[$p]}->{2}; + $n++; + } + }else{ + + if($same_chr){ + ${$pair{$pairs[$p]}->{0}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{0}}[8]}); + }else{ + ${$pair{$pairs[$p]}->{1}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{1}}[8]}); + ${$pair{$pairs[$p]}->{2}}[8]=$color if($color_order{$color}>$color_order{${$pair{$pairs[$p]}->{2}}[8]}); + } + } + } + } + close LINKS; + + my $nb_pairs=$n-1; + + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track name=\"$bed_file\" description=\"mate pairs involved in links\" ". + "visibility=2 itemRgb=\"On\"\n"; + + for my $i (1..$nb_pairs){ + print BED join("\t",@{$pt{$i}})."\n"; + } + + close BED; + + print LOG "-- output created: $bed_file\n"; + + undef %pair; + undef %pt; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub deleteBadOrderSensePairs{ + + my (@tab)=@_; + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v) if($v!~/[\$\*\@]$/); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntries{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)]//g; + push(@tab2,$v); + } + return @tab2; +}#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getAllEntriesWOspecialChar{ + my (@tab)=split (/,/,$_[0]); + my @tab2; + + foreach my $v (@tab){ + + $v=~s/[\(\)\$\*\@]//g; + push(@tab2,$v); + } + return @tab2; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub links2SVfile{ + + my($links_file,$sv_file)=@_; + + print LOG "# Converting to the sv output table...\n"; + open LINKS, "<$links_file" or die "$0: can't open $links_file:$!\n"; + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + my @header=qw(chr_type SV_type BAL_type chromosome1 start1-end1 average_dist + chromosome2 start2-end2 nb_pairs score_strand_filtering score_order_filtering score_insert_size_filtering + final_score breakpoint1_start1-end1 breakpoint2_start2-end2); + + my $nb_links=0; + + while (<LINKS>){ + + my @t=split; + my @sv=(); + my $sv_type="-"; + my $strand_ratio="-"; + my $eq_ratio="-"; + my $eq_type="-"; + my $insert_ratio="-"; + my $link="-"; + my ($bk1, $bk2)=("-","-"); + my $score="-"; + + my ($chr1,$start1,$end1)=($t[0],$t[1],$t[2]); + my ($chr2,$start2,$end2)=($t[3],$t[4],$t[5]); + my $nb_pairs=$t[6]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/i); + $chr2 = "chr".$chr2 unless ($chr2 =~ m/chr/i); + my $chr_type=($chr1 eq $chr2)? "INTRA":"INTER"; + + #if strand filtering + if (defined $t[16]){ + #if inter-chr link + $sv_type=$t[16]; + if(defined $t[17] && $t[17]=~/^(\d+)\/(\d+)$/){ + $strand_ratio=floor($1/$2*100)."%"; + $score=$t[18]; + } + if(defined $t[18] && $t[18]=~/^(\d+)\/(\d+)$/){ + #if intra-chr link with insert size filtering + $strand_ratio=floor($1/$2*100)."%"; + $link=floor($t[17]); + if($sv_type!~/^INV/){ + $insert_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + $score=$t[20]; + }else{ + $score=$t[19]; + } + } + } + + if(defined $t[18] && ($t[18] eq "UNBAL" || $t[18] eq "BAL")){ + + #if strand and order filtering only and/or interchr link + $eq_type=$t[18]; + $eq_ratio=floor($1/$2*100)."%" if($t[19]=~/^(\d+)\/(\d+)$/); + ($bk1, $bk2)=($t[20],$t[21]); + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + $score=$t[22]; + + }elsif(defined $t[19] && ($t[19] eq "UNBAL" || $t[19] eq "BAL")){ + + #if all three filtering + $link=floor($t[17]); + $eq_type=$t[19]; + $eq_ratio=floor($1/$2*100)."%" if($t[20]=~/^(\d+)\/(\d+)$/); + + if(defined $t[21] && $t[21]=~/^(\d+)\/(\d+)$/){ + $insert_ratio=floor($1/$2*100)."%"; + ($bk1, $bk2)=($t[22],$t[23]); + $score=$t[24]; + + }else{ + ($bk1, $bk2)=($t[21],$t[22]); + $score=$t[23]; + } + foreach my $bk ($bk1, $bk2){$bk=~s/\),\(/ /g; $bk=~s/(\(|\))//g; $bk=~s/,/-/g;} + + } + + + push(@sv, $chr_type, $sv_type,$eq_type); + push(@sv,"$chr1\t$start1-$end1"); + push(@sv, $link); + push(@sv,"$chr2\t$start2-$end2", + $nb_pairs,$strand_ratio,$eq_ratio,$insert_ratio, decimal($score,4), $bk1, $bk2); + + + print SV join("\t",@sv)."\n"; + } + + close LINKS; + close SV; + + system "sort -k 9,9nr -k 13,13nr $sv_file > $sv_file.sorted"; + + open SV, "<".$sv_file.".sorted" or die "$0: can't open in the output: $sv_file".".sorted :$!\n"; + my @links=<SV>; + close SV; + + open SV, ">$sv_file" or die "$0: can't write in the output: $sv_file :$!\n"; + + print SV join("\t",@header)."\n"; + print SV @links; + close SV; + + unlink($sv_file.".sorted"); + + print LOG "-- output created: $sv_file\n"; + +} +#------------------------------------------------------------------------------# +sub densityCalculation{ + + my ($chr,$chrID,$file,$tag_length,$window_dist,$step,$mates_file,$mates_file_ref,$density_file,$input_format)=@_; + + my @sfile=split(/\./,$$mates_file[$file]); + my $fchr=$sfile[$#sfile]; + + my $fh = new FileHandle; + + my %density; + my %density_ref; + my @ratio; + my ($cov,$cov_ref); + + #FREQUENCY CALCULATION PROCEDURE + print LOG "# $fchr : Frequency calculation procedure...\n"; + &FreqCalculation(\%density,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file[$file],$input_format); + &FreqCalculation(\%density_ref,$chr,$chrID,$tag_length,$window_dist,$step,$$mates_file_ref[$file],$input_format); + + #MAKING RATIO AND OUTPUT + print LOG "\# Ratio calculation procedure...\n"; + $density_file=~s/\/mates\//\/density\//; + $fh->open(">".$density_file) or die "$0: can't write in the output ".$density_file." :$!\n"; + + foreach my $k (1..$chr->{nb_chrs}){ + foreach my $frag (1..$chr->{$k}->{nb_frag}){ + + @ratio= ($chr->{$k}->{name}, + (${$chr->{$k}->{$frag}}[0]+1), + (${$chr->{$k}->{$frag}}[1]+1)); + + $cov=(exists $density{$k}{$frag}->{count})? $density{$k}{$frag}->{count}:0; + $cov_ref=(exists $density_ref{$k}{$frag}->{count})? $density_ref{$k}{$frag}->{count}:0; + + push(@ratio,$cov,$cov_ref); + push(@ratio,log($cov/$cov_ref)) if($cov && $cov_ref); + push(@ratio,-log($cov_ref+1)) if(!$cov && $cov_ref); + push(@ratio,log($cov+1)) if($cov && !$cov_ref); + next if(!$cov && !$cov_ref); + + print $fh join("\t",@ratio)."\n"; + } + } + + $fh->close; + print LOG "-- output created: $density_file\n"; + + undef %density; + undef %density_ref; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub FreqCalculation{ + + my ($density,$chr,$chrID,$tag_length,$window_dist,$step,$mates_file,$input_format) = @_; + + my @sfile=split(/\./,$mates_file); + my $fchr=$sfile[$#sfile]; + my $fh = new FileHandle; + + my $nb_windows=0; + my $warn=100000; + my $record=0; + my %pair; + + my ($sumX,$sumX2) = (0,0); + + print LOG "\# Frequency calculation for $mates_file...\n"; + + if ($mates_file =~ /.gz$/) { + $fh->open("gunzip -c $mates_file |") or die "$0: can't open ".$mates_file.":$!\n"; + }elsif($mates_file =~ /.bam$/){ + o$fh->open("$SAMTOOLS_BIN_DIR/samtools view $mates_file |") or die "$0: can't open ".$mates_file.":$!\n";#GALAXY + }else{ + $fh->open("<".$mates_file) or die "$0: can't open ".$mates_file.":$!\n"; + } + + while(<$fh>){ + + my @t=split; + my $mate=$t[0]; + + my ($chr_read1, $chr_read2, $firstbase_read1, $firstbase_read2, $end_order_read1, $end_order_read2,); + + next if(exists $pair{$mate}); + + next if (!readMateFile(\$chr_read1, \$chr_read2, \$firstbase_read1, \$firstbase_read2,\$end_order_read1, \$end_order_read2, \@t, $input_format,$tag_length)); + + next unless (exists $chrID->{$chr_read1} || exists $chrID->{$chr_read2}); + ($chr_read1, $chr_read2)= ($chrID->{$chr_read1},$chrID->{$chr_read2}); + + $pair{$mate}=undef; + $record++; + + my ($coord_start_read1,$coord_end_read1, $coord_start_read2,$coord_end_read2); + + recupCoords($firstbase_read1,\$coord_start_read1,\$coord_end_read1,$tag_length->{$end_order_read1},$input_format); + recupCoords($firstbase_read2,\$coord_start_read2,\$coord_end_read2,$tag_length->{$end_order_read2},$input_format); + + my $length = abs($coord_start_read1-$coord_start_read2); + $sumX += $length; #add to sum and sum^2 for mean and variance calculation + $sumX2 += $length*$length; + + for(my $i=1;$i<=$chr->{$chr_read1}->{'nb_frag'};$i++){ + + if (abs ($coord_start_read1-${$chr->{$chr_read1}->{$i}}[0]) <= $window_dist){ + + &addToDensity($density,$chr_read1,$i,\$nb_windows) + if(overlap($coord_start_read1,$coord_end_read2,${$chr->{$chr_read1}->{$i}}[0],${$chr->{$chr_read1}->{$i}}[1])); + + }else{ + + $i=getNextFrag($coord_start_read1,$i,${$chr->{$chr_read1}->{$i}}[0],$chr->{$chr_read1}->{nb_frag},$window_dist,$step); + } + } + + if($record>=$warn){ + print LOG "-- $warn mate-pairs analysed - $nb_windows points created\n"; + $warn+=100000; + } + } + $fh->close; + + print LOG "-- $fchr : Total : $record mate-pairs analysed - $nb_windows points created\n"; + + if($record>0){ + + my $mu = $sumX/$record; + my $sigma = sqrt($sumX2/$record - $mu*$mu); + print LOG "-- $fchr : mu length = $mu, sigma length = $sigma\n"; + } + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2segdup{ + + my($id,$density_file,$segdup_file)=@_; + + print LOG "# Converting to circos format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open SEGDUP, ">$segdup_file" or die "$0: can't write in the output: $segdup_file :$!\n"; + + while(<RATIO>){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + print SEGDUP "$id$chr1\t$start1\t$end1\t$ratio\n"; + } + + close RATIO; + close SEGDUP; + print LOG "-- output created: $segdup_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub ratio2bedfile{ + + my($density_file,$bed_file)=@_; + + print LOG "# Converting to bedGraph format...\n"; + + open RATIO, "<$density_file" or die "$0: can't open $density_file :$!\n"; + open BED, ">$bed_file" or die "$0: can't write in the output: $bed_file :$!\n"; + print BED "track type=bedGraph name=\"$bed_file\" description=\"log ratios for cnv detection\" ". + "visibility=2 color=255,0,0 alwaysZero=\"On\"\n"; + + while(<RATIO>){ + chomp; + my ($chr1,$start1,$end1,$ratio)=(split /\t/)[0,1,2,5]; + $chr1 = "chr".$chr1 unless ($chr1 =~ m/chr/); + print BED "$chr1\t".($start1-1)."\t$end1\t$ratio\n"; + } + + close RATIO; + close BED; + print LOG "-- output created: $bed_file\n"; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub inverseSense{ + + my $mate_sense=$_[0]; + my %reverse=( 'F' => 'R' , 'R' => 'F' , + 'FF' => 'RR', 'RR' => 'FF', + 'FR' => 'RF', 'RF' => 'FR'); + return $reverse{$mate_sense}; +} + +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getNextFrag{ + + my ($read_start,$frag_num,$frag_start,$frag_last,$window_dist,$step)=@_; + + my $how_far = $read_start-$frag_start; + my $nb_windows_toskip; + + if($how_far>0){ + + $nb_windows_toskip=($how_far/$step)-($window_dist/$step); + $nb_windows_toskip=~ s/\..*//; + $nb_windows_toskip=0 if($nb_windows_toskip<0); + return ($frag_num + $nb_windows_toskip); + } + return $frag_last; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getColor{ + + my($count,$hcolor,$format)=@_; + for my $col ( keys % { $hcolor} ) { + return $col if($count>=$hcolor->{$col}->[0] && $count<=$hcolor->{$col}->[1]); + } + return "white" if($format eq "circos"); + return "255,255,255" if($format eq "bed"); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub recupCoords{ + + my($c_hit,$cs_hit,$ce_hit,$tag_length,$input_format)=@_; + my $strand = 'F'; + + if ($c_hit=~s/^\-//) { + $strand='R'; + $$cs_hit=$c_hit; + $$ce_hit=$c_hit-($tag_length-1); + }else{ + $$cs_hit=$c_hit; + $$ce_hit=$c_hit+($tag_length-1); + } + return $strand; + +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub overlap { + my($cs_hit,$ce_hit,$cs_region,$ce_region)=@_; + if( (($cs_hit < $cs_region) && ($ce_hit < $cs_region )) || (($cs_hit > $ce_region) && ($ce_hit > $ce_region )) ) { + return 0; + } + return 1; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub makeLink { + + my ($link,$chr1,$frag1,$chr2,$frag2,$mt,$nb)=@_; + + if($chr1>$chr2){ + ($chr1,$chr2)= ($chr2,$chr1); + ($frag1,$frag2)= ($frag2,$frag1); + } + + if($chr1 == $chr2){ + if($frag1>$frag2){ + ($frag1,$frag2)= ($frag2,$frag1); + } + } + + if(!exists $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}=$mt; + $$nb++; + }elsif($link->{$chr1}->{$chr2}->{$frag1}->{$frag2}!~/(^|,)$mt(,|$)/){ + $link->{$chr1}->{$chr2}->{$frag1}->{$frag2}.=",$mt"; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#fonction of adding the read to the density profile +sub addToDensity { + + my ($density,$chr1,$frag1,$nb)=@_; + + if(!exists $density->{$chr1}->{$frag1}->{count}){ + $density->{$chr1}->{$frag1}->{count}=1; + $$nb++; + }else{ + $density->{$chr1}->{$frag1}->{count}++; + } +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub floor { + my $nb = $_[0]; + $nb=~ s/\..*//; + return $nb; +} +#------------------------------------------------------------------------------# +sub decimal{ + + my $num=shift; + my $digs_to_cut=shift; + + $num=sprintf("%.".($digs_to_cut-1)."f", $num) if ($num=~/\d+\.(\d){$digs_to_cut,}/); + + return $num; +} + +#------------------------------------------------------------------------------# +sub max { + + my($max) = shift(@_); + foreach my $temp (@_) { + $max = $temp if $temp > $max; + } + return($max); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub min { + + my($min) = shift(@_); + foreach my $temp (@_) { + $min = $temp if $temp < $min; + } + return($min); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub sortTablebyIndex{ + my ($tab1,$tab2)=@_; + my @tab3; + + foreach my $i (@$tab1){ + $tab3[$i]=$$tab2[$$tab1[$i]]; + } + return @tab3; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub round { + my $number = shift || 0; + my $dec = 10 ** (shift || 0); + return int( $dec * $number + .5 * ($number <=> 0)) / $dec; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub getUniqueTable{ + + my (@tab)=@_; + my (%saw,@out)=(); + undef %saw; + return sort(grep(!$saw{$_}++, @tab)); +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +sub catFiles { + + unlink("$_[1]") if(exists $_[1]); + system qq( cat "$_" >> "$_[1]" ) for @{$_[0]}; +} +#------------------------------------------------------------------------------# +#------------------------------------------------------------------------------# +#check if the configuration file is correct +sub validateconfiguration{ + + my %conf=%{$_[0]}; + my $list_prgs="@ARGV"; + + my @general_params=qw(input_format mates_orientation read1_length read2_length mates_file cmap_file); + my @detection_params=qw(split_mate_file window_size step_length split_mate_file); + my @filtering_params=qw(split_link_file nb_pairs_threshold strand_filtering split_link_file); + my @circos_params=qw(organism_id colorcode); + my @bed_params=qw(colorcode); + my @compare_params=qw(list_samples file_suffix); + + foreach my $dir ($conf{general}{output_dir},$conf{general}{tmp_dir}){ + + unless (defined($dir)) { + $dir = "."; + } + unless (-d $dir){ + mkdir $dir or die; + } + $dir.="/" if($dir!~/\/$/); + } + + unless (defined($conf{general}{num_threads})) { + $conf{general}{num_threads} = 1; + } + $conf{general}{num_threads}=24 if($conf{general}{num_threads}>24); + + if($list_prgs!~/links2compare/){ + + foreach my $p (@general_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{general}{$p}); + } + + $conf{general}{input_format}="sam" if($conf{general}{input_format} eq "bam"); + + unless (defined($conf{general}{sv_type})) { + $conf{general}{sv_type} = "all"; + } + + $conf{general}{read_lengths}={ 1=> $conf{general}{read1_length}, 2=> $conf{general}{read2_length}}; + } + + if($list_prgs=~/(linking|cnv)/){ + + foreach my $p (@detection_params){ + die("Error Config : The parameter \"$p\" is not defined\n") if (!defined $conf{detection}{$p}); + } + + die("Error Config : The parameter \"mates_file_ref\" is not defined\n") if($list_prgs=~/cnv/ && !defined $conf{detection}{mates_file_ref}); + + if($conf{detection}{step_length}>$conf{detection}{window_size}){ + die("Error Config : Parameter \"step_length\" should not exceed \"window size\"\n"); + } + + unless (-d $conf{general}{tmp_dir}."/mates"){ + mkdir $conf{general}{tmp_dir}."/mates" or die; + } + + if($list_prgs=~/linking/){ + unless (-d $conf{general}{tmp_dir}."/links"){ + mkdir $conf{general}{tmp_dir}."/links" or die; + } + } + if($list_prgs=~/cnv/){ + unless (-d $conf{general}{tmp_dir}."/density"){ + mkdir $conf{general}{tmp_dir}."/density" or die; + } + } + + } + + if($list_prgs=~/filtering/){ + + foreach my $p (@filtering_params) { + die("Error Config : The filtering parameter \"$p\" is not defined\n") if (!defined $conf{filtering}{$p}); + + } + + if(defined($conf{filtering}{chromosomes})) { + my @chrs=split(",",$conf{filtering}{chromosomes}); + my $exclude=($chrs[0]=~/^\-/)? 1:0; + for my $chrName (@chrs){ + + die("Error Config : The filtering parameter \"chromosomes\" is not valid\n") + if(($chrName!~/^\-/ && $exclude) || ($chrName=~/^\-/ && !$exclude)); + + } + } + + if (( $conf{filtering}{order_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while order_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the order filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{order_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use order filtering\n"); + } + if (( $conf{filtering}{insert_size_filtering} )&& !$conf{filtering}{strand_filtering}) { + die("Error Config : The parameter strand_filtering is set to \"0\" while insert_size_filtering is selected". + "\nChange strand_filtering to \"1\" if you want to use the insert size filtering\n"); + } + if (( !defined($conf{filtering}{mu_length}) || !defined($conf{filtering}{sigma_length}) )&& $conf{filtering}{insert_size_filtering}) { + die("Error Config : You should set parameters \"mu_length\" and \"sigma_length\" to use discriminate insertions from deletions\n"); + } + + if (!defined($conf{filtering}{indel_sigma_threshold})) { + $conf{filtering}{indel_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{dup_sigma_threshold})) { + $conf{filtering}{dup_sigma_threshold} = 2; + } + if (!defined($conf{filtering}{singleton_sigma_threshold})) { + $conf{filtering}{singleton_sigma_threshold} = 4; + } + + if (!defined($conf{filtering}{nb_pairs_order_threshold})) { + $conf{filtering}{nb_pairs_order_threshold} = 1; + } + + if (!defined($conf{filtering}{final_score_threshold})) { + $conf{filtering}{final_score_threshold} = 0.8; + } + + if ($conf{filtering}{nb_pairs_order_threshold}>$conf{filtering}{nb_pairs_threshold}) { + die("Error Config : Parameter \"nb_pairs_order_threshold\" should not exceed \"nb_pairs_threshold\"\n"); + } + + } + + if($list_prgs=~/2circos$/){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + + if($list_prgs=~/2bed$/){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + } + + if($list_prgs=~/links2compare/){ + foreach my $p (@compare_params) { + die("Error Config : The compare parameter \"$p\" is not defined\n") if (!defined $conf{compare}{$p}); + } + + unless (defined($conf{compare}{same_sv_type})) { + $conf{compare}{same_sv_type} = 0; + } + + unless (defined($conf{compare}{min_overlap})) { + $conf{compare}{min_overlap} = 1E-9; + } + + if($conf{compare}{circos_output}){ + foreach my $p (@circos_params) { + next if($list_prgs=~/^ratio/ && $p eq "colorcode"); + die("Error Config : The circos parameter \"$p\" is not defined\n") if (!defined $conf{circos}{$p}); + } + } + if($conf{compare}{bed_output}){ + foreach my $p (@bed_params) { + die("Error Config : The bed parameter \"$p\" is not defined\n") if (!defined $conf{bed}{$p}); + } + die("Error Config : The compare parameter \"list_read_lengths\" is not defined\n") if (!defined $conf{compare}{list_read_lengths}); + + my @samples=split(",",$conf{compare}{list_samples}); + my @read_lengths=split(",",$conf{compare}{list_read_lengths}); + for my $i (0..$#samples){ + my @l=split("-",$read_lengths[$i]); + $conf{compare}{read_lengths}{$samples[$i]}={ 1=> $l[0], 2=> $l[1]}; + } + } + } + + +} +#------------------------------------------------------------------------------# +#::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::#