Mercurial > repos > brasset_jensen > srnapipe
diff bin/ppp.pm @ 61:9185ca0a7b43 draft
Updated package according to recommendations.
author | pierre.pouchin |
---|---|
date | Wed, 16 Jan 2019 08:18:13 -0500 |
parents | 9645d995fb3c |
children |
line wrap: on
line diff
package ppp;

# Ping-pong partner analysis.
#
# For every reference sequence, sense and reverse reads taken from
# `samtools view` are compared pairwise. The size of each overlap is tallied,
# reads taking part in an overlap of exactly 10 are flagged, and a per-reference
# summary (sum, count at size 10, mean, standard deviation, z-score and
# probability) is written to "ppp.txt".

use strict;
use warnings;
use FindBin;
use lib $FindBin::Bin;
use Rcall qw ( histogram );
use Math::CDF;

use Exporter;
our @ISA = qw( Exporter );
our @EXPORT_OK = qw( &ping_pong_partners );

# Build the hash key that identifies a distinct read.
# A read is stored as [ position, sequence, flag, reference name ]; the key is
# those four fields concatenated without a separator. count_mapped() fills its
# hashes with this key and ping_pong_partners() looks them up with it.
sub _read_key
{
    my ($read) = @_;
    return join '', @{$read}[ 0 .. 3 ];
}

# Write the reads of one strand to two FASTA-like files: reads flagged as
# ping-pong partners go to $ppp_path, all the others to $other_path.
# When $revcomp is true the sequence is reversed and complemented first.
# Dies if a file cannot be created or fully written.
sub _write_reads
{
    my ( $ppp_path, $other_path, $reads, $dupRef, $hasPpp, $revcomp ) = @_;

    open my $ppp_fh,   '>', $ppp_path   or die "cannot create $ppp_path $!\n";
    open my $other_fh, '>', $other_path or die "cannot create $other_path $!\n";

    foreach my $read ( @{$reads} )
    {
        my $key   = _read_key($read);
        my $count = $dupRef->{$key};
        my $seq   = $read->[1];
        if ($revcomp)
        {
            $seq = scalar reverse $seq;
            $seq =~ tr/atgcuATGCU/tacgaTACGA/;
        }

        my $fh = $hasPpp->{$key} == 1 ? $ppp_fh : $other_fh;
        print {$fh} ">$read->[0]|$read->[2]|$read->[3]|$count\n$seq\n";
    }

    # Buffered write errors only surface when the handle is closed.
    close $ppp_fh   or die "cannot write $ppp_path $!\n";
    close $other_fh or die "cannot write $other_path $!\n";
    return;
}

# Detect overlapping sense/reverse read pairs for every reference sequence.
#
# Parameters:
#   $TE_fai - index file listing the reference names (see count_mapped)
#   $sam    - alignment file handed to `samtools view`
#   $dir    - output directory prefix; it is concatenated as-is, so it must
#             end with a path separator
#   $max    - largest overlap size that is tallied; 0 disables the summary
#
# Output:
#   ${dir}ppp.txt - one tab-separated line per reference (when $max != 0)
#   ${dir}NAME/   - for each reference with at least one overlapping pair:
#                   overlap_size.txt, histogram.png, sensPPP.txt, sens.txt,
#                   antisensPPP.txt and antisens.txt
#
# Returns a true value. Dies when an output file or directory cannot be
# created or written.
sub ping_pong_partners
{
    my ( $TE_fai, $sam, $dir, $max ) = @_;
    my ( $hashRef, $dupRef, $hasPpp ) = count_mapped( $TE_fai, $sam );
    my @distri_overlap = ();    # [ reference name, \@counts for sizes 1..$max ]

    open my $ppp_f, '>', $dir . 'ppp.txt' or die "cannot create ppp.txt $!\n";

    foreach my $k ( sort keys %{$hashRef} )
    {
        my $v      = $hashRef->{$k};
        my $TE_dir = $dir . $k . '/';

        # A reference seen only in the alignments may have a single strand
        # list; treat the missing one as empty.
        my $sens_reads = $v->[0] || [];
        my $rev_reads  = $v->[1] || [];

        my %num_per_overlap_size = ();
        my $overlap_number       = 0;
        my $flag                 = 0;    # set once any overlapping pair is found

        foreach my $reverseR ( @{$rev_reads} )
        {
            my $endRev  = $reverseR->[0] + length( $reverseR->[1] ) - 1;
            my $rev_key = _read_key($reverseR);
            my $rnum    = $dupRef->{$rev_key};

            foreach my $sensR ( @{$sens_reads} )
            {
                my $begSens = $sensR->[0];
                my $endSens = $begSens + length( $sensR->[1] ) - 1;
                next unless $begSens <= $endRev && $endSens > $endRev;

                $flag = 1;
                my $overlap  = $endRev - $begSens + 1;
                my $sens_key = _read_key($sensR);

                # Reads are flagged before the $max filter, so an overlap of
                # 10 is recorded even when $max is smaller than 10.
                if ( $overlap == 10 )
                {
                    $hasPpp->{$sens_key} = 1;
                    $hasPpp->{$rev_key}  = 1;
                }
                next if $overlap > $max;

                # A pair counts as many times as its rarer member occurs.
                my $snum  = $dupRef->{$sens_key};
                my $pairs = $snum < $rnum ? $snum : $rnum;
                $num_per_overlap_size{$overlap} += $pairs;
                $overlap_number += $pairs;
            }
        }

        if ( $max != 0 )
        {
            my @overlaps = ();
            for my $i ( 1 .. $max )
            {
                $num_per_overlap_size{$i} = 0 unless exists $num_per_overlap_size{$i};
                push @overlaps, $num_per_overlap_size{$i};
            }
            push @distri_overlap, [ $k, \@overlaps ];
        }

        next unless $flag == 1;

        # Create the per-reference directory and its report once, instead of
        # once per overlapping pair.
        -d $TE_dir or mkdir $TE_dir or die "cannot create directory $TE_dir $!\n";
        open my $txt, '>', $TE_dir . 'overlap_size.txt' or die "cannot open repartition $!\n";

        _write_reads( $TE_dir . 'antisensPPP.txt', $TE_dir . 'antisens.txt', $rev_reads, $dupRef, $hasPpp, 1 );
        _write_reads( $TE_dir . 'sensPPP.txt', $TE_dir . 'sens.txt', $sens_reads, $dupRef, $hasPpp, 0 );

        my $histo_png = $TE_dir . 'histogram.png';
        histogram( \%num_per_overlap_size, $histo_png, $overlap_number );

        print {$txt} "size\tnumber\tpercentage of the total overlap number\n";
        foreach my $size ( sort { $a <=> $b } keys %num_per_overlap_size )
        {
            my $percentage = 0;
            $percentage = $num_per_overlap_size{$size} * 100 / $overlap_number unless $overlap_number == 0;
            printf {$txt} "%s\t%s\t%.2f\n", $size, $num_per_overlap_size{$size}, $percentage;
        }
        close $txt or die "cannot write overlap_size.txt $!\n";
    }

    foreach my $entry (@distri_overlap)
    {
        my ( $name, $tabP ) = @{$entry};
        my $sum = sum($tabP);

        # Index 9 holds the count for overlap size 10; it does not exist when
        # $max is below 10, in which case no such pair was tallied.
        my $ten  = defined $tabP->[9] ? $tabP->[9] : 0;
        my $mean = mean($tabP);
        my $std  = standard_deviation( $tabP, $mean );
        my $zsc  = z_significance( $ten, $mean, $std );
        my $prob = 'NA';
        $prob = 1 - Math::CDF::pnorm($zsc) if $zsc ne 'NA';
        print {$ppp_f} join( "\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ), "\n";
    }
    close $ppp_f or die "cannot write ppp.txt $!\n";
    return 1;
}

# Read the reference names and collect the distinct reads mapped to them.
#
# Parameters:
#   $fai     - text file; every line matching "<name>\t<digits>\n" declares a
#              reference. NOTE(review): the greedy pattern captures everything
#              up to the LAST tab as the name, so this presumably expects a
#              two-column file rather than a five-column samtools .fai -
#              confirm against the caller.
#   $in_file - alignment file read through `samtools view`
#
# Returns three hash references:
#   \%mapped  - reference name => [ \@flag_0_reads, \@flag_16_reads ], each
#               read stored as [ position, sequence, flag, reference name ]
#   \%dup     - read key => number of identical records seen
#   \%has_ppp - read key => 0 (set to 1 later by ping_pong_partners)
#
# Only records whose flag is exactly 0 or 16 are kept.
sub count_mapped
{
    my ( $fai, $in_file ) = @_;
    my ( %mapped, %dup, %has_ppp );

    open my $f, '<', $fai or die "cannot open $fai $! \n";
    while ( my $entry = <$f> )
    {
        if ( $entry =~ /(.*)\t(\d+)\n/ )
        {
            $mapped{$1} = [ [], [] ];
        }
    }
    close $f;

    # List-form pipe open: the file name goes straight to samtools and is
    # never interpreted by a shell.
    open my $infile, '-|', 'samtools', 'view', $in_file or die "cannot open input file $! \n";
    while ( my $record = <$infile> )
    {
        # Skip header lines.
        next if $record =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $record =~ /^\@CO\t.*/;

        my @line = split /\t/, $record;
        next if @line < 10;    # too short to carry a sequence column

        my $strand;
        if ( $line[1] == 0 )
        {
            $strand = 0;
        }
        elsif ( $line[1] == 16 )
        {
            $strand = 1;
        }
        else
        {
            next;
        }

        my $read = [ $line[3], $line[9], $line[1], $line[2] ];
        my $key  = _read_key($read);
        unless ( exists $dup{$key} )
        {
            push @{ $mapped{ $line[2] }->[$strand] }, $read;
            $has_ppp{$key} = 0;
        }
        $dup{$key} += 1;
    }
    close $infile or warn "samtools view '$in_file' did not finish cleanly (status $?) $!\n";

    return ( \%mapped, \%dup, \%has_ppp );
}

# Sum of the numbers in an array reference; 0 for an empty list.
sub sum
{
    my $arrayref = shift;
    my $result   = 0;
    foreach my $value ( @{$arrayref} ) { $result += $value }
    return $result;
}

# Arithmetic mean of the numbers in an array reference.
# Returns 0 for an empty list instead of dividing by zero.
sub mean
{
    my $arrayref = shift;
    return 0 unless @{$arrayref};
    return sum($arrayref) / scalar @{$arrayref};
}

# Population standard deviation, computed as sqrt( mean(x^2) - mean^2 ).
# $mean must be the mean of the same list.
sub standard_deviation
{
    my ( $arrayref, $mean ) = @_;
    my $variance = mean( [ map { $_**2 } @{$arrayref} ] ) - ( $mean**2 );

    # Rounding can leave a tiny negative value, and sqrt of a negative number
    # is a fatal error in Perl.
    $variance = 0 if $variance < 0;
    return sqrt $variance;
}

# Z-score of $ten against $mean and $std, or the string 'NA' when $std is 0.
sub z_significance
{
    my ( $ten, $mean, $std ) = @_;
    my $z = 'NA';
    $z = ( ( $ten - $mean ) / $std ) if $std != 0;
    return $z;
}

1;