comparison bin/ppp.pm @ 1:1df6aaac800e draft

Deleted selected files
author brasset_jensen
date Wed, 13 Dec 2017 10:40:50 -0500
parents
children 4bc00caa60b4
comparison
equal deleted inserted replaced
0:e4e71401c577 1:1df6aaac800e
1 package ppp;
2
3 use strict;
4 use warnings;
5 use FindBin;
6 use lib $FindBin::Bin;
7 use Rcall qw ( histogram );
8 use Math::CDF;
9
10 use Exporter;
11 our @ISA = qw( Exporter );
12 our @EXPORT_OK = qw( &ping_pong_partners );
13
14 sub ping_pong_partners
15 {
16 my ( $TE_fai, $sam, $dir, $max ) = @_;
17 my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam );
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
19 my ( $SP, $AP, $SN, $AN, $txt );
20 my $flag = 0;
21 my @distri_overlap = (); my @overlaps_names = ();
22
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
24 foreach my $k ( sort keys %{$hashRef} )
25 {
26 my $v = $hashRef->{$k};
27 my $TE_dir = $dir.$k.'/';
28
29 %num_per_overlap_size = (); $overlap_number = 0;
30 $flag = 0;
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
32 {
33 $reverseR = ${$v->[1]}[$i] ;
34 $begRev = $reverseR->[0];
35 $endRev = $begRev + length($reverseR->[1]) - 1;
36
37 my $revR = reverse($reverseR->[1]);
38 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
39
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
41 {
42 $sensR = ${$v->[0]}[$j];
43 $begSens = $sensR->[0];
44 $endSens = $begSens + length($sensR->[1]) - 1;
45
46 if ( $begSens <= $endRev && $endSens > $endRev )
47 {
48 $flag = 1;
49 mkdir $TE_dir;
50 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
51
52 $overlap = $endRev - $begSens + 1;
53 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
54 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
55
56 if ( $overlap == 10 )
57 {
58 $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1;
59 $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1;
60 }
61 next if $overlap > $max;
62 if ( $snum < $rnum )
63 {
64 $num_per_overlap_size{$overlap} += $snum;
65 $overlap_number += $snum;
66 }
67 else
68 {
69 $num_per_overlap_size{$overlap} += $rnum ;
70 $overlap_number += $rnum ;
71 }
72 }
73 }
74 }
75 if ( $max != 0 )
76 {
77 my @overlaps = ();
78 push @overlaps_names, $k;
79 for my $i (1..$max)
80 {
81 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
82 push @overlaps, $num_per_overlap_size{$i};
83 }
84 push @distri_overlap, \@overlaps;
85 }
86
87 if ( $flag == 1 )
88 {
89 open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
90 open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
91 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
92 {
93 $reverseR = ${$v->[1]}[$i] ;
94 my $revR = reverse($reverseR->[1]);
95 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
96 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
97 if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 )
98 {
99 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
100 }
101 else
102 {
103 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
104 }
105 }
106 close $AP; close $AN;
107
108 open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
109 open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n";
110 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
111 {
112 $sensR = ${$v->[0]}[$j];
113 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
114 if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 )
115 {
116 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
117 }
118 else
119 {
120 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
121 }
122 }
123 close $SP; close $SN;
124
125 my $histo_png = $TE_dir.'histogram.png';
126 histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
127 print $txt "size\tnumber\tpercentage of the total overlap number\n";
128 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size )
129 {
130 my $percentage = 0;
131 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
132 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
133 }
134 close $txt;
135 }
136 }
137
138 foreach my $tabP ( @distri_overlap )
139 {
140 my $sum = sum($tabP);
141 my $ten = $tabP->[9];
142 my $mean = mean($tabP);
143 my $std = standard_deviation($tabP, $mean);
144 my $zsc = z_significance($ten, $mean, $std);
145 my $name = shift @overlaps_names;
146 my $prob = 'NA';
147 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
148 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
149 }
150 close $ppp_f;
151 }
152
153 sub count_mapped
154 {
155 my ( $fai, $in_file ) = @_;
156 my ( %mapped, %dup, %has_ppp );
157
158 open my $f, '<', $fai || die "cannot open $fai $! \n";
159 while(<$f>)
160 {
161 if ($_ =~ /(.*)\t(\d+)\n/)
162 {
163 $mapped{$1} = [];
164 $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
165 }
166 }
167 close $f;
168
169 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n";
170 while(<$infile>)
171 {
172 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
173 {
174 my @line = split (/\t/,$_);
175 if ($line[1] == 0)
176 {
177 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
178 {
179 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]];
180 $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0;
181 }
182 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
183 }
184 elsif ($line[1] == 16)
185 {
186 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
187 {
188 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]];
189 $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0;
190 }
191 $dup{$line[3].$line[9].$line[1].$line[2]}+=1
192 }
193 }
194 }
195 close $infile;
196 return (\%mapped, \%dup, \%has_ppp );
197 }
198
199 sub sum
200 {
201 my $arrayref = shift;
202 my $result = 0;
203 foreach (@$arrayref) {$result += $_}
204 return $result;
205 }
206
207 sub mean
208 {
209 my $arrayref = shift;
210 my $result;
211 foreach (@$arrayref) {$result += $_}
212 return $result / scalar(@$arrayref);
213 }
214
215 sub standard_deviation
216 {
217 my ($arrayref, $mean) = @_;
218 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
219 }
220
221 sub z_significance
222 {
223 my ($ten, $mean, $std) = @_;
224 my $z = 'NA';
225 $z = (($ten - $mean) / $std) if $std != 0;
226 return $z;
227 }
228
229 1;
230