40
|
1 package ppp;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5 use FindBin;
|
|
6 use lib $FindBin::Bin;
|
|
7 use Rcall qw ( histogram );
|
|
8 use Math::CDF;
|
|
9
|
|
10 use Exporter;
|
|
11 our @ISA = qw( Exporter );
|
|
12 our @EXPORT_OK = qw( &ping_pong_partners );
|
|
13
|
|
14 sub ping_pong_partners
|
|
15 {
|
|
16 my ( $TE_fai, $sam, $dir, $max ) = @_;
|
|
17 my ( $hashRef, $dupRef, $hasPpp ) = count_mapped ( $TE_fai, $sam );
|
|
18 my ( %num_per_overlap_size, $overlap_number, $reverseR, $begRev, $endRev, $sensR, $begSens, $endSens, $snum, $rnum, $overlap );
|
|
19 my ( $SP, $AP, $SN, $AN, $txt );
|
|
20 my $flag = 0;
|
|
21 my @distri_overlap = (); my @overlaps_names = ();
|
|
22
|
|
23 open my $ppp_f, '>', $dir."ppp.txt" || die "cannot create ppp.txt $!\n";
|
|
24 foreach my $k ( sort keys %{$hashRef} )
|
|
25 {
|
|
26 my $v = $hashRef->{$k};
|
|
27 my $TE_dir = $dir.$k.'/';
|
|
28
|
|
29 %num_per_overlap_size = (); $overlap_number = 0;
|
|
30 $flag = 0;
|
|
31 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
|
|
32 {
|
|
33 $reverseR = ${$v->[1]}[$i] ;
|
|
34 $begRev = $reverseR->[0];
|
|
35 $endRev = $begRev + length($reverseR->[1]) - 1;
|
|
36
|
|
37 my $revR = reverse($reverseR->[1]);
|
|
38 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
39
|
|
40 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
|
|
41 {
|
|
42 $sensR = ${$v->[0]}[$j];
|
|
43 $begSens = $sensR->[0];
|
|
44 $endSens = $begSens + length($sensR->[1]) - 1;
|
|
45
|
|
46 if ( $begSens <= $endRev && $endSens > $endRev )
|
|
47 {
|
|
48 $flag = 1;
|
|
49 mkdir $TE_dir;
|
|
50 open $txt, '>', $TE_dir.'overlap_size.txt' || die "cannot open repartition\n";
|
|
51
|
|
52 $overlap = $endRev - $begSens + 1;
|
|
53 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
|
|
54 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
|
|
55
|
|
56 if ( $overlap == 10 )
|
|
57 {
|
|
58 $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } = 1;
|
|
59 $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } = 1;
|
|
60 }
|
|
61 next if $overlap > $max;
|
|
62 if ( $snum < $rnum )
|
|
63 {
|
|
64 $num_per_overlap_size{$overlap} += $snum;
|
|
65 $overlap_number += $snum;
|
|
66 }
|
|
67 else
|
|
68 {
|
|
69 $num_per_overlap_size{$overlap} += $rnum ;
|
|
70 $overlap_number += $rnum ;
|
|
71 }
|
|
72 }
|
|
73 }
|
|
74 }
|
|
75 if ( $max != 0 )
|
|
76 {
|
|
77 my @overlaps = ();
|
|
78 push @overlaps_names, $k;
|
|
79 for my $i (1..$max)
|
|
80 {
|
|
81 $num_per_overlap_size{$i} = 0 unless exists( $num_per_overlap_size{$i} );
|
|
82 push @overlaps, $num_per_overlap_size{$i};
|
|
83 }
|
|
84 push @distri_overlap, \@overlaps;
|
|
85 }
|
|
86
|
|
87 if ( $flag == 1 )
|
|
88 {
|
|
89 open $AP, '>', $TE_dir."antisensPPP.txt" || die "cannot create antisensPPP\n";
|
|
90 open $AN, '>', $TE_dir."antisens.txt" || die "cannot create antisens\n";
|
|
91 for ( my $i = 0; $i <= $#{$v->[1]} ; $i++ )
|
|
92 {
|
|
93 $reverseR = ${$v->[1]}[$i] ;
|
|
94 my $revR = reverse($reverseR->[1]);
|
|
95 $revR =~ tr/atgcuATGCU/tacgaTACGA/;
|
|
96 $rnum = $dupRef->{$reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3]};
|
|
97 if ( $hasPpp->{ $reverseR->[0].$reverseR->[1].$reverseR->[2].$reverseR->[3] } == 1 )
|
|
98 {
|
|
99 print $AP ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
|
|
100 }
|
|
101 else
|
|
102 {
|
|
103 print $AN ">$reverseR->[0]|$reverseR->[2]|$reverseR->[3]|$rnum\n$revR\n";
|
|
104 }
|
|
105 }
|
|
106 close $AP; close $AN;
|
|
107
|
|
108 open $SP, '>', $TE_dir."sensPPP.txt" || die "cannot create sensPPP\n";
|
|
109 open $SN, '>', $TE_dir."sens.txt" || die "cannot create sens\n";
|
|
110 for ( my $j = 0; $j <= $#{$v->[0]}; $j++ )
|
|
111 {
|
|
112 $sensR = ${$v->[0]}[$j];
|
|
113 $snum = $dupRef->{$sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3]};
|
|
114 if ( $hasPpp->{ $sensR->[0].$sensR->[1].$sensR->[2].$sensR->[3] } == 1 )
|
|
115 {
|
|
116 print $SP ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
|
|
117 }
|
|
118 else
|
|
119 {
|
|
120 print $SN ">$sensR->[0]|$sensR->[2]|$sensR->[3]|$snum\n$sensR->[1]\n";
|
|
121 }
|
|
122 }
|
|
123 close $SP; close $SN;
|
|
124
|
|
125 my $histo_png = $TE_dir.'histogram.png';
|
|
126 histogram( \%num_per_overlap_size, $histo_png, $overlap_number );
|
|
127 print $txt "size\tnumber\tpercentage of the total overlap number\n";
|
|
128 foreach my $k ( sort {$a <=> $b} keys %num_per_overlap_size )
|
|
129 {
|
|
130 my $percentage = 0;
|
|
131 $percentage = $num_per_overlap_size{$k} * 100 / $overlap_number unless $overlap_number == 0;
|
|
132 print $txt "$k\t$num_per_overlap_size{$k}\t"; printf $txt "%.2f\n",$percentage;
|
|
133 }
|
|
134 close $txt;
|
|
135 }
|
|
136 }
|
|
137
|
|
138 foreach my $tabP ( @distri_overlap )
|
|
139 {
|
|
140 my $sum = sum($tabP);
|
|
141 my $ten = $tabP->[9];
|
|
142 my $mean = mean($tabP);
|
|
143 my $std = standard_deviation($tabP, $mean);
|
|
144 my $zsc = z_significance($ten, $mean, $std);
|
|
145 my $name = shift @overlaps_names;
|
|
146 my $prob = 'NA';
|
|
147 $prob = 1 - &Math::CDF::pnorm( $zsc ) if $zsc ne 'NA';
|
|
148 print $ppp_f (join ("\t", $name, $sum, $ten, $mean, $std, $zsc, $prob ),"\n" );
|
|
149 }
|
|
150 close $ppp_f;
|
|
151 }
|
|
152
|
|
153 sub count_mapped
|
|
154 {
|
|
155 my ( $fai, $in_file ) = @_;
|
|
156 my ( %mapped, %dup, %has_ppp );
|
|
157
|
|
158 open my $f, '<', $fai || die "cannot open $fai $! \n";
|
|
159 while(<$f>)
|
|
160 {
|
|
161 if ($_ =~ /(.*)\t(\d+)\n/)
|
|
162 {
|
|
163 $mapped{$1} = [];
|
|
164 $mapped{$1}->[0] = []; $mapped{$1}->[1] = [];
|
|
165 }
|
|
166 }
|
|
167 close $f;
|
|
168
|
|
169 open my $infile, "samtools view $in_file |"|| die "cannot open input file $! \n";
|
|
170 while(<$infile>)
|
|
171 {
|
|
172 unless ($_ =~ /^\@[A-Za-z][A-Za-z](\t[A-Za-z][A-Za-z0-9]:[ -~]+)+$/ || $_ =~ /^\@CO\t.*/ )
|
|
173 {
|
|
174 my @line = split (/\t/,$_);
|
|
175 if ($line[1] == 0)
|
|
176 {
|
|
177 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
|
|
178 {
|
|
179 push @{$mapped{$line[2]}->[0]} , [$line[3], $line[9], $line[1], $line[2]];
|
|
180 $has_ppp {$line[3].$line[9].$line[1].$line[2]} = 0;
|
|
181 }
|
|
182 $dup{$line[3].$line[9].$line[1].$line[2]}+=1;
|
|
183 }
|
|
184 elsif ($line[1] == 16)
|
|
185 {
|
|
186 unless ( exists ($dup{$line[3].$line[9].$line[1].$line[2]}) )
|
|
187 {
|
|
188 push @{$mapped{$line[2]}->[1]} , [$line[3], $line[9], $line[1], $line[2]];
|
|
189 $has_ppp{$line[3].$line[9].$line[1].$line[2]} = 0;
|
|
190 }
|
|
191 $dup{$line[3].$line[9].$line[1].$line[2]}+=1
|
|
192 }
|
|
193 }
|
|
194 }
|
|
195 close $infile;
|
|
196 return (\%mapped, \%dup, \%has_ppp );
|
|
197 }
|
|
198
|
|
199 sub sum
|
|
200 {
|
|
201 my $arrayref = shift;
|
|
202 my $result = 0;
|
|
203 foreach (@$arrayref) {$result += $_}
|
|
204 return $result;
|
|
205 }
|
|
206
|
|
207 sub mean
|
|
208 {
|
|
209 my $arrayref = shift;
|
|
210 my $result;
|
|
211 foreach (@$arrayref) {$result += $_}
|
|
212 return $result / scalar(@$arrayref);
|
|
213 }
|
|
214
|
|
215 sub standard_deviation
|
|
216 {
|
|
217 my ($arrayref, $mean) = @_;
|
|
218 return sqrt ( mean ( [map $_**2 , @$arrayref ]) - ($mean**2));
|
|
219 }
|
|
220
|
|
221 sub z_significance
|
|
222 {
|
|
223 my ($ten, $mean, $std) = @_;
|
|
224 my $z = 'NA';
|
|
225 $z = (($ten - $mean) / $std) if $std != 0;
|
|
226 return $z;
|
|
227 }
|
|
228
|
|
229 1;
|
|
230
|