0
|
1 #!/usr/bin/perl -w
|
|
2 #Filename:
|
|
3 #Author: Tian Dongmei
|
|
4 #Email: tiandm@big.ac.cn
|
|
5 #Date: 2013/7/19
|
|
6 #Modified:
|
|
7 #Description:
|
|
8 my $version=1.00;
|
|
9
|
|
10 use strict;
|
|
11 use Getopt::Long;
|
|
12 #use Math::Cephes qw(:hypergeometrics);
|
|
13
|
|
14 my %opts;
|
|
15 GetOptions(\%opts,"i=s","o=s","h");
|
|
16 if (!(defined $opts{i} and defined $opts{o} ) || defined $opts{h}) { #necessary arguments
|
|
17 &usage;
|
|
18 }
|
|
19
|
|
20 my $filein=$opts{'i'};
|
|
21 my $fileout=$opts{'o'};
|
|
22
|
|
23 open IN,"<$filein"; #input file
|
|
24 open OUT,">$fileout"; #output file
|
|
25
|
|
26 while (my $aline=<IN>) {
|
|
27 chomp $aline;
|
|
28 if ($aline=~/^\#/) {
|
|
29 print OUT $aline,"\tp-value\n";
|
|
30 next;
|
|
31 }
|
|
32 my @tmp=split/\t/,$aline;
|
|
33 my @pos=split/:|-/,$tmp[0];
|
|
34 $tmp[1]=~s/nt//;
|
|
35 my $pv=&phase($tmp[1],$pos[1],$pos[2],$tmp[4]);
|
|
36
|
|
37 print OUT $aline,"\t",$pv,"\n";
|
|
38 }
|
|
39 close IN;
|
|
40 close OUT;
|
|
41
|
|
42 sub phase{
|
|
43 my ($tagL,$start,$end,$tags)=@_;
|
|
44 my @tmp=split/\;/,$tags;
|
|
45 my %tag;
|
|
46 for (my $i=0;$i<@tmp;$i++) {
|
|
47 my @aa=split/\,/,$tmp[$i];
|
|
48 next if($aa[1]-$aa[0]+1 != $tagL);
|
|
49 # $tag{$aa[0].",".$aa[2]}+=$aa[3] if($aa[2] eq "+");
|
|
50 # $tag{($aa[1]).",".$aa[2]}+=$aa[3] if($aa[2] eq "-");
|
|
51 $tag{$aa[0]}+=$aa[3] if($aa[2] eq "+");
|
|
52 $tag{($aa[1]+3)}+=$aa[3] if($aa[2] eq "-");
|
|
53 }
|
|
54
|
|
55 my $pv=&pvalue2(\%tag,$tagL,$start,$end);
|
|
56
|
|
57 return $pv;
|
|
58 }
|
|
59
|
|
60 sub pvalue2{
|
|
61 my ($tag,$tagL,$start,$end)=@_;
|
|
62
|
|
63 my $p=1; my $pp=1;
|
|
64 foreach my $ccs(keys %{$tag}){
|
|
65 my $n=0;
|
|
66 my $k=0;
|
|
67 my $K=0;
|
|
68 my $N=0;
|
|
69
|
|
70 my $cor= $ccs;
|
|
71 my $ss=$cor;
|
|
72 my $ee=($cor+$tagL*10-1)<$end ? $cor+$tagL*10-1 : $end;
|
|
73
|
|
74 my $max=0;
|
|
75 for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
|
|
76 {
|
|
77 my $x=$i;
|
|
78 if (defined $$tag{$x})
|
|
79 {
|
|
80 if ($max<$$tag{$x}) {$max=$$tag{$x};}
|
|
81 $n +=$$tag{$x};
|
|
82 $N++;
|
|
83 }
|
|
84 }
|
|
85 for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
|
|
86 {
|
|
87 my $x=$i;
|
|
88 if (defined $$tag{$x})
|
|
89 {
|
|
90 $k +=$$tag{$x};
|
|
91 $K++;
|
|
92 }
|
|
93 }
|
|
94
|
|
95
|
|
96 return $p if($K<3);
|
|
97 return $p if($max/$n>0.8);
|
|
98
|
|
99 my $pn=0;
|
|
100 next if($n==$k);
|
|
101 $pn=10*$k/($n-$k)+1;
|
|
102 $pn = $pn ** ($K-2);
|
|
103 $pn = log($pn);
|
|
104 if ($p<$pn) {
|
|
105 $p=$pn;
|
|
106 }
|
|
107
|
|
108 }
|
|
109
|
|
110 return $p;
|
|
111
|
|
112 }
|
|
113
|
|
114 sub pvalue{
|
|
115 my ($tag,$tagL,$start,$end)=@_;
|
|
116
|
|
117 my $p=1;
|
|
118 foreach my $ccs(keys %{$tag}){
|
|
119 my $n=-1;
|
|
120 my $k=-1;
|
|
121
|
|
122 my ($cor, $str)=split(/,/, $ccs);
|
|
123 if ($str eq "+") # small RNAs on the Watson strand
|
|
124 {
|
|
125 my $ss=$cor;
|
|
126 my $ee=($cor+$tagL*11-1)<$end ? $cor+$tagL*11-1 : $end;
|
|
127 for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
|
|
128 {
|
|
129 my $x=$i.","."+";
|
|
130 if (defined $$tag{$x})
|
|
131 {
|
|
132 $n=$n+1;
|
|
133 }
|
|
134 }
|
|
135 for (my $i=$ss; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
|
|
136 {
|
|
137 my $x=$i.","."+";
|
|
138 if (defined $$tag{$x})
|
|
139 {
|
|
140 $k=$k+1;
|
|
141 }
|
|
142 }
|
|
143
|
|
144 for (my $j=$ss-2; $j<=$ee-2; $j++) # calculate n on the antisense strand
|
|
145 {
|
|
146 my $x=$j.","."-";
|
|
147 if (defined $$tag{$x})
|
|
148 {
|
|
149 $n=$n+1;
|
|
150 }
|
|
151 }
|
|
152
|
|
153 for (my $j=$ss+$tagL-2; $j<=$ee-2; $j=$j+$tagL) # calculate k on the antisense strand
|
|
154 {
|
|
155 my $x=$j.","."-";
|
|
156 if (defined $$tag{$x})
|
|
157 {
|
|
158 $k=$k+1;
|
|
159 }
|
|
160 }
|
|
161 }
|
|
162
|
|
163 elsif ($str eq "-") # small RNAs on the Crick strand
|
|
164 {
|
|
165 my $ee=$cor;
|
|
166 my $ss=$cor-$tagL*11+1> $start ? $cor-$tagL*11+1 : $start;
|
|
167 for (my $i=$ss; $i<=$ee; $i++) # calculate n on the sense strand
|
|
168 {
|
|
169 my $x=$i.","."-";
|
|
170 if (defined $$tag{$x})
|
|
171 {
|
|
172 $n=$n+1;
|
|
173 }
|
|
174 }
|
|
175 for (my $i=$ss+$tagL-1; $i<=$ee; $i=$i+$tagL) # calculate k on the sense strand
|
|
176 {
|
|
177 my $x=$i.","."-";
|
|
178 if (defined $$tag{$x})
|
|
179 {
|
|
180 $k=$k+1;
|
|
181 }
|
|
182 }
|
|
183
|
|
184 for (my $j=$ss+2; $j<=$ee+2; $j++) # calculate n on the antisense strand
|
|
185 {
|
|
186 my $x=$j.","."+";
|
|
187 if (defined $$tag{$x})
|
|
188 {
|
|
189 $n=$n+1;
|
|
190 }
|
|
191 }
|
|
192 for (my $j=$ss+2; $j<=$ee+2; $j=$j+$tagL) # calculate k on the antisense strand
|
|
193 {
|
|
194 my $x=$j.","."+";
|
|
195 if (defined $$tag{$x})
|
|
196 {
|
|
197 $k=$k+1;
|
|
198 }
|
|
199 }
|
|
200 }
|
|
201
|
|
202 next if($k<3);
|
|
203
|
|
204 my $pn=0; my $N=$tagL*11*2-1; my $M=21;
|
|
205 for (my $w=$k; $w<=$M; $w++) # calculate p-value from n and k
|
|
206 {
|
|
207 my $c=1;
|
|
208 my $rr=1;
|
|
209 my $rw=1;
|
|
210
|
|
211 for (my $j=0; $j<=$w-1; $j++)
|
|
212 {
|
|
213 $c=$c*($M-$j)/($j+1);
|
|
214 }
|
|
215 for (my $x=0; $x<=$n-$w-1; $x++)
|
|
216 {
|
|
217 $rr=$rr*($N-$M-$x)/($x+1);
|
|
218 }
|
|
219 for (my $y=0; $y<=$n-1; $y++)
|
|
220 {
|
|
221 $rw=$rw*($y+1)/($N-$y);
|
|
222 }
|
|
223 my $pr=$c*$rr*$rw;
|
|
224
|
|
225 $pn=$pn+$pr;
|
|
226 }
|
|
227
|
|
228 $p=$pn<$p ? $pn :$p;
|
|
229
|
|
230 if ($p<0.001) #select and output small RNA clusters with p<0.001
|
|
231
|
|
232 {
|
|
233
|
|
234 return $p;
|
|
235
|
|
236 }
|
|
237
|
|
238 }
|
|
239 return $p;
|
|
240 }
|
|
241
|
|
242 sub usage{
|
|
243 print <<"USAGE";
|
|
244 Version $version
|
|
245 Usage:
|
|
246 $0 -i -o
|
|
247 options:
|
|
248 -i input file
|
|
249 -o output file
|
|
250 -h help
|
|
251 USAGE
|
|
252 exit(1);
|
|
253 }
|
|
254
|