Mercurial > repos > bigrna > gpsrna
comparison precursors.pl @ 0:87fe81de0931 draft default tip
Uploaded
author | bigrna |
---|---|
date | Sun, 04 Jan 2015 02:47:25 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:87fe81de0931 |
---|---|
1 #!/usr/bin/perl -w | |
2 #Filename: | |
3 #Author: Tian Dongmei | |
4 #Email: tiandm@big.ac.cn | |
5 #Date: 2013/7/19 | |
6 #Modified: | |
7 #Description: | |
8 my $version=1.00; | |
9 | |
10 use strict; | |
11 use Getopt::Long; | |
12 #use RNA; | |
13 | |
14 my %opts; | |
15 GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); | |
16 if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments | |
17 &usage; | |
18 } | |
19 | |
20 my $checkno=1; | |
21 my $filein=$opts{'map'}; | |
22 my $faout=$opts{'o'}; | |
23 my $strout=$opts{'s'}; | |
24 my $genome= $opts{'g'}; | |
25 | |
26 my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; | |
27 my $flank=defined $opts{'f'}? $opts{'f'} : 10; | |
28 | |
29 my $MAX_ENERGY=-18; | |
30 if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} | |
31 my $MAX_UNPAIR=5; | |
32 my $MIN_PAIR=15; | |
33 my $MAX_SIZEDIFF=4; | |
34 my $MAX_BULGE=2; | |
35 my $ASYMMETRY=5; | |
36 my $MIN_UNPAIR=0; | |
37 my $MIN_SPACE=5; | |
38 my $MAX_SPACE=$maxd; | |
39 my $FLANK=$flank; | |
40 | |
41 ######### load in genome sequences start ######## | |
42 my %genome; | |
43 my %lng; | |
44 my $name; | |
45 open IN,"<$genome"; | |
46 while (my $aline=<IN>) { | |
47 chomp $aline; | |
48 next if($aline=~/^\#/); | |
49 if ($aline=~/^>(\S+)/) { | |
50 $name=$1; | |
51 next; | |
52 } | |
53 $genome{$name} .=$aline; | |
54 } | |
55 close IN; | |
56 foreach my $key (keys %genome) { | |
57 $lng{$key}=length($genome{$key}); | |
58 } | |
59 ####### load in genome sequences end ########## | |
60 | |
61 my %breaks; ### reads number bigger than 3 | |
62 open IN,"<$filein"; #input file | |
63 while (my $aline=<IN>) { | |
64 chomp $aline; | |
65 my @tmp=split/\t/,$aline; | |
66 $tmp[0]=~/_x(\d+)$/; | |
67 my $no=$1; | |
68 next if($no<3); | |
69 #my $trand=&find_strand($tmp[9]); | |
70 #my @pos=split/\.\./,$tmp[5]; | |
71 my $end=$tmp[3]+length($tmp[4])-1; | |
72 if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} | |
73 push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base | |
74 } | |
75 close IN; | |
76 | |
77 my %cites; ### peaks | |
78 foreach my $chr (keys %breaks) { | |
79 foreach my $strand (keys %{$breaks{$chr}}) { | |
80 my @array=@{$breaks{$chr}{$strand}}; | |
81 @array=sort{$a->[0]<=>$b->[0]} @array; | |
82 for (my $i=0;$i<@array;$i++) { | |
83 my $start=$array[$i][0];my $end=$array[$i][1]; | |
84 my @subarray=(); | |
85 push @subarray,$array[$i]; | |
86 | |
87 for (my $j=$i+1;$j<@array;$j++) { | |
88 if ($start<$array[$j][1] && $end>$array[$j][0]) { ###overlap | |
89 push @subarray,$array[$j]; | |
90 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); | |
91 } | |
92 else{ | |
93 $i=$j-1; | |
94 &find_cites(\@subarray,$chr,$strand); | |
95 last; | |
96 } | |
97 } | |
98 } | |
99 } | |
100 } | |
101 | |
102 my %cluster; | |
103 foreach my $chr (keys %cites) { | |
104 foreach my $strand (keys %{$cites{$chr}}) { | |
105 my @array=@{$sites{$chr}{$strand}}; | |
106 @array=sort{$a->[0]<=>$b->[0]} @array; | |
107 for (my $i=0;$i<@array;$i++) { | |
108 my $start=$array[$i][0];my $end=$array[$i][1]; | |
109 my @subarray=(); | |
110 push @subarray,$array[$i]; | |
111 | |
112 for (my $j=$i+1;$j<@array;$j++) { | |
113 if ($end>$array[$j][0]-$maxd) { ###distance less than 200bp | |
114 push @subarray,$array[$j]; | |
115 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); | |
116 } | |
117 else{ | |
118 @{$cluster{$chr}{$strand}{$i}}=@subarray; | |
119 $i=$j-1; | |
120 last; | |
121 } | |
122 } | |
123 } | |
124 | |
125 } | |
126 } | |
127 | |
128 | |
129 open FA,">$faout"; #output file | |
130 open STR,">$strout"; | |
131 foreach my $chr (keys %cluster) { | |
132 foreach my $strand (keys %{$cluster{$chr}}) { | |
133 foreach my $no (keys %{$cluster{$chr}{$strand}}) { | |
134 my @array2=@{$cluster{$chr}{$strand}{$no}}; | |
135 @array2=sort{$a->[0]<=>$b->[0]} @array2; | |
136 &excise(\@array2,$chr,$strand); | |
137 } | |
138 } | |
139 } | |
140 close FA; | |
141 close STR; | |
142 sub oneCiteDn{ | |
143 my ($array,$a,$chr,$strand)=@_; | |
144 | |
145 my $ss=$$array[$a][0]-$flank; | |
146 $ss=0 if($ss<0); | |
147 my $ee=$$array[$a][1]+$maxd+$flank; | |
148 $ee=$lng{$chr} if($ee>$lng{$chr}); | |
149 | |
150 my $seq=substr($genome{$chr},$ss,$ee-$ss+1); | |
151 if($strand eq "-"){$seq=revcom($seq);} | |
152 | |
153 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); | |
154 return $val; | |
155 } | |
156 sub oneCiteUp{ | |
157 my ($array,$a,$chr,$strand)=@_; | |
158 | |
159 my $ss=$$array[$a][0]-$maxd-$flank; | |
160 $ss=0 if($ss<0); | |
161 my $ee=$$array[$a][1]+$flank; | |
162 $ee=$lng{$chr} if($ee>$lng{$chr}); | |
163 | |
164 my $seq=substr($genome{$chr},$ss,$ee-$ss+1); | |
165 if($strand eq "-"){$seq=revcom($seq);} | |
166 | |
167 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); | |
168 return $val; | |
169 | |
170 } | |
171 | |
172 sub twoCites{ | |
173 my ($array,$a,$b,$chr,$strand)=@_; | |
174 | |
175 my $ss=$$array[$a][0]-$flank; | |
176 $ss=0 if($ss<0); | |
177 my $ee=$$array[$b][1]+$flank; | |
178 $ee=$lng{$chr} if($ee>$lng{$chr}); | |
179 | |
180 my $seq=substr($genome{$chr},$ss,$ee-$ss+1); | |
181 if($strand eq "-"){$seq=revcom($seq);} | |
182 | |
183 # my( $str,$mfe)=RNA::fold($seq); | |
184 # return 0 if($mfe>$MAX_ENERGY); ### minimum mfe | |
185 my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); | |
186 | |
187 return $val; | |
188 | |
189 } | |
190 sub excise{ | |
191 my ($cluster,$chr,$strand)=@_; | |
192 | |
193 if(@{$cluster}==1){ | |
194 $ok=&oneCiteDn($cluster,0,$chr,$strand); | |
195 $ok=&oneCiteUp($cluster,0,$chr,$strand); | |
196 }else{ | |
197 my $peak_pos=0; | |
198 | |
199 for (my $i=0;$i<@{$cluster};$i++) { | |
200 if($$cluster[$i][2]>$$cluster[$peak_pos][2]){$peak_pos=$i;} | |
201 } | |
202 | |
203 my $ok=0; | |
204 for (my $i=0;$i<@{$cluster};$i++) { | |
205 next if($i==$peak_pos); | |
206 if($i<$peak_pos){$ok=&twoCites($cluster,$i,$peak_pos,$chr,$strand);} | |
207 else{$ok=&twoCites($cluster,$peak_pos,$i,$chr,$strand);} | |
208 last if($ok); | |
209 } | |
210 if (!$ok) { | |
211 $ok=&oneCiteDn($cluster,$peak_pos,$chr,$strand); | |
212 $ok=&oneCiteUp($cluster,$peak_pos,$chr,$strand); | |
213 } | |
214 | |
215 } | |
216 } | |
217 | |
218 sub ffw2{ | |
219 my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; | |
220 | |
221 my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns | |
222 if ($N_count > 5) { | |
223 return 0; | |
224 } | |
225 | |
226 my $seq_length=length $seq; | |
227 # position tag1 and tag2 | |
228 my $tag1_beg=index($seq,$tag1,0)+1; | |
229 if ($tag1_beg < 1) { | |
230 warn "[ffw2] coordinate error.\n"; | |
231 # $fold->{reason}="coordinate error"; | |
232 return 0; | |
233 } | |
234 my $tag2_beg=index($seq,$tag2,0)+1; | |
235 if ($tag2_beg < 1) { | |
236 warn "[ffw2] coordinate error.\n"; | |
237 # $fold->{reason}="coordinate error"; | |
238 return 0; | |
239 } | |
240 if ($tag2_beg < $tag1_beg) { | |
241 # swap tag1 and tag2 | |
242 ($tag1,$tag2)=($tag2,$tag1); | |
243 ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); | |
244 } | |
245 my $tag1_end=$tag1_beg+length($tag1)-1; | |
246 my $tag2_end=$tag2_beg+length($tag2)-1; | |
247 # re-clipping | |
248 my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; | |
249 my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; | |
250 $seq=substr($seq,$beg-1,$end-$beg+1); | |
251 $seq_length=length $seq; | |
252 # re-reposition | |
253 $tag1_beg=index($seq,$tag1,0)+1; | |
254 if ($tag1_beg < 1) { | |
255 warn "[ffw2] coordinate error.\n"; | |
256 # $fold->{reason}="coordinate error"; | |
257 return 0; | |
258 } | |
259 | |
260 $tag2_beg=index($seq,$tag2,0)+1; | |
261 if ($tag2_beg < 1) { | |
262 warn "[ffw2] coordinate error.\n"; | |
263 # $fold->{reason}="coordinate error"; | |
264 return 0; | |
265 } | |
266 $tag1_end=$tag1_beg+length($tag1)-1; | |
267 $tag2_end=$tag2_beg+length($tag2)-1; | |
268 | |
269 # fold | |
270 #my ($struct,$mfe)=RNA::fold($seq); | |
271 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; | |
272 my @rawfolds=split/\s+/,$rnafold; | |
273 my $struct=$rawfolds[1]; | |
274 my $mfe=$rawfolds[-1]; | |
275 $mfe=~s/\(//; | |
276 $mfe=~s/\)//; | |
277 #$mfe=sprintf "%.2f", $mfe; | |
278 if ($mfe > $MAX_ENERGY) {return 0;} | |
279 | |
280 # tag1 | |
281 my $tag1_length=$tag1_end-$tag1_beg+1; | |
282 my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); | |
283 my $tag1_arm=which_arm($tag1_struct); | |
284 my $tag1_unpair=$tag1_struct=~tr/.//; | |
285 my $tag1_pair=$tag1_length-$tag1_unpair; | |
286 my $tag1_max_bulge=biggest_bulge($tag1_struct); | |
287 if ($tag1_arm ne "5p") { return 0;} # tag not in stem | |
288 # if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} | |
289 if ($tag1_pair < $MIN_PAIR) {return 0;} | |
290 if ($tag1_max_bulge > $MAX_BULGE) {return 0;} | |
291 | |
292 # tag2 | |
293 my $tag2_length=$tag2_end-$tag2_beg+1; | |
294 my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); | |
295 my $tag2_arm=which_arm($tag2_struct); | |
296 my $tag2_unpair=$tag2_struct=~tr/.//; | |
297 my $tag2_pair=$tag2_length-$tag2_unpair; | |
298 my $tag2_max_bulge=biggest_bulge($tag2_struct); | |
299 if ($tag2_arm ne "3p") {return 0;} # star not in stem | |
300 # if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} | |
301 if ($tag2_pair < $MIN_PAIR) {return 0;} | |
302 if ($tag2_max_bulge > $MAX_BULGE) {return 0;} | |
303 | |
304 # space size between miR and miR* | |
305 my $space=$tag2_beg-$tag1_end-1; | |
306 if ($space < $MIN_SPACE) {return 0;} | |
307 if ($space > $MAX_SPACE) {return 0;} | |
308 | |
309 # size diff of miR and miR* | |
310 my $size_diff=abs($tag1_length-$tag2_length); | |
311 if ($size_diff > $MAX_SIZEDIFF) {return 0;} | |
312 | |
313 # build base pairing table | |
314 my %pairtable; | |
315 &parse_struct($struct,\%pairtable); # coords count from 1 | |
316 | |
317 my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); | |
318 my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); | |
319 my $asy=($asy1 < $asy2) ? $asy1 : $asy2; | |
320 if ($asy > $ASYMMETRY) {return 0} | |
321 | |
322 # duplex fold, determine whether two matures like a miR/miR* ike duplex | |
323 my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); | |
324 # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context | |
325 my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); | |
326 if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { | |
327 return 0; | |
328 } | |
329 | |
330 print FA ">$chr:$strand:$ss..$ee\n$seq\n"; | |
331 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; | |
332 | |
333 return 1; | |
334 } | |
335 | |
336 sub ffw1{ | |
337 my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; | |
338 my $pass=0; | |
339 | |
340 my $N_count=$seq=~tr/N//; | |
341 if ($N_count > 5) { | |
342 return 0; | |
343 } | |
344 | |
345 my $seq_length=length $seq; | |
346 my $tag_length=length $tag; | |
347 | |
348 # position | |
349 my $tag_beg=index($seq,$tag,0)+1; | |
350 if ($tag_beg < 1) { | |
351 warn "[ffw1] coordinate error.\n"; | |
352 return $pass; | |
353 } | |
354 my $tag_end=$tag_beg+length($tag)-1; | |
355 | |
356 | |
357 # define candidate precursor by hybrid short arm to long arm, not solid enough | |
358 my($beg,$end)=define_precursor($seq,$tag); | |
359 if (not defined $beg) { | |
360 return $pass; | |
361 } | |
362 if (not defined $end) { | |
363 return $pass; | |
364 } | |
365 $seq=substr($seq,$beg-1,$end-$beg+1); | |
366 $seq_length=length $seq; | |
367 | |
368 | |
369 # fold | |
370 #my ($struct,$mfe)=RNA::fold($seq); | |
371 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; | |
372 my @rawfolds=split/\s+/,$rnafold; | |
373 my $struct=$rawfolds[1]; | |
374 my $mfe=$rawfolds[-1]; | |
375 $mfe=~s/\(//; | |
376 $mfe=~s/\)//; | |
377 | |
378 if ($mfe > $MAX_ENERGY) { | |
379 $pass=0; | |
380 return $pass; | |
381 } | |
382 | |
383 # reposition | |
384 $tag_beg=index($seq,$tag,0)+1; | |
385 if ($tag_beg < 1) { | |
386 warn "[ffw1] coordinate error.\n"; | |
387 return 0; | |
388 } | |
389 $tag_end=$tag_beg+length($tag)-1; | |
390 | |
391 my $tag_struct=substr($struct,$tag_beg-1,$tag_length); | |
392 my $tag_arm=which_arm($tag_struct); | |
393 my $tag_unpair=$tag_struct=~tr/.//; | |
394 my $tag_pair=$tag_length-$tag_unpair; | |
395 my $tag_max_bulge=biggest_bulge($tag_struct); | |
396 if ($tag_arm eq "-") { return $pass;} | |
397 # if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} | |
398 if ($tag_pair < $MIN_PAIR) { return $pass;} | |
399 if ($tag_max_bulge > $MAX_BULGE) {return $pass;} | |
400 | |
401 # build base pairing table | |
402 my %pairtable; | |
403 &parse_struct($struct,\%pairtable); # coords count from 1 | |
404 | |
405 # get star | |
406 my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); | |
407 my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); | |
408 my $star_length=$star_end-$star_beg+1; | |
409 my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); | |
410 my $star_arm=which_arm($star_struct); | |
411 my $star_unpair=$star_struct=~tr/.//; | |
412 my $star_pair=$star_length-$star_unpair; | |
413 my $star_max_bulge=biggest_bulge($star_struct); | |
414 if ($star_arm eq "-") { return $pass;} | |
415 # if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} | |
416 if ($star_pair < $MIN_PAIR) {return $pass;} | |
417 if ($star_max_bulge > $MAX_BULGE) {return $pass;} | |
418 | |
419 if ($tag_arm eq $star_arm) {return $pass;} | |
420 | |
421 # space size between miR and miR* | |
422 my $space; | |
423 if ($tag_beg < $star_beg) { | |
424 $space=$star_beg-$tag_end-1; | |
425 } | |
426 else { | |
427 $space=$tag_beg-$star_end-1; | |
428 } | |
429 if ($space < $MIN_SPACE) { return $pass;} | |
430 if ($space > $MAX_SPACE) { return $pass;} | |
431 | |
432 # size diff | |
433 my $size_diff=abs($tag_length-$star_length); | |
434 if ($size_diff > $MAX_SIZEDIFF) { return $pass;} | |
435 | |
436 # asymmetry | |
437 my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); | |
438 if ($asy > $ASYMMETRY) {return $pass;} | |
439 | |
440 $pass=1; | |
441 print FA ">$chr:$strand:$ss..$ee\n$seq\n"; | |
442 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; | |
443 return $pass; | |
444 | |
445 } | |
446 sub get_star { | |
447 my($table,$beg,$end)=@_; | |
448 | |
449 my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 | |
450 foreach my $i ($beg..$end) { | |
451 if (defined $table->{$i}) { | |
452 my $j=$table->{$i}; | |
453 $s1=$i; | |
454 $s2=$j; | |
455 last; | |
456 } | |
457 } | |
458 foreach my $i (reverse ($beg..$end)) { | |
459 if (defined $table->{$i}) { | |
460 my $j=$table->{$i}; | |
461 $e1=$i; | |
462 $e2=$j; | |
463 last; | |
464 } | |
465 } | |
466 # print "$s1,$e1 $s2,$e2\n"; | |
467 | |
468 # correct terminus | |
469 my $off1=$s1-$beg; | |
470 my $off2=$end-$e1; | |
471 $s2+=$off1; | |
472 $s2+=2; # 081009 | |
473 $e2-=$off2; $e2=1 if $e2 < 1; | |
474 $e2+=2; $e2=1 if $e2 < 1; # 081009 | |
475 ($s2,$e2)=($e2,$s2) if ($s2 > $e2); | |
476 return ($s2,$e2); | |
477 } | |
478 | |
479 sub define_precursor { | |
480 my $seq=shift; | |
481 my $tag=shift; | |
482 | |
483 my $seq_length=length $seq; | |
484 my $tag_length=length $tag; | |
485 my $tag_beg=index($seq,$tag,0)+1; | |
486 my $tag_end=$tag_beg+$tag_length-1; | |
487 | |
488 # split the candidate region into short arm and long arm | |
489 my $tag_arm; | |
490 my ($larm,$larm_beg,$larm_end); | |
491 my ($sarm,$sarm_beg,$sarm_end); | |
492 if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm | |
493 $sarm=substr($seq,0,$tag_end); | |
494 $larm=substr($seq,$tag_end); | |
495 $sarm_beg=1; | |
496 $sarm_end=$tag_end; | |
497 $larm_beg=$tag_end+1; | |
498 $larm_end=$seq_length; | |
499 $tag_arm="5p"; | |
500 } | |
501 else { | |
502 $larm=substr($seq,0,$tag_beg-1); # on 3' arm | |
503 $sarm=substr($seq,$tag_beg-1); | |
504 $larm_beg=1; | |
505 $larm_end=$tag_beg-1; | |
506 $sarm_beg=$tag_beg; | |
507 $sarm_end=$seq_length; | |
508 $tag_arm="3p"; | |
509 } | |
510 | |
511 # print "$sarm_beg,$sarm_end $sarm\n"; | |
512 # print "$larm_beg,$larm_end $larm\n"; | |
513 | |
514 # clipping short arm | |
515 if ($tag_arm eq "5p") { | |
516 $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; | |
517 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); | |
518 } | |
519 else { | |
520 $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; | |
521 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); | |
522 } | |
523 # print "$sarm_beg,$sarm_end $sarm\n"; | |
524 # print "$larm_beg,$larm_end $larm\n"; | |
525 | |
526 # define the precursor by hybriding short arm to long arm | |
527 =cut #modify in 2014-10-28 | |
528 my $duplex=RNA::duplexfold($sarm,$larm); | |
529 my $struct=$duplex->{structure}; | |
530 my $energy=sprintf "%.2f", $duplex->{energy}; | |
531 my ($str1,$str2)=split(/&/,$struct); | |
532 my $pair=$str1=~tr/(//; | |
533 # print "pair=$pair\n"; | |
534 my $beg1=$duplex->{i}+1-length($str1); | |
535 my $end1=$duplex->{i}; | |
536 my $beg2=$duplex->{j}; | |
537 my $end2=$duplex->{j}+length($str2)-1; | |
538 =cut | |
539 ###### new codes begin | |
540 my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`; | |
541 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) | |
542 my @tmpduplex=split/\s+/,$duplex; | |
543 my $struct=$tmpduplex[0]; | |
544 $tmpduplex[-1]=~s/[(|)]//g; | |
545 my $energy=$tmpduplex[-1]; | |
546 my ($str1,$str2)=split(/&/,$struct); | |
547 my $pair=$str1=~tr/(//; | |
548 my ($beg1,$end1)=split/,/,$tmpduplex[1]; | |
549 my ($beg2,$end2)=split/,/,$tmpduplex[3]; | |
550 ######## new codes end | |
551 | |
552 # print "$beg1:$end1 $beg2:$end2\n"; | |
553 # transform coordinates | |
554 $beg1=$beg1+$sarm_beg-1; | |
555 $end1=$end1+$sarm_beg-1; | |
556 $beg2=$beg2+$larm_beg-1; | |
557 $end2=$end2+$larm_beg-1; | |
558 # print "$beg1:$end1 $beg2:$end2\n"; | |
559 | |
560 my $off5p=$beg1-$sarm_beg; | |
561 my $off3p=$sarm_end-$end1; | |
562 $beg2-=$off3p; $beg2=1 if $beg2 < 1; | |
563 $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; | |
564 | |
565 # print "$beg1:$end1 $beg2:$end2\n"; | |
566 | |
567 my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; | |
568 my $end=$sarm_end > $end2 ? $sarm_end : $end2; | |
569 | |
570 return if $pair < $MIN_PAIR; | |
571 # print "$beg,$end\n"; | |
572 return ($beg,$end); | |
573 } | |
574 | |
575 | |
576 # duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex | |
577 sub likeMirDuplex1 { | |
578 my $seq1=shift; | |
579 my $seq2=shift; | |
580 my $like_mir_duplex=1; | |
581 | |
582 my $length1=length $seq1; | |
583 my $length2=length $seq2; | |
584 =cut | |
585 my $duplex=RNA::duplexfold($seq1, $seq2); | |
586 my $duplex_struct=$duplex->{structure}; | |
587 my $duplex_energy=sprintf "%.2f", $duplex->{energy}; | |
588 my ($str1,$str2)=split(/&/,$duplex_struct); | |
589 my $beg1=$duplex->{i}+1-length($str1); | |
590 my $end1=$duplex->{i}; | |
591 my $beg2=$duplex->{j}; | |
592 my $end2=$duplex->{j}+length($str2)-1; | |
593 =cut | |
594 my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`; | |
595 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) | |
596 my @tmpduplex=split/\s+/,$duplex; | |
597 my $duplex_struct=$tmpduplex[0]; | |
598 $tmpduplex[-1]=~s/[(|)]//g; | |
599 my $duplex_energy=$tmpduplex[-1]; | |
600 my ($str1,$str2)=split(/&/,$duplex_struct); | |
601 #my $pair=$str1=~tr/(//; | |
602 my ($beg1,$end1)=split/,/,$tmpduplex[1]; | |
603 my ($beg2,$end2)=split/,/,$tmpduplex[3]; | |
604 | |
605 # revise beg1, end1, beg2, end2 | |
606 $str1=~/^(\.*)/; | |
607 $beg1+=length($1); | |
608 $str1=~/(\.*)$/; | |
609 $end1-=length($1); | |
610 $str2=~/^(\.*)/; | |
611 $beg2+=length($1); | |
612 $str2=~/(\.*)$/; | |
613 $end2-=length($1); | |
614 | |
615 my $pair_num=$str1=~tr/(//; | |
616 my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom | |
617 my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck | |
618 # print $pair_num,"\n"; | |
619 # print $overhang1,"\n"; | |
620 # print $overhang2,"\n"; | |
621 if ($pair_num < 13) { | |
622 $like_mir_duplex=0; | |
623 } | |
624 if ($overhang1 < 0 || $overhang2 < 0 ) { | |
625 $like_mir_duplex=0; | |
626 } | |
627 if ($overhang1 > 4 || $overhang2 > 4) { | |
628 $like_mir_duplex=0; | |
629 } | |
630 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); | |
631 } | |
632 | |
633 # judge whether two matures form miR/miR* duplex, in hairpin context | |
634 sub likeMirDuplex2 { | |
635 my ($table,$beg1,$end1,$beg2,$end2)=@_; | |
636 my $like_mir_duplex=1; | |
637 | |
638 # s1 e1 | |
639 # 5 ----------------------------3 | |
640 # | | |||| ||| | | |
641 #3 -------------------------------5 | |
642 # e2 s2 | |
643 | |
644 my $pair_num=0; | |
645 my $overhang1=0; | |
646 my $overhang2=0; | |
647 my ($s1,$e1,$s2,$e2); | |
648 foreach my $i ($beg1..$end1) { | |
649 if (defined $table->{$i}) { | |
650 my $j=$table->{$i}; | |
651 if ($j <= $end2 && $j >= $beg2) { | |
652 $s1=$i; | |
653 $e2=$j; | |
654 last; | |
655 } | |
656 } | |
657 } | |
658 foreach my $i (reverse ($beg1..$end1)) { | |
659 if (defined $table->{$i}) { | |
660 my $j=$table->{$i}; | |
661 if ($j <= $end2 && $j >= $beg2) { | |
662 $e1=$i; | |
663 $s2=$j; | |
664 last; | |
665 } | |
666 } | |
667 } | |
668 | |
669 # print "$beg1,$end1 $s1,$e1\n"; | |
670 # print "$beg2,$end2 $s2,$e2\n"; | |
671 | |
672 foreach my $i ($beg1..$end1) { | |
673 if (defined $table->{$i}) { | |
674 my $j=$table->{$i}; | |
675 if ($j <= $end2 && $j >= $beg2) { | |
676 ++$pair_num; | |
677 } | |
678 } | |
679 } | |
680 if (defined $s1 && defined $e2) { | |
681 $overhang1=($end2-$e2)-($s1-$beg1); | |
682 } | |
683 if (defined $e1 && defined $s2) { | |
684 $overhang2=($end1-$e1)-($s2-$beg2); | |
685 } | |
686 | |
687 if ($pair_num < 13) { | |
688 $like_mir_duplex=0; | |
689 } | |
690 if ($overhang1 < 0 && $overhang2 < 0) { | |
691 $like_mir_duplex=0; | |
692 } | |
693 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); | |
694 } | |
695 sub parse_struct { | |
696 my $struct=shift; | |
697 my $table=shift; | |
698 | |
699 my @t=split('',$struct); | |
700 my @lbs; # left brackets | |
701 foreach my $k (0..$#t) { | |
702 if ($t[$k] eq "(") { | |
703 push @lbs, $k+1; | |
704 } | |
705 elsif ($t[$k] eq ")") { | |
706 my $lb=pop @lbs; | |
707 my $rb=$k+1; | |
708 $table->{$lb}=$rb; | |
709 $table->{$rb}=$lb; | |
710 } | |
711 } | |
712 if (@lbs) { | |
713 warn "unbalanced RNA struct.\n"; | |
714 } | |
715 } | |
716 sub which_arm { | |
717 my $substruct=shift; | |
718 my $arm; | |
719 if ($substruct=~/\(/ && $substruct=~/\)/) { | |
720 $arm="-"; | |
721 } | |
722 elsif ($substruct=~/\(/) { | |
723 $arm="5p"; | |
724 } | |
725 else { | |
726 $arm="3p"; | |
727 } | |
728 return $arm; | |
729 } | |
730 sub biggest_bulge { | |
731 my $struct=shift; | |
732 my $bulge_size=0; | |
733 my $max_bulge=0; | |
734 while ($struct=~/(\.+)/g) { | |
735 $bulge_size=length $1; | |
736 if ($bulge_size > $max_bulge) { | |
737 $max_bulge=$bulge_size; | |
738 } | |
739 } | |
740 return $max_bulge; | |
741 } | |
742 sub get_asy { | |
743 my($table,$a1,$a2)=@_; | |
744 my ($pre_i,$pre_j); | |
745 my $asymmetry=0; | |
746 foreach my $i ($a1..$a2) { | |
747 if (defined $table->{$i}) { | |
748 my $j=$table->{$i}; | |
749 if (defined $pre_i && defined $pre_j) { | |
750 my $diff=($i-$pre_i)+($j-$pre_j); | |
751 $asymmetry += abs($diff); | |
752 } | |
753 $pre_i=$i; | |
754 $pre_j=$j; | |
755 } | |
756 } | |
757 return $asymmetry; | |
758 } | |
759 | |
760 sub peaks{ | |
761 my @cluster=@{$_[0]}; | |
762 | |
763 return if(@cluster<1); | |
764 | |
765 my $max=0; my $index=-1; | |
766 for (my $i=0;$i<@cluster;$i++) { | |
767 if($cluster[$i][2]>$max){ | |
768 $max=$cluster[$i][2]; | |
769 $index=$i; | |
770 } | |
771 } | |
772 # &excise(\@cluster,$index,$_[1],$_[2]); | |
773 return($index); | |
774 } | |
775 | |
776 sub find_cites{ | |
777 my @tmp=@{$_[0]}; | |
778 my $i=&peaks(\@tmp); | |
779 | |
780 my $start=$tmp[$i][0]; | |
781 my $total=0; my $node5=0; | |
782 for (my $j=0;$j<@tmp ;$j++) { | |
783 $total+=$tmp[$j][2]; | |
784 $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); | |
785 } | |
786 push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); | |
787 } | |
788 | |
789 sub newpos{ | |
790 my ($a,$b,$c,$d)=@_; | |
791 my $s= $a>$c ? $c : $a; | |
792 my $e=$b>$d ? $b : $d; | |
793 return($s,$e); | |
794 } | |
795 | |
796 sub rev{ | |
797 | |
798 my($sequence)=@_; | |
799 | |
800 my $rev=reverse $sequence; | |
801 | |
802 return $rev; | |
803 } | |
804 | |
805 sub com{ | |
806 | |
807 my($sequence)=@_; | |
808 | |
809 $sequence=~tr/acgtuACGTU/TGCAATGCAA/; | |
810 | |
811 return $sequence; | |
812 } | |
813 | |
814 sub revcom{ | |
815 | |
816 my($sequence)=@_; | |
817 | |
818 my $revcom=rev(com($sequence)); | |
819 | |
820 return $revcom; | |
821 } | |
822 | |
823 sub find_strand{ | |
824 | |
825 #A subroutine to find the strand, parsing different blast formats | |
826 my($other)=@_; | |
827 | |
828 my $strand="+"; | |
829 | |
830 if($other=~/-/){ | |
831 $strand="-"; | |
832 } | |
833 | |
834 if($other=~/minus/i){ | |
835 $strand="-"; | |
836 } | |
837 | |
838 return($strand); | |
839 } | |
840 sub usage{ | |
841 print <<"USAGE"; | |
842 Version $version | |
843 Usage: | |
844 $0 -map -g -d -f -o -s -e | |
845 options: | |
846 -map input file# align result # bst. format | |
847 -g input file # genome sequence fasta format | |
848 -d <int> Maximal space between miRNA and miRNA* (200) | |
849 -f <int> Flank sequence length of miRNA precursor (10) | |
850 -o output file# percursor fasta file | |
851 -s output file# precursor structure file | |
852 -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) | |
853 | |
854 -h help | |
855 USAGE | |
856 exit(1); | |
857 } | |
858 |