comparison precursors.pl @ 0:87fe81de0931 draft default tip

Uploaded
author bigrna
date Sun, 04 Jan 2015 02:47:25 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:87fe81de0931
1 #!/usr/bin/perl -w
2 #Filename:
3 #Author: Tian Dongmei
4 #Email: tiandm@big.ac.cn
5 #Date: 2013/7/19
6 #Modified:
7 #Description:
8 my $version=1.00;
9
10 use strict;
11 use Getopt::Long;
12 #use RNA;
13
14 my %opts;
15 GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h");
16 if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments
17 &usage;
18 }
19
20 my $checkno=1;
21 my $filein=$opts{'map'};
22 my $faout=$opts{'o'};
23 my $strout=$opts{'s'};
24 my $genome= $opts{'g'};
25
26 my $maxd=defined $opts{'d'} ? $opts{'d'} : 200;
27 my $flank=defined $opts{'f'}? $opts{'f'} : 10;
28
29 my $MAX_ENERGY=-18;
30 if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};}
31 my $MAX_UNPAIR=5;
32 my $MIN_PAIR=15;
33 my $MAX_SIZEDIFF=4;
34 my $MAX_BULGE=2;
35 my $ASYMMETRY=5;
36 my $MIN_UNPAIR=0;
37 my $MIN_SPACE=5;
38 my $MAX_SPACE=$maxd;
39 my $FLANK=$flank;
40
41 ######### load in genome sequences start ########
42 my %genome;
43 my %lng;
44 my $name;
45 open IN,"<$genome";
46 while (my $aline=<IN>) {
47 chomp $aline;
48 next if($aline=~/^\#/);
49 if ($aline=~/^>(\S+)/) {
50 $name=$1;
51 next;
52 }
53 $genome{$name} .=$aline;
54 }
55 close IN;
56 foreach my $key (keys %genome) {
57 $lng{$key}=length($genome{$key});
58 }
59 ####### load in genome sequences end ##########
60
61 my %breaks; ### reads number bigger than 3
62 open IN,"<$filein"; #input file
63 while (my $aline=<IN>) {
64 chomp $aline;
65 my @tmp=split/\t/,$aline;
66 $tmp[0]=~/_x(\d+)$/;
67 my $no=$1;
68 next if($no<3);
69 #my $trand=&find_strand($tmp[9]);
70 #my @pos=split/\.\./,$tmp[5];
71 my $end=$tmp[3]+length($tmp[4])-1;
72 if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);}
73 push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base
74 }
75 close IN;
76
77 my %cites; ### peaks
78 foreach my $chr (keys %breaks) {
79 foreach my $strand (keys %{$breaks{$chr}}) {
80 my @array=@{$breaks{$chr}{$strand}};
81 @array=sort{$a->[0]<=>$b->[0]} @array;
82 for (my $i=0;$i<@array;$i++) {
83 my $start=$array[$i][0];my $end=$array[$i][1];
84 my @subarray=();
85 push @subarray,$array[$i];
86
87 for (my $j=$i+1;$j<@array;$j++) {
88 if ($start<$array[$j][1] && $end>$array[$j][0]) { ###overlap
89 push @subarray,$array[$j];
90 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);
91 }
92 else{
93 $i=$j-1;
94 &find_cites(\@subarray,$chr,$strand);
95 last;
96 }
97 }
98 }
99 }
100 }
101
102 my %cluster;
103 foreach my $chr (keys %cites) {
104 foreach my $strand (keys %{$cites{$chr}}) {
105 my @array=@{$sites{$chr}{$strand}};
106 @array=sort{$a->[0]<=>$b->[0]} @array;
107 for (my $i=0;$i<@array;$i++) {
108 my $start=$array[$i][0];my $end=$array[$i][1];
109 my @subarray=();
110 push @subarray,$array[$i];
111
112 for (my $j=$i+1;$j<@array;$j++) {
113 if ($end>$array[$j][0]-$maxd) { ###distance less than 200bp
114 push @subarray,$array[$j];
115 ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]);
116 }
117 else{
118 @{$cluster{$chr}{$strand}{$i}}=@subarray;
119 $i=$j-1;
120 last;
121 }
122 }
123 }
124
125 }
126 }
127
128
129 open FA,">$faout"; #output file
130 open STR,">$strout";
131 foreach my $chr (keys %cluster) {
132 foreach my $strand (keys %{$cluster{$chr}}) {
133 foreach my $no (keys %{$cluster{$chr}{$strand}}) {
134 my @array2=@{$cluster{$chr}{$strand}{$no}};
135 @array2=sort{$a->[0]<=>$b->[0]} @array2;
136 &excise(\@array2,$chr,$strand);
137 }
138 }
139 }
140 close FA;
141 close STR;
142 sub oneCiteDn{
143 my ($array,$a,$chr,$strand)=@_;
144
145 my $ss=$$array[$a][0]-$flank;
146 $ss=0 if($ss<0);
147 my $ee=$$array[$a][1]+$maxd+$flank;
148 $ee=$lng{$chr} if($ee>$lng{$chr});
149
150 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
151 if($strand eq "-"){$seq=revcom($seq);}
152
153 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
154 return $val;
155 }
156 sub oneCiteUp{
157 my ($array,$a,$chr,$strand)=@_;
158
159 my $ss=$$array[$a][0]-$maxd-$flank;
160 $ss=0 if($ss<0);
161 my $ee=$$array[$a][1]+$flank;
162 $ee=$lng{$chr} if($ee>$lng{$chr});
163
164 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
165 if($strand eq "-"){$seq=revcom($seq);}
166
167 my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee);
168 return $val;
169
170 }
171
172 sub twoCites{
173 my ($array,$a,$b,$chr,$strand)=@_;
174
175 my $ss=$$array[$a][0]-$flank;
176 $ss=0 if($ss<0);
177 my $ee=$$array[$b][1]+$flank;
178 $ee=$lng{$chr} if($ee>$lng{$chr});
179
180 my $seq=substr($genome{$chr},$ss,$ee-$ss+1);
181 if($strand eq "-"){$seq=revcom($seq);}
182
183 # my( $str,$mfe)=RNA::fold($seq);
184 # return 0 if($mfe>$MAX_ENERGY); ### minimum mfe
185 my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee);
186
187 return $val;
188
189 }
190 sub excise{
191 my ($cluster,$chr,$strand)=@_;
192
193 if(@{$cluster}==1){
194 $ok=&oneCiteDn($cluster,0,$chr,$strand);
195 $ok=&oneCiteUp($cluster,0,$chr,$strand);
196 }else{
197 my $peak_pos=0;
198
199 for (my $i=0;$i<@{$cluster};$i++) {
200 if($$cluster[$i][2]>$$cluster[$peak_pos][2]){$peak_pos=$i;}
201 }
202
203 my $ok=0;
204 for (my $i=0;$i<@{$cluster};$i++) {
205 next if($i==$peak_pos);
206 if($i<$peak_pos){$ok=&twoCites($cluster,$i,$peak_pos,$chr,$strand);}
207 else{$ok=&twoCites($cluster,$peak_pos,$i,$chr,$strand);}
208 last if($ok);
209 }
210 if (!$ok) {
211 $ok=&oneCiteDn($cluster,$peak_pos,$chr,$strand);
212 $ok=&oneCiteUp($cluster,$peak_pos,$chr,$strand);
213 }
214
215 }
216 }
217
218 sub ffw2{
219 my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_;
220
221 my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns
222 if ($N_count > 5) {
223 return 0;
224 }
225
226 my $seq_length=length $seq;
227 # position tag1 and tag2
228 my $tag1_beg=index($seq,$tag1,0)+1;
229 if ($tag1_beg < 1) {
230 warn "[ffw2] coordinate error.\n";
231 # $fold->{reason}="coordinate error";
232 return 0;
233 }
234 my $tag2_beg=index($seq,$tag2,0)+1;
235 if ($tag2_beg < 1) {
236 warn "[ffw2] coordinate error.\n";
237 # $fold->{reason}="coordinate error";
238 return 0;
239 }
240 if ($tag2_beg < $tag1_beg) {
241 # swap tag1 and tag2
242 ($tag1,$tag2)=($tag2,$tag1);
243 ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg);
244 }
245 my $tag1_end=$tag1_beg+length($tag1)-1;
246 my $tag2_end=$tag2_beg+length($tag2)-1;
247 # re-clipping
248 my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1;
249 my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length;
250 $seq=substr($seq,$beg-1,$end-$beg+1);
251 $seq_length=length $seq;
252 # re-reposition
253 $tag1_beg=index($seq,$tag1,0)+1;
254 if ($tag1_beg < 1) {
255 warn "[ffw2] coordinate error.\n";
256 # $fold->{reason}="coordinate error";
257 return 0;
258 }
259
260 $tag2_beg=index($seq,$tag2,0)+1;
261 if ($tag2_beg < 1) {
262 warn "[ffw2] coordinate error.\n";
263 # $fold->{reason}="coordinate error";
264 return 0;
265 }
266 $tag1_end=$tag1_beg+length($tag1)-1;
267 $tag2_end=$tag2_beg+length($tag2)-1;
268
269 # fold
270 #my ($struct,$mfe)=RNA::fold($seq);
271 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
272 my @rawfolds=split/\s+/,$rnafold;
273 my $struct=$rawfolds[1];
274 my $mfe=$rawfolds[-1];
275 $mfe=~s/\(//;
276 $mfe=~s/\)//;
277 #$mfe=sprintf "%.2f", $mfe;
278 if ($mfe > $MAX_ENERGY) {return 0;}
279
280 # tag1
281 my $tag1_length=$tag1_end-$tag1_beg+1;
282 my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length);
283 my $tag1_arm=which_arm($tag1_struct);
284 my $tag1_unpair=$tag1_struct=~tr/.//;
285 my $tag1_pair=$tag1_length-$tag1_unpair;
286 my $tag1_max_bulge=biggest_bulge($tag1_struct);
287 if ($tag1_arm ne "5p") { return 0;} # tag not in stem
288 # if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass}
289 if ($tag1_pair < $MIN_PAIR) {return 0;}
290 if ($tag1_max_bulge > $MAX_BULGE) {return 0;}
291
292 # tag2
293 my $tag2_length=$tag2_end-$tag2_beg+1;
294 my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length);
295 my $tag2_arm=which_arm($tag2_struct);
296 my $tag2_unpair=$tag2_struct=~tr/.//;
297 my $tag2_pair=$tag2_length-$tag2_unpair;
298 my $tag2_max_bulge=biggest_bulge($tag2_struct);
299 if ($tag2_arm ne "3p") {return 0;} # star not in stem
300 # if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass}
301 if ($tag2_pair < $MIN_PAIR) {return 0;}
302 if ($tag2_max_bulge > $MAX_BULGE) {return 0;}
303
304 # space size between miR and miR*
305 my $space=$tag2_beg-$tag1_end-1;
306 if ($space < $MIN_SPACE) {return 0;}
307 if ($space > $MAX_SPACE) {return 0;}
308
309 # size diff of miR and miR*
310 my $size_diff=abs($tag1_length-$tag2_length);
311 if ($size_diff > $MAX_SIZEDIFF) {return 0;}
312
313 # build base pairing table
314 my %pairtable;
315 &parse_struct($struct,\%pairtable); # coords count from 1
316
317 my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end);
318 my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end);
319 my $asy=($asy1 < $asy2) ? $asy1 : $asy2;
320 if ($asy > $ASYMMETRY) {return 0}
321
322 # duplex fold, determine whether two matures like a miR/miR* ike duplex
323 my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2);
324 # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context
325 my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end);
326 if ($like_mir_duplex1==0 && $like_mir_duplex2==0) {
327 return 0;
328 }
329
330 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
331 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
332
333 return 1;
334 }
335
336 sub ffw1{
337 my ($seq,$tag,$chr,$strand,$ss,$ee)=@_;
338 my $pass=0;
339
340 my $N_count=$seq=~tr/N//;
341 if ($N_count > 5) {
342 return 0;
343 }
344
345 my $seq_length=length $seq;
346 my $tag_length=length $tag;
347
348 # position
349 my $tag_beg=index($seq,$tag,0)+1;
350 if ($tag_beg < 1) {
351 warn "[ffw1] coordinate error.\n";
352 return $pass;
353 }
354 my $tag_end=$tag_beg+length($tag)-1;
355
356
357 # define candidate precursor by hybrid short arm to long arm, not solid enough
358 my($beg,$end)=define_precursor($seq,$tag);
359 if (not defined $beg) {
360 return $pass;
361 }
362 if (not defined $end) {
363 return $pass;
364 }
365 $seq=substr($seq,$beg-1,$end-$beg+1);
366 $seq_length=length $seq;
367
368
369 # fold
370 #my ($struct,$mfe)=RNA::fold($seq);
371 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
372 my @rawfolds=split/\s+/,$rnafold;
373 my $struct=$rawfolds[1];
374 my $mfe=$rawfolds[-1];
375 $mfe=~s/\(//;
376 $mfe=~s/\)//;
377
378 if ($mfe > $MAX_ENERGY) {
379 $pass=0;
380 return $pass;
381 }
382
383 # reposition
384 $tag_beg=index($seq,$tag,0)+1;
385 if ($tag_beg < 1) {
386 warn "[ffw1] coordinate error.\n";
387 return 0;
388 }
389 $tag_end=$tag_beg+length($tag)-1;
390
391 my $tag_struct=substr($struct,$tag_beg-1,$tag_length);
392 my $tag_arm=which_arm($tag_struct);
393 my $tag_unpair=$tag_struct=~tr/.//;
394 my $tag_pair=$tag_length-$tag_unpair;
395 my $tag_max_bulge=biggest_bulge($tag_struct);
396 if ($tag_arm eq "-") { return $pass;}
397 # if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass}
398 if ($tag_pair < $MIN_PAIR) { return $pass;}
399 if ($tag_max_bulge > $MAX_BULGE) {return $pass;}
400
401 # build base pairing table
402 my %pairtable;
403 &parse_struct($struct,\%pairtable); # coords count from 1
404
405 # get star
406 my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end);
407 my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1);
408 my $star_length=$star_end-$star_beg+1;
409 my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1);
410 my $star_arm=which_arm($star_struct);
411 my $star_unpair=$star_struct=~tr/.//;
412 my $star_pair=$star_length-$star_unpair;
413 my $star_max_bulge=biggest_bulge($star_struct);
414 if ($star_arm eq "-") { return $pass;}
415 # if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass}
416 if ($star_pair < $MIN_PAIR) {return $pass;}
417 if ($star_max_bulge > $MAX_BULGE) {return $pass;}
418
419 if ($tag_arm eq $star_arm) {return $pass;}
420
421 # space size between miR and miR*
422 my $space;
423 if ($tag_beg < $star_beg) {
424 $space=$star_beg-$tag_end-1;
425 }
426 else {
427 $space=$tag_beg-$star_end-1;
428 }
429 if ($space < $MIN_SPACE) { return $pass;}
430 if ($space > $MAX_SPACE) { return $pass;}
431
432 # size diff
433 my $size_diff=abs($tag_length-$star_length);
434 if ($size_diff > $MAX_SIZEDIFF) { return $pass;}
435
436 # asymmetry
437 my $asy=get_asy(\%pairtable,$tag_beg,$tag_end);
438 if ($asy > $ASYMMETRY) {return $pass;}
439
440 $pass=1;
441 print FA ">$chr:$strand:$ss..$ee\n$seq\n";
442 print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n";
443 return $pass;
444
445 }
446 sub get_star {
447 my($table,$beg,$end)=@_;
448
449 my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2
450 foreach my $i ($beg..$end) {
451 if (defined $table->{$i}) {
452 my $j=$table->{$i};
453 $s1=$i;
454 $s2=$j;
455 last;
456 }
457 }
458 foreach my $i (reverse ($beg..$end)) {
459 if (defined $table->{$i}) {
460 my $j=$table->{$i};
461 $e1=$i;
462 $e2=$j;
463 last;
464 }
465 }
466 # print "$s1,$e1 $s2,$e2\n";
467
468 # correct terminus
469 my $off1=$s1-$beg;
470 my $off2=$end-$e1;
471 $s2+=$off1;
472 $s2+=2; # 081009
473 $e2-=$off2; $e2=1 if $e2 < 1;
474 $e2+=2; $e2=1 if $e2 < 1; # 081009
475 ($s2,$e2)=($e2,$s2) if ($s2 > $e2);
476 return ($s2,$e2);
477 }
478
479 sub define_precursor {
480 my $seq=shift;
481 my $tag=shift;
482
483 my $seq_length=length $seq;
484 my $tag_length=length $tag;
485 my $tag_beg=index($seq,$tag,0)+1;
486 my $tag_end=$tag_beg+$tag_length-1;
487
488 # split the candidate region into short arm and long arm
489 my $tag_arm;
490 my ($larm,$larm_beg,$larm_end);
491 my ($sarm,$sarm_beg,$sarm_end);
492 if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm
493 $sarm=substr($seq,0,$tag_end);
494 $larm=substr($seq,$tag_end);
495 $sarm_beg=1;
496 $sarm_end=$tag_end;
497 $larm_beg=$tag_end+1;
498 $larm_end=$seq_length;
499 $tag_arm="5p";
500 }
501 else {
502 $larm=substr($seq,0,$tag_beg-1); # on 3' arm
503 $sarm=substr($seq,$tag_beg-1);
504 $larm_beg=1;
505 $larm_end=$tag_beg-1;
506 $sarm_beg=$tag_beg;
507 $sarm_end=$seq_length;
508 $tag_arm="3p";
509 }
510
511 # print "$sarm_beg,$sarm_end $sarm\n";
512 # print "$larm_beg,$larm_end $larm\n";
513
514 # clipping short arm
515 if ($tag_arm eq "5p") {
516 $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1;
517 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
518 }
519 else {
520 $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length;
521 $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1);
522 }
523 # print "$sarm_beg,$sarm_end $sarm\n";
524 # print "$larm_beg,$larm_end $larm\n";
525
526 # define the precursor by hybriding short arm to long arm
527 =cut #modify in 2014-10-28
528 my $duplex=RNA::duplexfold($sarm,$larm);
529 my $struct=$duplex->{structure};
530 my $energy=sprintf "%.2f", $duplex->{energy};
531 my ($str1,$str2)=split(/&/,$struct);
532 my $pair=$str1=~tr/(//;
533 # print "pair=$pair\n";
534 my $beg1=$duplex->{i}+1-length($str1);
535 my $end1=$duplex->{i};
536 my $beg2=$duplex->{j};
537 my $end2=$duplex->{j}+length($str2)-1;
538 =cut
539 ###### new codes begin
540 my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`;
541 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20)
542 my @tmpduplex=split/\s+/,$duplex;
543 my $struct=$tmpduplex[0];
544 $tmpduplex[-1]=~s/[(|)]//g;
545 my $energy=$tmpduplex[-1];
546 my ($str1,$str2)=split(/&/,$struct);
547 my $pair=$str1=~tr/(//;
548 my ($beg1,$end1)=split/,/,$tmpduplex[1];
549 my ($beg2,$end2)=split/,/,$tmpduplex[3];
550 ######## new codes end
551
552 # print "$beg1:$end1 $beg2:$end2\n";
553 # transform coordinates
554 $beg1=$beg1+$sarm_beg-1;
555 $end1=$end1+$sarm_beg-1;
556 $beg2=$beg2+$larm_beg-1;
557 $end2=$end2+$larm_beg-1;
558 # print "$beg1:$end1 $beg2:$end2\n";
559
560 my $off5p=$beg1-$sarm_beg;
561 my $off3p=$sarm_end-$end1;
562 $beg2-=$off3p; $beg2=1 if $beg2 < 1;
563 $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length;
564
565 # print "$beg1:$end1 $beg2:$end2\n";
566
567 my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2;
568 my $end=$sarm_end > $end2 ? $sarm_end : $end2;
569
570 return if $pair < $MIN_PAIR;
571 # print "$beg,$end\n";
572 return ($beg,$end);
573 }
574
575
576 # duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex
577 sub likeMirDuplex1 {
578 my $seq1=shift;
579 my $seq2=shift;
580 my $like_mir_duplex=1;
581
582 my $length1=length $seq1;
583 my $length2=length $seq2;
584 =cut
585 my $duplex=RNA::duplexfold($seq1, $seq2);
586 my $duplex_struct=$duplex->{structure};
587 my $duplex_energy=sprintf "%.2f", $duplex->{energy};
588 my ($str1,$str2)=split(/&/,$duplex_struct);
589 my $beg1=$duplex->{i}+1-length($str1);
590 my $end1=$duplex->{i};
591 my $beg2=$duplex->{j};
592 my $end2=$duplex->{j}+length($str2)-1;
593 =cut
594 my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`;
595 #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20)
596 my @tmpduplex=split/\s+/,$duplex;
597 my $duplex_struct=$tmpduplex[0];
598 $tmpduplex[-1]=~s/[(|)]//g;
599 my $duplex_energy=$tmpduplex[-1];
600 my ($str1,$str2)=split(/&/,$duplex_struct);
601 #my $pair=$str1=~tr/(//;
602 my ($beg1,$end1)=split/,/,$tmpduplex[1];
603 my ($beg2,$end2)=split/,/,$tmpduplex[3];
604
605 # revise beg1, end1, beg2, end2
606 $str1=~/^(\.*)/;
607 $beg1+=length($1);
608 $str1=~/(\.*)$/;
609 $end1-=length($1);
610 $str2=~/^(\.*)/;
611 $beg2+=length($1);
612 $str2=~/(\.*)$/;
613 $end2-=length($1);
614
615 my $pair_num=$str1=~tr/(//;
616 my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom
617 my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck
618 # print $pair_num,"\n";
619 # print $overhang1,"\n";
620 # print $overhang2,"\n";
621 if ($pair_num < 13) {
622 $like_mir_duplex=0;
623 }
624 if ($overhang1 < 0 || $overhang2 < 0 ) {
625 $like_mir_duplex=0;
626 }
627 if ($overhang1 > 4 || $overhang2 > 4) {
628 $like_mir_duplex=0;
629 }
630 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
631 }
632
633 # judge whether two matures form miR/miR* duplex, in hairpin context
634 sub likeMirDuplex2 {
635 my ($table,$beg1,$end1,$beg2,$end2)=@_;
636 my $like_mir_duplex=1;
637
638 # s1 e1
639 # 5 ----------------------------3
640 # | | |||| ||| |
641 #3 -------------------------------5
642 # e2 s2
643
644 my $pair_num=0;
645 my $overhang1=0;
646 my $overhang2=0;
647 my ($s1,$e1,$s2,$e2);
648 foreach my $i ($beg1..$end1) {
649 if (defined $table->{$i}) {
650 my $j=$table->{$i};
651 if ($j <= $end2 && $j >= $beg2) {
652 $s1=$i;
653 $e2=$j;
654 last;
655 }
656 }
657 }
658 foreach my $i (reverse ($beg1..$end1)) {
659 if (defined $table->{$i}) {
660 my $j=$table->{$i};
661 if ($j <= $end2 && $j >= $beg2) {
662 $e1=$i;
663 $s2=$j;
664 last;
665 }
666 }
667 }
668
669 # print "$beg1,$end1 $s1,$e1\n";
670 # print "$beg2,$end2 $s2,$e2\n";
671
672 foreach my $i ($beg1..$end1) {
673 if (defined $table->{$i}) {
674 my $j=$table->{$i};
675 if ($j <= $end2 && $j >= $beg2) {
676 ++$pair_num;
677 }
678 }
679 }
680 if (defined $s1 && defined $e2) {
681 $overhang1=($end2-$e2)-($s1-$beg1);
682 }
683 if (defined $e1 && defined $s2) {
684 $overhang2=($end1-$e1)-($s2-$beg2);
685 }
686
687 if ($pair_num < 13) {
688 $like_mir_duplex=0;
689 }
690 if ($overhang1 < 0 && $overhang2 < 0) {
691 $like_mir_duplex=0;
692 }
693 return ($like_mir_duplex,$pair_num,$overhang1,$overhang2);
694 }
695 sub parse_struct {
696 my $struct=shift;
697 my $table=shift;
698
699 my @t=split('',$struct);
700 my @lbs; # left brackets
701 foreach my $k (0..$#t) {
702 if ($t[$k] eq "(") {
703 push @lbs, $k+1;
704 }
705 elsif ($t[$k] eq ")") {
706 my $lb=pop @lbs;
707 my $rb=$k+1;
708 $table->{$lb}=$rb;
709 $table->{$rb}=$lb;
710 }
711 }
712 if (@lbs) {
713 warn "unbalanced RNA struct.\n";
714 }
715 }
716 sub which_arm {
717 my $substruct=shift;
718 my $arm;
719 if ($substruct=~/\(/ && $substruct=~/\)/) {
720 $arm="-";
721 }
722 elsif ($substruct=~/\(/) {
723 $arm="5p";
724 }
725 else {
726 $arm="3p";
727 }
728 return $arm;
729 }
730 sub biggest_bulge {
731 my $struct=shift;
732 my $bulge_size=0;
733 my $max_bulge=0;
734 while ($struct=~/(\.+)/g) {
735 $bulge_size=length $1;
736 if ($bulge_size > $max_bulge) {
737 $max_bulge=$bulge_size;
738 }
739 }
740 return $max_bulge;
741 }
742 sub get_asy {
743 my($table,$a1,$a2)=@_;
744 my ($pre_i,$pre_j);
745 my $asymmetry=0;
746 foreach my $i ($a1..$a2) {
747 if (defined $table->{$i}) {
748 my $j=$table->{$i};
749 if (defined $pre_i && defined $pre_j) {
750 my $diff=($i-$pre_i)+($j-$pre_j);
751 $asymmetry += abs($diff);
752 }
753 $pre_i=$i;
754 $pre_j=$j;
755 }
756 }
757 return $asymmetry;
758 }
759
760 sub peaks{
761 my @cluster=@{$_[0]};
762
763 return if(@cluster<1);
764
765 my $max=0; my $index=-1;
766 for (my $i=0;$i<@cluster;$i++) {
767 if($cluster[$i][2]>$max){
768 $max=$cluster[$i][2];
769 $index=$i;
770 }
771 }
772 # &excise(\@cluster,$index,$_[1],$_[2]);
773 return($index);
774 }
775
776 sub find_cites{
777 my @tmp=@{$_[0]};
778 my $i=&peaks(\@tmp);
779
780 my $start=$tmp[$i][0];
781 my $total=0; my $node5=0;
782 for (my $j=0;$j<@tmp ;$j++) {
783 $total+=$tmp[$j][2];
784 $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2);
785 }
786 push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5);
787 }
788
789 sub newpos{
790 my ($a,$b,$c,$d)=@_;
791 my $s= $a>$c ? $c : $a;
792 my $e=$b>$d ? $b : $d;
793 return($s,$e);
794 }
795
796 sub rev{
797
798 my($sequence)=@_;
799
800 my $rev=reverse $sequence;
801
802 return $rev;
803 }
804
805 sub com{
806
807 my($sequence)=@_;
808
809 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
810
811 return $sequence;
812 }
813
814 sub revcom{
815
816 my($sequence)=@_;
817
818 my $revcom=rev(com($sequence));
819
820 return $revcom;
821 }
822
823 sub find_strand{
824
825 #A subroutine to find the strand, parsing different blast formats
826 my($other)=@_;
827
828 my $strand="+";
829
830 if($other=~/-/){
831 $strand="-";
832 }
833
834 if($other=~/minus/i){
835 $strand="-";
836 }
837
838 return($strand);
839 }
840 sub usage{
841 print <<"USAGE";
842 Version $version
843 Usage:
844 $0 -map -g -d -f -o -s -e
845 options:
846 -map input file# align result # bst. format
847 -g input file # genome sequence fasta format
848 -d <int> Maximal space between miRNA and miRNA* (200)
849 -f <int> Flank sequence length of miRNA precursor (10)
850 -o output file# percursor fasta file
851 -s output file# precursor structure file
852 -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol)
853
854 -h help
855 USAGE
856 exit(1);
857 }
858