Mercurial > repos > bigrna > gpsrna
diff precursors.pl @ 0:87fe81de0931 draft default tip
Uploaded
author | bigrna |
---|---|
date | Sun, 04 Jan 2015 02:47:25 -0500 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/precursors.pl Sun Jan 04 02:47:25 2015 -0500 @@ -0,0 +1,858 @@ +#!/usr/bin/perl -w +#Filename: +#Author: Tian Dongmei +#Email: tiandm@big.ac.cn +#Date: 2013/7/19 +#Modified: +#Description: +my $version=1.00; + +use strict; +use Getopt::Long; +#use RNA; + +my %opts; +GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); +if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments +&usage; +} + +my $checkno=1; +my $filein=$opts{'map'}; +my $faout=$opts{'o'}; +my $strout=$opts{'s'}; +my $genome= $opts{'g'}; + +my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; +my $flank=defined $opts{'f'}? $opts{'f'} : 10; + +my $MAX_ENERGY=-18; +if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} +my $MAX_UNPAIR=5; +my $MIN_PAIR=15; +my $MAX_SIZEDIFF=4; +my $MAX_BULGE=2; +my $ASYMMETRY=5; +my $MIN_UNPAIR=0; +my $MIN_SPACE=5; +my $MAX_SPACE=$maxd; +my $FLANK=$flank; + +######### load in genome sequences start ######## +my %genome; +my %lng; +my $name; +open IN,"<$genome"; +while (my $aline=<IN>) { + chomp $aline; + next if($aline=~/^\#/); + if ($aline=~/^>(\S+)/) { + $name=$1; + next; + } + $genome{$name} .=$aline; +} +close IN; +foreach my $key (keys %genome) { + $lng{$key}=length($genome{$key}); +} +####### load in genome sequences end ########## + +my %breaks; ### reads number bigger than 3 +open IN,"<$filein"; #input file +while (my $aline=<IN>) { + chomp $aline; + my @tmp=split/\t/,$aline; + $tmp[0]=~/_x(\d+)$/; + my $no=$1; + next if($no<3); + #my $trand=&find_strand($tmp[9]); + #my @pos=split/\.\./,$tmp[5]; + my $end=$tmp[3]+length($tmp[4])-1; + if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} + push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base +} +close IN; + +my %cites; ### peaks +foreach my $chr (keys %breaks) { + foreach my $strand (keys %{$breaks{$chr}}) { + my @array=@{$breaks{$chr}{$strand}}; + @array=sort{$a->[0]<=>$b->[0]} @array; + for (my $i=0;$i<@array;$i++) { + my $start=$array[$i][0];my $end=$array[$i][1]; + my @subarray=(); + push @subarray,$array[$i]; + + for (my $j=$i+1;$j<@array;$j++) { + if ($start<$array[$j][1] && $end>$array[$j][0]) { ###overlap + push @subarray,$array[$j]; + ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); + } + else{ + $i=$j-1; + &find_cites(\@subarray,$chr,$strand); + last; + } + } + } + } +} + +my %cluster; +foreach my $chr (keys %cites) { + foreach my $strand (keys %{$cites{$chr}}) { + my @array=@{$sites{$chr}{$strand}}; + @array=sort{$a->[0]<=>$b->[0]} @array; + for (my $i=0;$i<@array;$i++) { + my $start=$array[$i][0];my $end=$array[$i][1]; + my @subarray=(); + push @subarray,$array[$i]; + + for (my $j=$i+1;$j<@array;$j++) { + if ($end>$array[$j][0]-$maxd) { ###distance less than 200bp + push @subarray,$array[$j]; + ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); + } + else{ + @{$cluster{$chr}{$strand}{$i}}=@subarray; + $i=$j-1; + last; + } + } + } + + } +} + + +open FA,">$faout"; #output file +open STR,">$strout"; +foreach my $chr (keys %cluster) { + foreach my $strand (keys %{$cluster{$chr}}) { + foreach my $no (keys %{$cluster{$chr}{$strand}}) { + my @array2=@{$cluster{$chr}{$strand}{$no}}; + @array2=sort{$a->[0]<=>$b->[0]} @array2; + &excise(\@array2,$chr,$strand); + } + } +} +close FA; +close STR; +sub oneCiteDn{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$maxd+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; +} +sub oneCiteUp{ + my ($array,$a,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$maxd-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$a][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + + my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); + return $val; + +} + +sub twoCites{ + my ($array,$a,$b,$chr,$strand)=@_; + + my $ss=$$array[$a][0]-$flank; + $ss=0 if($ss<0); + my $ee=$$array[$b][1]+$flank; + $ee=$lng{$chr} if($ee>$lng{$chr}); + + my $seq=substr($genome{$chr},$ss,$ee-$ss+1); + if($strand eq "-"){$seq=revcom($seq);} + +# my( $str,$mfe)=RNA::fold($seq); +# return 0 if($mfe>$MAX_ENERGY); ### minimum mfe + my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); + + return $val; + +} +sub excise{ + my ($cluster,$chr,$strand)=@_; + + if(@{$cluster}==1){ + $ok=&oneCiteDn($cluster,0,$chr,$strand); + $ok=&oneCiteUp($cluster,0,$chr,$strand); + }else{ + my $peak_pos=0; + + for (my $i=0;$i<@{$cluster};$i++) { + if($$cluster[$i][2]>$$cluster[$peak_pos][2]){$peak_pos=$i;} + } + + my $ok=0; + for (my $i=0;$i<@{$cluster};$i++) { + next if($i==$peak_pos); + if($i<$peak_pos){$ok=&twoCites($cluster,$i,$peak_pos,$chr,$strand);} + else{$ok=&twoCites($cluster,$peak_pos,$i,$chr,$strand);} + last if($ok); + } + if (!$ok) { + $ok=&oneCiteDn($cluster,$peak_pos,$chr,$strand); + $ok=&oneCiteUp($cluster,$peak_pos,$chr,$strand); + } + + } +} + +sub ffw2{ + my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; + + my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + # position tag1 and tag2 + my $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + my $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + if ($tag2_beg < $tag1_beg) { + # swap tag1 and tag2 + ($tag1,$tag2)=($tag2,$tag1); + ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); + } + my $tag1_end=$tag1_beg+length($tag1)-1; + my $tag2_end=$tag2_beg+length($tag2)-1; + # re-clipping + my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; + my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + # re-reposition + $tag1_beg=index($seq,$tag1,0)+1; + if ($tag1_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + + $tag2_beg=index($seq,$tag2,0)+1; + if ($tag2_beg < 1) { + warn "[ffw2] coordinate error.\n"; +# $fold->{reason}="coordinate error"; + return 0; + } + $tag1_end=$tag1_beg+length($tag1)-1; + $tag2_end=$tag2_beg+length($tag2)-1; + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + #$mfe=sprintf "%.2f", $mfe; + if ($mfe > $MAX_ENERGY) {return 0;} + + # tag1 + my $tag1_length=$tag1_end-$tag1_beg+1; + my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); + my $tag1_arm=which_arm($tag1_struct); + my $tag1_unpair=$tag1_struct=~tr/.//; + my $tag1_pair=$tag1_length-$tag1_unpair; + my $tag1_max_bulge=biggest_bulge($tag1_struct); + if ($tag1_arm ne "5p") { return 0;} # tag not in stem +# if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag1_pair < $MIN_PAIR) {return 0;} + if ($tag1_max_bulge > $MAX_BULGE) {return 0;} + + # tag2 + my $tag2_length=$tag2_end-$tag2_beg+1; + my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); + my $tag2_arm=which_arm($tag2_struct); + my $tag2_unpair=$tag2_struct=~tr/.//; + my $tag2_pair=$tag2_length-$tag2_unpair; + my $tag2_max_bulge=biggest_bulge($tag2_struct); + if ($tag2_arm ne "3p") {return 0;} # star not in stem +# if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag2_pair < $MIN_PAIR) {return 0;} + if ($tag2_max_bulge > $MAX_BULGE) {return 0;} + + # space size between miR and miR* + my $space=$tag2_beg-$tag1_end-1; + if ($space < $MIN_SPACE) {return 0;} + if ($space > $MAX_SPACE) {return 0;} + + # size diff of miR and miR* + my $size_diff=abs($tag1_length-$tag2_length); + if ($size_diff > $MAX_SIZEDIFF) {return 0;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); + my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); + my $asy=($asy1 < $asy2) ? $asy1 : $asy2; + if ($asy > $ASYMMETRY) {return 0} + + # duplex fold, determine whether two matures like a miR/miR* ike duplex + my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); + # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context + my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); + if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { + return 0; + } + + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + + return 1; +} + +sub ffw1{ + my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; + my $pass=0; + + my $N_count=$seq=~tr/N//; + if ($N_count > 5) { + return 0; + } + + my $seq_length=length $seq; + my $tag_length=length $tag; + + # position + my $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return $pass; + } + my $tag_end=$tag_beg+length($tag)-1; + + + # define candidate precursor by hybrid short arm to long arm, not solid enough + my($beg,$end)=define_precursor($seq,$tag); + if (not defined $beg) { + return $pass; + } + if (not defined $end) { + return $pass; + } + $seq=substr($seq,$beg-1,$end-$beg+1); + $seq_length=length $seq; + + + # fold + #my ($struct,$mfe)=RNA::fold($seq); + my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; + my @rawfolds=split/\s+/,$rnafold; + my $struct=$rawfolds[1]; + my $mfe=$rawfolds[-1]; + $mfe=~s/\(//; + $mfe=~s/\)//; + + if ($mfe > $MAX_ENERGY) { + $pass=0; + return $pass; + } + + # reposition + $tag_beg=index($seq,$tag,0)+1; + if ($tag_beg < 1) { + warn "[ffw1] coordinate error.\n"; + return 0; + } + $tag_end=$tag_beg+length($tag)-1; + + my $tag_struct=substr($struct,$tag_beg-1,$tag_length); + my $tag_arm=which_arm($tag_struct); + my $tag_unpair=$tag_struct=~tr/.//; + my $tag_pair=$tag_length-$tag_unpair; + my $tag_max_bulge=biggest_bulge($tag_struct); + if ($tag_arm eq "-") { return $pass;} +# if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} + if ($tag_pair < $MIN_PAIR) { return $pass;} + if ($tag_max_bulge > $MAX_BULGE) {return $pass;} + + # build base pairing table + my %pairtable; + &parse_struct($struct,\%pairtable); # coords count from 1 + + # get star + my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); + my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); + my $star_length=$star_end-$star_beg+1; + my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); + my $star_arm=which_arm($star_struct); + my $star_unpair=$star_struct=~tr/.//; + my $star_pair=$star_length-$star_unpair; + my $star_max_bulge=biggest_bulge($star_struct); + if ($star_arm eq "-") { return $pass;} +# if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} + if ($star_pair < $MIN_PAIR) {return $pass;} + if ($star_max_bulge > $MAX_BULGE) {return $pass;} + + if ($tag_arm eq $star_arm) {return $pass;} + + # space size between miR and miR* + my $space; + if ($tag_beg < $star_beg) { + $space=$star_beg-$tag_end-1; + } + else { + $space=$tag_beg-$star_end-1; + } + if ($space < $MIN_SPACE) { return $pass;} + if ($space > $MAX_SPACE) { return $pass;} + + # size diff + my $size_diff=abs($tag_length-$star_length); + if ($size_diff > $MAX_SIZEDIFF) { return $pass;} + + # asymmetry + my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); + if ($asy > $ASYMMETRY) {return $pass;} + + $pass=1; + print FA ">$chr:$strand:$ss..$ee\n$seq\n"; + print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; + return $pass; + +} +sub get_star { + my($table,$beg,$end)=@_; + + my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 + foreach my $i ($beg..$end) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $s1=$i; + $s2=$j; + last; + } + } + foreach my $i (reverse ($beg..$end)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + $e1=$i; + $e2=$j; + last; + } + } +# print "$s1,$e1 $s2,$e2\n"; + + # correct terminus + my $off1=$s1-$beg; + my $off2=$end-$e1; + $s2+=$off1; + $s2+=2; # 081009 + $e2-=$off2; $e2=1 if $e2 < 1; + $e2+=2; $e2=1 if $e2 < 1; # 081009 + ($s2,$e2)=($e2,$s2) if ($s2 > $e2); + return ($s2,$e2); +} + +sub define_precursor { + my $seq=shift; + my $tag=shift; + + my $seq_length=length $seq; + my $tag_length=length $tag; + my $tag_beg=index($seq,$tag,0)+1; + my $tag_end=$tag_beg+$tag_length-1; + + # split the candidate region into short arm and long arm + my $tag_arm; + my ($larm,$larm_beg,$larm_end); + my ($sarm,$sarm_beg,$sarm_end); + if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm + $sarm=substr($seq,0,$tag_end); + $larm=substr($seq,$tag_end); + $sarm_beg=1; + $sarm_end=$tag_end; + $larm_beg=$tag_end+1; + $larm_end=$seq_length; + $tag_arm="5p"; + } + else { + $larm=substr($seq,0,$tag_beg-1); # on 3' arm + $sarm=substr($seq,$tag_beg-1); + $larm_beg=1; + $larm_end=$tag_beg-1; + $sarm_beg=$tag_beg; + $sarm_end=$seq_length; + $tag_arm="3p"; + } + +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # clipping short arm + if ($tag_arm eq "5p") { + $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } + else { + $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; + $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); + } +# print "$sarm_beg,$sarm_end $sarm\n"; +# print "$larm_beg,$larm_end $larm\n"; + + # define the precursor by hybriding short arm to long arm +=cut #modify in 2014-10-28 + my $duplex=RNA::duplexfold($sarm,$larm); + my $struct=$duplex->{structure}; + my $energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; +# print "pair=$pair\n"; + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut +###### new codes begin + my $duplex=`perl -e 'print "$sarm\n$larm"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$struct); + my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; +######## new codes end + +# print "$beg1:$end1 $beg2:$end2\n"; + # transform coordinates + $beg1=$beg1+$sarm_beg-1; + $end1=$end1+$sarm_beg-1; + $beg2=$beg2+$larm_beg-1; + $end2=$end2+$larm_beg-1; +# print "$beg1:$end1 $beg2:$end2\n"; + + my $off5p=$beg1-$sarm_beg; + my $off3p=$sarm_end-$end1; + $beg2-=$off3p; $beg2=1 if $beg2 < 1; + $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; + +# print "$beg1:$end1 $beg2:$end2\n"; + + my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; + my $end=$sarm_end > $end2 ? $sarm_end : $end2; + + return if $pair < $MIN_PAIR; +# print "$beg,$end\n"; + return ($beg,$end); +} + + +# duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex +sub likeMirDuplex1 { + my $seq1=shift; + my $seq2=shift; + my $like_mir_duplex=1; + + my $length1=length $seq1; + my $length2=length $seq2; +=cut + my $duplex=RNA::duplexfold($seq1, $seq2); + my $duplex_struct=$duplex->{structure}; + my $duplex_energy=sprintf "%.2f", $duplex->{energy}; + my ($str1,$str2)=split(/&/,$duplex_struct); + my $beg1=$duplex->{i}+1-length($str1); + my $end1=$duplex->{i}; + my $beg2=$duplex->{j}; + my $end2=$duplex->{j}+length($str2)-1; +=cut + my $duplex=`perl -e 'print "$seq1\n$seq2"' | RNAduplex`; + #(.(.(((.....(((.&))))))...).). 1,16 : 1,13 (-7.20) + my @tmpduplex=split/\s+/,$duplex; + my $duplex_struct=$tmpduplex[0]; + $tmpduplex[-1]=~s/[(|)]//g; + my $duplex_energy=$tmpduplex[-1]; + my ($str1,$str2)=split(/&/,$duplex_struct); + #my $pair=$str1=~tr/(//; + my ($beg1,$end1)=split/,/,$tmpduplex[1]; + my ($beg2,$end2)=split/,/,$tmpduplex[3]; + + # revise beg1, end1, beg2, end2 + $str1=~/^(\.*)/; + $beg1+=length($1); + $str1=~/(\.*)$/; + $end1-=length($1); + $str2=~/^(\.*)/; + $beg2+=length($1); + $str2=~/(\.*)$/; + $end2-=length($1); + + my $pair_num=$str1=~tr/(//; + my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom + my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck +# print $pair_num,"\n"; +# print $overhang1,"\n"; +# print $overhang2,"\n"; + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 || $overhang2 < 0 ) { + $like_mir_duplex=0; + } + if ($overhang1 > 4 || $overhang2 > 4) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} + +# judge whether two matures form miR/miR* duplex, in hairpin context +sub likeMirDuplex2 { + my ($table,$beg1,$end1,$beg2,$end2)=@_; + my $like_mir_duplex=1; + +# s1 e1 +# 5 ----------------------------3 +# | | |||| ||| | +#3 -------------------------------5 +# e2 s2 + + my $pair_num=0; + my $overhang1=0; + my $overhang2=0; + my ($s1,$e1,$s2,$e2); + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $s1=$i; + $e2=$j; + last; + } + } + } + foreach my $i (reverse ($beg1..$end1)) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + $e1=$i; + $s2=$j; + last; + } + } + } + +# print "$beg1,$end1 $s1,$e1\n"; +# print "$beg2,$end2 $s2,$e2\n"; + + foreach my $i ($beg1..$end1) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if ($j <= $end2 && $j >= $beg2) { + ++$pair_num; + } + } + } + if (defined $s1 && defined $e2) { + $overhang1=($end2-$e2)-($s1-$beg1); + } + if (defined $e1 && defined $s2) { + $overhang2=($end1-$e1)-($s2-$beg2); + } + + if ($pair_num < 13) { + $like_mir_duplex=0; + } + if ($overhang1 < 0 && $overhang2 < 0) { + $like_mir_duplex=0; + } + return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); +} +sub parse_struct { + my $struct=shift; + my $table=shift; + + my @t=split('',$struct); + my @lbs; # left brackets + foreach my $k (0..$#t) { + if ($t[$k] eq "(") { + push @lbs, $k+1; + } + elsif ($t[$k] eq ")") { + my $lb=pop @lbs; + my $rb=$k+1; + $table->{$lb}=$rb; + $table->{$rb}=$lb; + } + } + if (@lbs) { + warn "unbalanced RNA struct.\n"; + } +} +sub which_arm { + my $substruct=shift; + my $arm; + if ($substruct=~/\(/ && $substruct=~/\)/) { + $arm="-"; + } + elsif ($substruct=~/\(/) { + $arm="5p"; + } + else { + $arm="3p"; + } + return $arm; +} +sub biggest_bulge { + my $struct=shift; + my $bulge_size=0; + my $max_bulge=0; + while ($struct=~/(\.+)/g) { + $bulge_size=length $1; + if ($bulge_size > $max_bulge) { + $max_bulge=$bulge_size; + } + } + return $max_bulge; +} +sub get_asy { + my($table,$a1,$a2)=@_; + my ($pre_i,$pre_j); + my $asymmetry=0; + foreach my $i ($a1..$a2) { + if (defined $table->{$i}) { + my $j=$table->{$i}; + if (defined $pre_i && defined $pre_j) { + my $diff=($i-$pre_i)+($j-$pre_j); + $asymmetry += abs($diff); + } + $pre_i=$i; + $pre_j=$j; + } + } + return $asymmetry; +} + +sub peaks{ + my @cluster=@{$_[0]}; + + return if(@cluster<1); + + my $max=0; my $index=-1; + for (my $i=0;$i<@cluster;$i++) { + if($cluster[$i][2]>$max){ + $max=$cluster[$i][2]; + $index=$i; + } + } +# &excise(\@cluster,$index,$_[1],$_[2]); + return($index); +} + +sub find_cites{ + my @tmp=@{$_[0]}; + my $i=&peaks(\@tmp); + + my $start=$tmp[$i][0]; + my $total=0; my $node5=0; + for (my $j=0;$j<@tmp ;$j++) { + $total+=$tmp[$j][2]; + $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); + } + push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); +} + +sub newpos{ + my ($a,$b,$c,$d)=@_; + my $s= $a>$c ? $c : $a; + my $e=$b>$d ? $b : $d; + return($s,$e); +} + +sub rev{ + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + + return($strand); +} +sub usage{ +print <<"USAGE"; +Version $version +Usage: +$0 -map -g -d -f -o -s -e +options: + -map input file# align result # bst. format + -g input file # genome sequence fasta format + -d <int> Maximal space between miRNA and miRNA* (200) + -f <int> Flank sequence length of miRNA precursor (10) + -o output file# percursor fasta file + -s output file# precursor structure file + -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) + + -h help +USAGE +exit(1); +} +