Mercurial > repos > big-tiandm > mirplant2
view precursors.pl @ 35:0979f7a58686 draft
Uploaded
author | big-tiandm |
---|---|
date | Thu, 31 Jul 2014 03:08:13 -0400 |
parents | 1131b4008650 |
children | 4c0b1a94b882 |
line wrap: on
line source
#!/usr/bin/perl -w #Filename: #Author: Tian Dongmei #Email: tiandm@big.ac.cn #Date: 2013/7/19 #Modified: #Description: my $version=1.00; use strict; use Getopt::Long; use RNA; my %opts; GetOptions(\%opts,"map=s","g=s","d:i","f:i","o=s","e:f","s=s","h"); if (!(defined $opts{map} and defined $opts{g} and defined $opts{o} and defined $opts{s} ) || defined $opts{h}) { #necessary arguments &usage; } my $filein=$opts{'map'}; my $faout=$opts{'o'}; my $strout=$opts{'s'}; my $genome= $opts{'g'}; my $maxd=defined $opts{'d'} ? $opts{'d'} : 200; my $flank=defined $opts{'f'}? $opts{'f'} : 10; my $MAX_ENERGY=-18; if (defined $opts{'e'}) {$MAX_ENERGY=$opts{'e'};} my $MAX_UNPAIR=5; my $MIN_PAIR=15; my $MAX_SIZEDIFF=4; my $MAX_BULGE=2; my $ASYMMETRY=5; my $MIN_UNPAIR=0; my $MIN_SPACE=5; my $MAX_SPACE=$maxd; my $FLANK=$flank; ######### load in genome sequences start ######## my %genome; my %lng; my $name; open IN,"<$genome"; while (my $aline=<IN>) { chomp $aline; next if($aline=~/^\#/); if ($aline=~/^>(\S+)/) { $name=$1; next; } $genome{$name} .=$aline; } close IN; foreach my $key (keys %genome) { $lng{$key}=length($genome{$key}); } ####### load in genome sequences end ########## my %breaks; ### reads number bigger than 3 open IN,"<$filein"; #input file while (my $aline=<IN>) { chomp $aline; my @tmp=split/\t/,$aline; $tmp[0]=~/_x(\d+)$/; my $no=$1; next if($no<3); #my $trand=&find_strand($tmp[9]); #my @pos=split/\.\./,$tmp[5]; my $end=$tmp[3]+length($tmp[4])-1; if($tmp[1] eq "-"){$tmp[4]=revcom($tmp[4]);} push @{$breaks{$tmp[2]}{$tmp[1]}},[$tmp[3],$end,$no,$tmp[4]]; ### 0 base } close IN; my %cites; ### peaks foreach my $chr (keys %breaks) { foreach my $strand (keys %{$breaks{$chr}}) { my @array=@{$breaks{$chr}{$strand}}; @array=sort{$a->[0]<=>$b->[0]} @array; for (my $i=0;$i<@array;$i++) { my $start=$array[$i][0];my $end=$array[$i][1]; my @subarray=(); push @subarray,$array[$i]; for (my $j=$i+1;$j<@array;$j++) { if ($start<$array[$j][1] && $end>$array[$j][0]) { push @subarray,$array[$j]; ($start,$end)=&newpos($start,$end,$array[$j][0],$array[$j][1]); } else{ $i=$j; &find_cites(\@subarray,$chr,$strand); last; } } } } } open FA,">$faout"; #output file open STR,">$strout"; foreach my $chr (keys %cites) { foreach my $strand (keys %{$cites{$chr}}) { my @array2=@{$cites{$chr}{$strand}}; @array2=sort{$a->[0]<=>$b->[0]} @array2; &excise(\@array2,$chr,$strand); } } close FA; close STR; sub oneCiteDn{ my ($array,$a,$chr,$strand)=@_; my $ss=$$array[$a][0]-$flank; $ss=0 if($ss<0); my $ee=$$array[$a][1]+$maxd+$flank; $ee=$lng{$chr} if($ee>$lng{$chr}); my $seq=substr($genome{$chr},$ss,$ee-$ss+1); if($strand eq "-"){$seq=revcom($seq);} my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); return $val; } sub oneCiteUp{ my ($array,$a,$chr,$strand)=@_; my $ss=$$array[$a][0]-$maxd-$flank; $ss=0 if($ss<0); my $ee=$$array[$a][1]+$flank; $ee=$lng{$chr} if($ee>$lng{$chr}); my $seq=substr($genome{$chr},$ss,$ee-$ss+1); if($strand eq "-"){$seq=revcom($seq);} my $val=&ffw1($seq,$$array[$a][3],$chr,$strand,$ss,$ee); return $val; } sub twoCites{ my ($array,$a,$b,$chr,$strand)=@_; my $ss=$$array[$a][0]-$flank; $ss=0 if($ss<0); my $ee=$$array[$b][1]+$flank; $ee=$lng{$chr} if($ee>$lng{$chr}); my $seq=substr($genome{$chr},$ss,$ee-$ss+1); if($strand eq "-"){$seq=revcom($seq);} # my( $str,$mfe)=RNA::fold($seq); # return 0 if($mfe>$MAX_ENERGY); ### minimum mfe my $val=&ffw2($seq,$$array[$a][3],$$array[$b][3],$chr,$strand,$ss,$ee); return $val; } sub excise{ my ($cluster,$chr,$strand)=@_; my $last_pos=0; for (my $i=0;$i<@{$cluster};$i++) { next if($$cluster[$i][0]<$last_pos); my $ok=0; for (my $j=$i+1;$j<@{$cluster} ;$j++) { if($$cluster[$j][0]-$$cluster[$i][1]>$maxd){ $i=$j; last; }else{ $ok=&twoCites($cluster,$i,$j,$chr,$strand); if($ok){ $last_pos=$$cluster[$j][1]+$flank; $i=$j; last;} } } next if($ok); $ok=&oneCiteDn($cluster,$i,$chr,$strand); if($ok){$last_pos=$$cluster[$i][1]+$maxd+$flank; next;} $ok=&oneCiteUp($cluster,$i,$chr,$strand); if($ok){$last_pos=$$cluster[$i][1]+$flank;next;} } } sub ffw2{ my ($seq,$tag1,$tag2,$chr,$strand,$ss,$ee)=@_; my $N_count=$seq=~tr/N//; ## precursor sequence has not more than 5 Ns if ($N_count > 5) { return 0; } my $seq_length=length $seq; # position tag1 and tag2 my $tag1_beg=index($seq,$tag1,0)+1; if ($tag1_beg < 1) { warn "[ffw2] coordinate error.\n"; # $fold->{reason}="coordinate error"; return 0; } my $tag2_beg=index($seq,$tag2,0)+1; if ($tag2_beg < 1) { warn "[ffw2] coordinate error.\n"; # $fold->{reason}="coordinate error"; return 0; } if ($tag2_beg < $tag1_beg) { # swap tag1 and tag2 ($tag1,$tag2)=($tag2,$tag1); ($tag1_beg,$tag2_beg)=($tag2_beg,$tag1_beg); } my $tag1_end=$tag1_beg+length($tag1)-1; my $tag2_end=$tag2_beg+length($tag2)-1; # re-clipping my $beg=$tag1_beg-$FLANK; $beg=1 if $beg < 1; my $end=$tag2_end+$FLANK; $end=$seq_length if $end > $seq_length; $seq=substr($seq,$beg-1,$end-$beg+1); $seq_length=length $seq; # re-reposition $tag1_beg=index($seq,$tag1,0)+1; if ($tag1_beg < 1) { warn "[ffw2] coordinate error.\n"; # $fold->{reason}="coordinate error"; return 0; } $tag2_beg=index($seq,$tag2,0)+1; if ($tag2_beg < 1) { warn "[ffw2] coordinate error.\n"; # $fold->{reason}="coordinate error"; return 0; } $tag1_end=$tag1_beg+length($tag1)-1; $tag2_end=$tag2_beg+length($tag2)-1; # fold my ($struct,$mfe)=RNA::fold($seq); $mfe=sprintf "%.2f", $mfe; if ($mfe > $MAX_ENERGY) {return 0;} # tag1 my $tag1_length=$tag1_end-$tag1_beg+1; my $tag1_struct=substr($struct,$tag1_beg-1,$tag1_length); my $tag1_arm=which_arm($tag1_struct); my $tag1_unpair=$tag1_struct=~tr/.//; my $tag1_pair=$tag1_length-$tag1_unpair; my $tag1_max_bulge=biggest_bulge($tag1_struct); if ($tag1_arm ne "5p") { return 0;} # tag not in stem # if ($tag1_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag1_unpair ($MAX_UNPAIR)"; return $pass} if ($tag1_pair < $MIN_PAIR) {return 0;} if ($tag1_max_bulge > $MAX_BULGE) {return 0;} # tag2 my $tag2_length=$tag2_end-$tag2_beg+1; my $tag2_struct=substr($struct,$tag2_beg-1,$tag2_length); my $tag2_arm=which_arm($tag2_struct); my $tag2_unpair=$tag2_struct=~tr/.//; my $tag2_pair=$tag2_length-$tag2_unpair; my $tag2_max_bulge=biggest_bulge($tag2_struct); if ($tag2_arm ne "3p") {return 0;} # star not in stem # if ($tag2_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag2_unpair ($MAX_UNPAIR)"; return $pass} if ($tag2_pair < $MIN_PAIR) {return 0;} if ($tag2_max_bulge > $MAX_BULGE) {return 0;} # space size between miR and miR* my $space=$tag2_beg-$tag1_end-1; if ($space < $MIN_SPACE) {return 0;} if ($space > $MAX_SPACE) {return 0;} # size diff of miR and miR* my $size_diff=abs($tag1_length-$tag2_length); if ($size_diff > $MAX_SIZEDIFF) {return 0;} # build base pairing table my %pairtable; &parse_struct($struct,\%pairtable); # coords count from 1 my $asy1=get_asy(\%pairtable,$tag1_beg,$tag1_end); my $asy2=get_asy(\%pairtable,$tag2_beg,$tag2_end); my $asy=($asy1 < $asy2) ? $asy1 : $asy2; if ($asy > $ASYMMETRY) {return 0} # duplex fold, determine whether two matures like a miR/miR* ike duplex my ($like_mir_duplex1,$duplex_pair,$overhang1,$overhang2)=likeMirDuplex1($tag1,$tag2); # parse hairpin, determine whether two matures form miR/miR* duplex in hairpin context my ($like_mir_duplex2,$duplex_pair2,$overhang_b,$overhang_t)=likeMirDuplex2(\%pairtable,$tag1_beg,$tag1_end,$tag2_beg,$tag2_end); if ($like_mir_duplex1==0 && $like_mir_duplex2==0) { return 0; } print FA ">$chr:$strand:$ss..$ee\n$seq\n"; print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; return 1; } sub ffw1{ my ($seq,$tag,$chr,$strand,$ss,$ee)=@_; my $pass=0; my $N_count=$seq=~tr/N//; if ($N_count > 5) { return 0; } my $seq_length=length $seq; my $tag_length=length $tag; # position my $tag_beg=index($seq,$tag,0)+1; if ($tag_beg < 1) { warn "[ffw1] coordinate error.\n"; return $pass; } my $tag_end=$tag_beg+length($tag)-1; # define candidate precursor by hybrid short arm to long arm, not solid enough my($beg,$end)=define_precursor($seq,$tag); if (not defined $beg) { return $pass; } if (not defined $end) { return $pass; } $seq=substr($seq,$beg-1,$end-$beg+1); $seq_length=length $seq; # fold my ($struct,$mfe)=RNA::fold($seq); $mfe=sprintf "%.2f",$mfe; if ($mfe > $MAX_ENERGY) { $pass=0; return $pass; } # reposition $tag_beg=index($seq,$tag,0)+1; if ($tag_beg < 1) { warn "[ffw1] coordinate error.\n"; return 0; } $tag_end=$tag_beg+length($tag)-1; my $tag_struct=substr($struct,$tag_beg-1,$tag_length); my $tag_arm=which_arm($tag_struct); my $tag_unpair=$tag_struct=~tr/.//; my $tag_pair=$tag_length-$tag_unpair; my $tag_max_bulge=biggest_bulge($tag_struct); if ($tag_arm eq "-") { return $pass;} # if ($tag_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$tag_unpair ($MAX_UNPAIR)"; return $pass} if ($tag_pair < $MIN_PAIR) { return $pass;} if ($tag_max_bulge > $MAX_BULGE) {return $pass;} # build base pairing table my %pairtable; &parse_struct($struct,\%pairtable); # coords count from 1 # get star my ($star_beg,$star_end)=get_star(\%pairtable,$tag_beg,$tag_end); my $star=substr($seq,$star_beg-1,$star_end-$star_beg+1); my $star_length=$star_end-$star_beg+1; my $star_struct=substr($struct,$star_beg-1,$star_end-$star_beg+1); my $star_arm=which_arm($star_struct); my $star_unpair=$star_struct=~tr/.//; my $star_pair=$star_length-$star_unpair; my $star_max_bulge=biggest_bulge($star_struct); if ($star_arm eq "-") { return $pass;} # if ($star_unpair > $MAX_UNPAIR) {$fold->{reason}="unpair=$star_unpair ($MAX_UNPAIR)"; return $pass} if ($star_pair < $MIN_PAIR) {return $pass;} if ($star_max_bulge > $MAX_BULGE) {return $pass;} if ($tag_arm eq $star_arm) {return $pass;} # space size between miR and miR* my $space; if ($tag_beg < $star_beg) { $space=$star_beg-$tag_end-1; } else { $space=$tag_beg-$star_end-1; } if ($space < $MIN_SPACE) { return $pass;} if ($space > $MAX_SPACE) { return $pass;} # size diff my $size_diff=abs($tag_length-$star_length); if ($size_diff > $MAX_SIZEDIFF) { return $pass;} # asymmetry my $asy=get_asy(\%pairtable,$tag_beg,$tag_end); if ($asy > $ASYMMETRY) {return $pass;} $pass=1; print FA ">$chr:$strand:$ss..$ee\n$seq\n"; print STR ">$chr:$strand:$ss..$ee\n$seq\n$struct\t($mfe)\n"; return $pass; } sub get_star { my($table,$beg,$end)=@_; my ($s1,$e1,$s2,$e2); # s1 pair to s2, e1 pair to e2 foreach my $i ($beg..$end) { if (defined $table->{$i}) { my $j=$table->{$i}; $s1=$i; $s2=$j; last; } } foreach my $i (reverse ($beg..$end)) { if (defined $table->{$i}) { my $j=$table->{$i}; $e1=$i; $e2=$j; last; } } # print "$s1,$e1 $s2,$e2\n"; # correct terminus my $off1=$s1-$beg; my $off2=$end-$e1; $s2+=$off1; $s2+=2; # 081009 $e2-=$off2; $e2=1 if $e2 < 1; $e2+=2; $e2=1 if $e2 < 1; # 081009 ($s2,$e2)=($e2,$s2) if ($s2 > $e2); return ($s2,$e2); } sub define_precursor { my $seq=shift; my $tag=shift; my $seq_length=length $seq; my $tag_length=length $tag; my $tag_beg=index($seq,$tag,0)+1; my $tag_end=$tag_beg+$tag_length-1; # split the candidate region into short arm and long arm my $tag_arm; my ($larm,$larm_beg,$larm_end); my ($sarm,$sarm_beg,$sarm_end); if ($tag_beg-1 < $seq_length-$tag_end) { # on 5' arm $sarm=substr($seq,0,$tag_end); $larm=substr($seq,$tag_end); $sarm_beg=1; $sarm_end=$tag_end; $larm_beg=$tag_end+1; $larm_end=$seq_length; $tag_arm="5p"; } else { $larm=substr($seq,0,$tag_beg-1); # on 3' arm $sarm=substr($seq,$tag_beg-1); $larm_beg=1; $larm_end=$tag_beg-1; $sarm_beg=$tag_beg; $sarm_end=$seq_length; $tag_arm="3p"; } # print "$sarm_beg,$sarm_end $sarm\n"; # print "$larm_beg,$larm_end $larm\n"; # clipping short arm if ($tag_arm eq "5p") { $sarm_beg=$tag_beg-$flank; $sarm_beg=1 if $sarm_beg < 1; $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); } else { $sarm_end=$tag_end+$flank; $sarm_end=$seq_length if $sarm_end > $seq_length; $sarm=substr($seq,$sarm_beg-1,$sarm_end-$sarm_beg+1); } # print "$sarm_beg,$sarm_end $sarm\n"; # print "$larm_beg,$larm_end $larm\n"; # define the precursor by hybriding short arm to long arm my $duplex=RNA::duplexfold($sarm,$larm); my $struct=$duplex->{structure}; my $energy=sprintf "%.2f", $duplex->{energy}; my ($str1,$str2)=split(/&/,$struct); my $pair=$str1=~tr/(//; # print "pair=$pair\n"; my $beg1=$duplex->{i}+1-length($str1); my $end1=$duplex->{i}; my $beg2=$duplex->{j}; my $end2=$duplex->{j}+length($str2)-1; # print "$beg1:$end1 $beg2:$end2\n"; # transform coordinates $beg1=$beg1+$sarm_beg-1; $end1=$end1+$sarm_beg-1; $beg2=$beg2+$larm_beg-1; $end2=$end2+$larm_beg-1; # print "$beg1:$end1 $beg2:$end2\n"; my $off5p=$beg1-$sarm_beg; my $off3p=$sarm_end-$end1; $beg2-=$off3p; $beg2=1 if $beg2 < 1; $end2+=$off5p; $end2=$seq_length if $end2 > $seq_length; # print "$beg1:$end1 $beg2:$end2\n"; my $beg=$sarm_beg < $beg2 ? $sarm_beg : $beg2; my $end=$sarm_end > $end2 ? $sarm_end : $end2; return if $pair < $MIN_PAIR; # print "$beg,$end\n"; return ($beg,$end); } # duplex fold, judge whether two short seqs like a miRNA/miRNA* duplex sub likeMirDuplex1 { my $seq1=shift; my $seq2=shift; my $like_mir_duplex=1; my $length1=length $seq1; my $length2=length $seq2; my $duplex=RNA::duplexfold($seq1, $seq2); my $duplex_struct=$duplex->{structure}; my $duplex_energy=sprintf "%.2f", $duplex->{energy}; my ($str1,$str2)=split(/&/,$duplex_struct); my $beg1=$duplex->{i}+1-length($str1); my $end1=$duplex->{i}; my $beg2=$duplex->{j}; my $end2=$duplex->{j}+length($str2)-1; # revise beg1, end1, beg2, end2 $str1=~/^(\.*)/; $beg1+=length($1); $str1=~/(\.*)$/; $end1-=length($1); $str2=~/^(\.*)/; $beg2+=length($1); $str2=~/(\.*)$/; $end2-=length($1); my $pair_num=$str1=~tr/(//; my $overhang1=($length2-$end2)-($beg1-1); # 3' overhang at hairpin bottom my $overhang2=($length1-$end1)-($beg2-1); # 3' overhang at hairpin neck # print $pair_num,"\n"; # print $overhang1,"\n"; # print $overhang2,"\n"; if ($pair_num < 13) { $like_mir_duplex=0; } if ($overhang1 < 0 || $overhang2 < 0 ) { $like_mir_duplex=0; } if ($overhang1 > 4 || $overhang2 > 4) { $like_mir_duplex=0; } return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); } # judge whether two matures form miR/miR* duplex, in hairpin context sub likeMirDuplex2 { my ($table,$beg1,$end1,$beg2,$end2)=@_; my $like_mir_duplex=1; # s1 e1 # 5 ----------------------------3 # | | |||| ||| | #3 -------------------------------5 # e2 s2 my $pair_num=0; my $overhang1=0; my $overhang2=0; my ($s1,$e1,$s2,$e2); foreach my $i ($beg1..$end1) { if (defined $table->{$i}) { my $j=$table->{$i}; if ($j <= $end2 && $j >= $beg2) { $s1=$i; $e2=$j; last; } } } foreach my $i (reverse ($beg1..$end1)) { if (defined $table->{$i}) { my $j=$table->{$i}; if ($j <= $end2 && $j >= $beg2) { $e1=$i; $s2=$j; last; } } } # print "$beg1,$end1 $s1,$e1\n"; # print "$beg2,$end2 $s2,$e2\n"; foreach my $i ($beg1..$end1) { if (defined $table->{$i}) { my $j=$table->{$i}; if ($j <= $end2 && $j >= $beg2) { ++$pair_num; } } } if (defined $s1 && defined $e2) { $overhang1=($end2-$e2)-($s1-$beg1); } if (defined $e1 && defined $s2) { $overhang2=($end1-$e1)-($s2-$beg2); } if ($pair_num < 13) { $like_mir_duplex=0; } if ($overhang1 < 0 && $overhang2 < 0) { $like_mir_duplex=0; } return ($like_mir_duplex,$pair_num,$overhang1,$overhang2); } sub parse_struct { my $struct=shift; my $table=shift; my @t=split('',$struct); my @lbs; # left brackets foreach my $k (0..$#t) { if ($t[$k] eq "(") { push @lbs, $k+1; } elsif ($t[$k] eq ")") { my $lb=pop @lbs; my $rb=$k+1; $table->{$lb}=$rb; $table->{$rb}=$lb; } } if (@lbs) { warn "unbalanced RNA struct.\n"; } } sub which_arm { my $substruct=shift; my $arm; if ($substruct=~/\(/ && $substruct=~/\)/) { $arm="-"; } elsif ($substruct=~/\(/) { $arm="5p"; } else { $arm="3p"; } return $arm; } sub biggest_bulge { my $struct=shift; my $bulge_size=0; my $max_bulge=0; while ($struct=~/(\.+)/g) { $bulge_size=length $1; if ($bulge_size > $max_bulge) { $max_bulge=$bulge_size; } } return $max_bulge; } sub get_asy { my($table,$a1,$a2)=@_; my ($pre_i,$pre_j); my $asymmetry=0; foreach my $i ($a1..$a2) { if (defined $table->{$i}) { my $j=$table->{$i}; if (defined $pre_i && defined $pre_j) { my $diff=($i-$pre_i)+($j-$pre_j); $asymmetry += abs($diff); } $pre_i=$i; $pre_j=$j; } } return $asymmetry; } sub peaks{ my @cluster=@{$_[0]}; return if(@cluster<1); my $max=0; my $index=-1; for (my $i=0;$i<@cluster;$i++) { if($cluster[$i][2]>$max){ $max=$cluster[$i][2]; $index=$i; } } # &excise(\@cluster,$index,$_[1],$_[2]); return($index); } sub find_cites{ my @tmp=@{$_[0]}; my $i=&peaks(\@tmp); my $start=$tmp[$i][0]; my $total=0; my $node5=0; for (my $j=0;$j<@tmp ;$j++) { $total+=$tmp[$j][2]; $node5 +=$tmp[$j][2] if($tmp[$j][0]-$start<=2 && $tmp[$j][0]-$start>=-2); } push @{$cites{$_[1]}{$_[2]}},$tmp[$i] if($node5/$total>0.80 && $tmp[$i][2]/$node5>0.5); } sub newpos{ my ($a,$b,$c,$d)=@_; my $s= $a>$c ? $c : $a; my $e=$b>$d ? $b : $d; return($s,$e); } sub rev{ my($sequence)=@_; my $rev=reverse $sequence; return $rev; } sub com{ my($sequence)=@_; $sequence=~tr/acgtuACGTU/TGCAATGCAA/; return $sequence; } sub revcom{ my($sequence)=@_; my $revcom=rev(com($sequence)); return $revcom; } sub find_strand{ #A subroutine to find the strand, parsing different blast formats my($other)=@_; my $strand="+"; if($other=~/-/){ $strand="-"; } if($other=~/minus/i){ $strand="-"; } return($strand); } sub usage{ print <<"USAGE"; Version $version Usage: $0 -map -g -d -f -o -s -e options: -map input file# align result # bst. format -g input file # genome sequence fasta format -d <int> Maximal space between miRNA and miRNA* (200) -f <int> Flank sequence length of miRNA precursor (10) -o output file# percursor fasta file -s output file# precursor structure file -e <folat> Maximal free energy allowed for a miRNA precursor (-18 kcal/mol) -h help USAGE exit(1); }