# HG changeset patch # User big-tiandm # Date 1406280056 14400 # Node ID dc5a29826c7d9807642f245632ed1198a8bb62bd # Parent 861eb4fbfbd035a7f1aed4412023267d0bfc6089 Uploaded diff -r 861eb4fbfbd0 -r dc5a29826c7d miRDeep_plant.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/miRDeep_plant.pl Fri Jul 25 05:20:56 2014 -0400 @@ -0,0 +1,1544 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use Getopt::Std; + + + +################################# MIRDEEP ################################################# + +################################## USAGE ################################################## + + +my $usage= +"$0 file_signature file_structure temp_out_directory + +This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with +information on the positions of reads aligned to potential precursor sequences (signature). +It also takes as input an RNAfold output file, giving information on the sequence, structure +and mimimum free energy of the potential precursor sequences. + +Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA +sequences that should be considered for conservation purposes. -t prints out the potential +precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that +exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors +that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences +obtained through conventional cloning. -z consider the number of base pairings in the lower +stems (this option is not well tested). + +-h print this usage +-s fasta file with known miRNAs +#-o temp directory ,maked befor running the program. +-t print filtered +-u limited output (only ids) +-v cut-off (default 1) +-x sensitive option for Sanger sequences +-y use Randfold +-z consider Drosha processing +"; + + + + + +############################################################################################ + +################################### INPUT ################################################## + + +#signature file in blast_parsed format +my $file_blast_parsed=shift or die $usage; + +#structure file outputted from RNAfold +my $file_struct=shift or die $usage; + +my $tmpdir=shift or die $usage; +#options +my %options=(); +getopts("hs:tuv:xyz",\%options); + + + + + + +############################################################################################# + +############################# GLOBAL VARIABLES ############################################## + + +#parameters +my $nucleus_lng=11; + +my $score_star=3.9; +my $score_star_not=-1.3; +my $score_nucleus=7.63; +my $score_nucleus_not=-1.17; +my $score_randfold=1.37; +my $score_randfold_not=-3.624; +my $score_intercept=0.3; +my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); +my $score_min=1; +if($options{v}){$score_min=$options{v};} +if($options{x}){$score_min=-5;} + +my $e=2.718281828; + +#hashes +my %hash_desc; +my %hash_seq; +my %hash_struct; +my %hash_mfe; +my %hash_nuclei; +my %hash_mirs; +my %hash_query; +my %hash_comp; +my %hash_bp; + +#other variables +my $subject_old; +my $message_filter; +my $message_score; +my $lines; +my $out_of_bound; + + + +############################################################################################## + +################################ MAIN ###################################################### + + +#print help if that option is used +if($options{h}){die $usage;} +unless ($tmpdir=~/\/$/) {$tmpdir .="/";} +if(!(-s $tmpdir)){mkdir $tmpdir;} +$tmpdir .="TMP_DIR/"; +mkdir $tmpdir; + +#parse structure file outputted from RNAfold +parse_file_struct($file_struct); + +#if conservation is scored, the fasta file of known miRNA sequences is parsed +if($options{s}){create_hash_nuclei($options{s})}; + +#parse signature file in blast_parsed format and resolve each potential precursor +parse_file_blast_parsed($file_blast_parsed); +`rm -rf $tmpdir`; +exit; + + + + +############################################################################################## + +############################## SUBROUTINES ################################################### + + + +sub parse_file_blast_parsed{ + +# read through the signature blastparsed file, fills up a hash with information on queries +# (deep sequences) mapping to the current subject (potential precursor) and resolve each +# potential precursor in turn + + my $file_blast_parsed=shift; + + open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; + while (my $line=){ + if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ + my $query=$1; + my $query_lng=$2; + my $query_beg=$3; + my $query_end=$4; + my $subject=$5; + my $subject_lng=$6; + my $subject_beg=$7; + my $subject_end=$8; + my $e_value=$9; + my $pid=$10; + my $bitscore=$11; + my $other=$12; + + #if the new line concerns a new subject (potential precursor) then the old subject must be resolved + if($subject_old and $subject_old ne $subject){ + resolve_potential_precursor(); + } + + #resolve the strand + my $strand=find_strand($other); + + #resolve the number of reads that the deep sequence represents + my $freq=find_freq($query); + + #read information of the query (deep sequence) into hash + $hash_query{$query}{"subject_beg"}=$subject_beg; + $hash_query{$query}{"subject_end"}=$subject_end; + $hash_query{$query}{"strand"}=$strand; + $hash_query{$query}{"freq"}=$freq; + + #save the signature information + $lines.=$line; + + $subject_old=$subject; + } + } + resolve_potential_precursor(); +} + +sub resolve_potential_precursor{ + +# dissects the potential precursor in parts by filling hashes, and tests if it passes the +# initial filter and the scoring filter + +# binary variable whether the potential precursor is still viable + my $ret=1; +#print STDERR ">$subject_old\n"; + + fill_structure(); +#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; + fill_pri(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_mature(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_star(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_loop(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + + fill_lower_flanks(); +#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; + +# do_test_assemble(); + +# this is the actual classification + unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} + + print_results($ret); + + reset_variables(); + + return; + +} + + + +sub print_results{ + + my $ret=shift; + +# print out if the precursor is accepted and accepted precursors should be printed out +# or if the potential precursor is discarded and discarded potential precursors should +# be printed out + + if((!$options{t} and $ret) or ($options{t} and !$ret)){ + #full output + unless($options{u}){ + if($message_filter){print $message_filter;} + if($message_score){print $message_score;} + print_hash_comp(); + print $lines,"\n\n"; + return; + } + #limited output (only ids) + my $id=$hash_comp{"pri_id"}; + print "$id\n"; + } +} + + + + + + + +sub pass_threshold_score{ + +# this is the scoring + + #minimum free energy of the potential precursor +# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); + my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); + + #count of reads that map in accordance with Dicer processing + my $score_freq=score_freq($hash_comp{"freq"}); +#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; + + #basic score + my $score=$score_mfe+$score_freq; + + #scoring of conserved nucleus/seed (optional) + if($options{s}){ + + #if the nucleus is conserved + if(test_nucleus_conservation()){ + + #nucleus from position 2-8 + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + + #resolve DNA/RNA ambiguities + $nucleus=~tr/[T]/[U]/; + + #print score contribution + score_s("score_nucleus\t$score_nucleus"); + + #print the ids of known miRNAs with same nucleus + score_s("$hash_mirs{$nucleus}"); +#print STDERR "score_nucleus\t$score_nucleus\n"; + + #add to score + $score+=$score_nucleus; + + #if the nucleus is not conserved + }else{ + #print (negative) score contribution + score_s("score_nucleus\t$score_nucleus_not"); + + #add (negative) score contribution + $score+=$score_nucleus_not; + } + } + + #if the majority of potential star reads fall as expected from Dicer processing + if($hash_comp{"star_read"}){ + score_s("score_star\t$score_star"); +#print STDERR "score_star\t$score_star\n"; + $score+=$score_star; + }else{ + score_s("score_star\t$score_star_not"); +#print STDERR "score_star_not\t$score_star_not\n"; + $score+=$score_star_not; + } + + #score lower stems for potential for Drosha recognition (highly optional) + if($options{z}){ + my $stem_bp=$hash_comp{"stem_bp"}; + my $score_stem=$scores_stem[$stem_bp]; + $score+=$score_stem; + score_s("score_stem\t$score_stem"); + } + +#print STDERR "score_intercept\t$score_intercept\n"; + + $score+=$score_intercept; + + #score for randfold (optional) + if($options{y}){ + +# only calculate randfold value if it can make the difference between the potential precursor +# being accepted or discarded + if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ + + #randfold value<0.05 + if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} + + #randfold value>0.05 + else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} + } + } + + #round off values to one decimal + my $round_mfe=round($score_mfe*10)/10; + my $round_freq=round($score_freq*10)/10; + my $round=round($score*10)/10; + + #print scores + score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); + + #return 1 if the potential precursor is accepted, return 0 if discarded + unless($score>=$score_min){return 0;} + return 1; +} + +sub test_randfold{ + + #print sequence to temporary file, test randfold value, return 1 or 0 + +# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); + my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; + open(FILE, ">$tmpfile"); + print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; + close FILE; + +# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; + my $p1=`randfold -s $tmpfile 999 | cut -f 3`; + my $p2=`randfold -s $tmpfile 999 | cut -f 3`; + my $p_value=($p1+$p2)/2; + wait; +# system "rm $tmpfile"; + + if($p_value<=0.05){return 1;} + + return 0; +} + + +#sub print_file{ + + #print string to file + +# my($file,$string)=@_; + +# open(FILE, ">$file"); +# print FILE "$string"; +# close FILE; +#} + + +sub test_nucleus_conservation{ + + #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 + + my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); + $nucleus=~tr/[T]/[U]/; + if($hash_nuclei{$nucleus}){return 1;} + + return 0; +} + + + +sub pass_filtering_initial{ + + #test if the structure forms a plausible hairpin + unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} + + #test if >90% of reads map to the hairpin in consistence with Dicer processing + unless(pass_filtering_signature()){filter_p("signature problem");return 0;} + + return 1; + +} + + +sub pass_filtering_signature{ + + #number of reads that map in consistence with Dicer processing + my $consistent=0; + + #number of reads that map inconsistent with Dicer processing + my $inconsistent=0; + +# number of potential star reads map in good consistence with Drosha/Dicer processing +# (3' overhangs relative to mature product) + my $star_perfect=0; + +# number of potential star reads that do not map in good consistence with 3' overhang + my $star_fuzzy=0; + + + #sort queries (deep sequences) by their position on the hairpin + my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; + + foreach my $query(@queries){ + + #number of reads that the deep sequence represents + unless(defined($hash_query{$query}{"freq"})){next;} + my $query_freq=$hash_query{$query}{"freq"}; + + #test which Dicer product (if any) the deep sequence corresponds to + my $product=test_query($query); + + #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable + if($product){$consistent+=$query_freq;} + + #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable + else{$inconsistent+=$query_freq;} + + #test a potential star sequence has good 3' overhang + if($product eq "star"){ + if(test_star($query)){$star_perfect+=$query_freq;} + else{$star_fuzzy+=$query_freq;} + } + } + +# if the majority of potential star sequences map in good accordance with 3' overhang +# score for the presence of star evidence + if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} + + #total number of reads mapping to the hairpin + my $freq=$consistent+$inconsistent; + $hash_comp{"freq"}=$freq; + unless($freq>0){filter_s("read frequency too low"); return 0;} + + #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded + my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); + unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} + + #the hairpin is retained + return 1; +} + +sub test_star{ + + #test if a deep sequence maps in good consistence with 3' overhang + + my $query=shift; + + #5' begin and 3' end positions + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + + #the difference between observed and expected begin positions must be 0 or 1 + my $offset=$beg-$hash_comp{"star_beg"}; + if($offset==0 or $offset==1 or $offset==-1){return 1;} + + return 0; +} + + + +sub test_query{ + + #test if deep sequence maps in consistence with Dicer processing + + my $query=shift; + + #begin, end, strand and read count + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + my $strand=$hash_query{$query}{"strand"}; + my $freq=$hash_query{$query}{"freq"}; + + #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) + if($strand eq '-'){return 0;} + + #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end + my $fuzz_beg=2; + #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end + my $fuzz_end=2; + + #if in accordance with Dicer processing, return the type of Dicer product + if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} + if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} + if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} + + #if not in accordance, return 0 + return 0; +} + + +sub pass_filtering_structure{ + + #The potential precursor must form a hairpin with miRNA precursor-like characteristics + + #return value + my $ret=1; + + #potential mature, star, loop and lower flank parts must be identifiable + unless(test_components()){return 0;} + + #no bifurcations + unless(no_bifurcations_precursor()){$ret=0;} + + #minimum 14 base pairings in duplex + unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} + + #not more than 6 nt difference between mature and star length + unless(-60 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + #no overlap between the mature and the star sequence + if($hash_comp{"mature_arm"} eq "first"){ + ($hash_comp{"mature_end"}<$beg) or return 0; + }elsif($hash_comp{"mature_arm"} eq "second"){ + ($end<$hash_comp{"mature_beg"}) or return 0; + } + + #there should be no bifurcations and minimum one base pairing + my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); + if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ + return "first"; + }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ + return "second"; + } + return 0; +} + + +sub test_loop{ + + #tests the loop positions + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub test_flanks{ + + #tests the positions of the lower flanks + + my ($beg,$end)=@_; + + #unless the begin and end positions are plausible, test negative + unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} + + return 1; +} + + +sub comp{ + + #subroutine to retrive from the 'comp' hash + + my $type=shift; + my $component=$hash_comp{$type}; + return $component; +} + + +sub find_strand_query{ + + #subroutine to find the strand for a given query + + my $query=shift; + my $strand=$hash_query{$query}{"strand"}; + return $strand; +} + + +sub find_positions_query{ + + #subroutine to find the begin and end positions for a given query + + my $query=shift; + my $beg=$hash_query{$query}{"subject_beg"}; + my $end=$hash_query{$query}{"subject_end"}; + return ($beg,$end); +} + + + +sub find_mature_query{ + + #finds the query with the highest frequency of reads and returns it + #is used to determine the positions of the potential mature sequence + + my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; + my $mature_query=$queries[0]; + return $mature_query; +} + + + + +sub reset_variables{ + + #resets the hashes for the next potential precursor + +# %hash_query=(); +# %hash_comp=(); +# %hash_bp=(); + foreach my $key (keys %hash_query) {delete($hash_query{$key});} + foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} + foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} + +# $message_filter=(); +# $message_score=(); +# $lines=(); + undef($message_filter); + undef($message_score); + undef($lines); + return; +} + + + +sub excise_seq{ + + #excise sub sequence from the potential precursor + + my($seq,$beg,$end,$strand)=@_; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($seq)){$out_of_bound++;return 0;} + + #if on the minus strand, the reverse complement should be excised + if($strand eq "-"){$seq=revcom($seq);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_seq=substr($seq,$beg-1,$end-$beg+1); + + return $sub_seq; + +} + +sub excise_struct{ + + #excise sub structure + + my($struct,$beg,$end,$strand)=@_; + my $lng=length $struct; + + #begin can be equal to end if only one nucleotide is excised + unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} + + #rarely, permuted combinations of signature and structure cause out of bound excision errors. + #this happens once appr. every two thousand combinations + unless($beg<=length($struct)){return 0;} + + #if excising relative to minus strand, positions are reversed + if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} + + #the blast parsed format is 1-indexed, substr is 0-indexed + my $sub_struct=substr($struct,$beg-1,$end-$beg+1); + + return $sub_struct; +} + + +sub create_hash_nuclei{ + #parses a fasta file with sequences of known miRNAs considered for conservation purposes + #reads the nuclei into a hash + + my ($file) = @_; + my ($id, $desc, $sequence, $nucleus) = (); + + open (FASTA, "<$file") or die "can not open $file\n"; + while () + { + chomp; + if (/^>(\S+)(.*)/) + { + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + while (){ + chomp; + if (/^>(\S+)(.*)/){ + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + + $id = $1; + $desc = $2; + $sequence = ""; + $nucleus = ""; + next; + } + $sequence .= $_; + } + } + } + $nucleus = substr($sequence,1,$nucleus_lng); + $nucleus =~ tr/[T]/[U]/; + $hash_mirs{$nucleus} .="$id\t"; + $hash_nuclei{$nucleus} += 1; + close FASTA; +} + + +sub parse_file_struct{ + #parses the output from RNAfoldand reads it into hashes + my($file) = @_; + my($id,$desc,$seq,$struct,$mfe) = (); + open (FILE_STRUCT, "<$file") or die "can not open $file\n"; + while (){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $id= $1; + $desc= $2; + $seq= ""; + $struct= ""; + $mfe= ""; + while (){ + chomp; + if (/^>(\S+)\s*(.*)/){ + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + $id = $1; + $desc = $2; + $seq = ""; + $struct = ""; + $mfe = ""; + next; + } + if(/^\w/){ + tr/uU/tT/; + $seq .= $_; + next; + } + if(/((\.|\(|\))+)/){$struct .=$1;} + if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} + } + } + } + $hash_desc{$id} = $desc; + $hash_seq{$id} = $seq; + $hash_struct{$id} = $struct; + $hash_mfe{$id} = $mfe; + close FILE_STRUCT; + return; +} + + +sub score_s{ + + #this score message is appended to the end of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score.=$message."\n";; + return; +} + + + +sub score_p{ + + #this score message is appended to the beginning of the string of score messages outputted for the potential precursor + + my $message=shift; + $message_score=$message."\n".$message_score; + return; +} + + + +sub filter_s{ + + #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor + + my $message=shift; + $message_filter.=$message."\n"; + return; +} + + +sub filter_p{ + + #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor + + my $message=shift; + if(defined $message_filter){$message_filter=$message."\n".$message_filter;} + else{$message_filter=$message."\n";} + return; +} + + +sub find_freq{ + + #finds the frequency of a given read query from its id. + + my($query)=@_; + + if($query=~/x(\d+)/i){ + my $freq=$1; + return $freq; + }else{ + print STDERR "Problem with read format\n"; + return 0; + } +} + + +sub print_hash_comp{ + + #prints the 'comp' hash + + my @keys=sort keys %hash_comp; + foreach my $key(@keys){ + my $value=$hash_comp{$key}; + print "$key \t$value\n"; + } +} + + + +sub print_hash_bp{ + + #prints the 'bp' hash + + my @keys=sort {$a<=>$b} keys %hash_bp; + foreach my $key(@keys){ + my $value=$hash_bp{$key}; + print "$key\t$value\n"; + } + print "\n"; +} + + + +sub find_strand{ + + #A subroutine to find the strand, parsing different blast formats + + my($other)=@_; + + my $strand="+"; + + if($other=~/-/){ + $strand="-"; + } + + if($other=~/minus/i){ + $strand="-"; + } + return($strand); +} + + +sub contained{ + + #Is the stretch defined by the first positions contained in the stretch defined by the second? + + my($beg1,$end1,$beg2,$end2)=@_; + + testbeginend($beg1,$end1,$beg2,$end2); + + if($beg2<=$beg1 and $end1<=$end2){ + return 1; + }else{ + return 0; + } +} + + +sub testbeginend{ + + #Are the beginposition numerically smaller than the endposition for each pair? + + my($begin1,$end1,$begin2,$end2)=@_; + + unless($begin1<=$end1 and $begin2<=$end2){ + print STDERR "beg can not be larger than end for $subject_old\n"; + exit; + } +} + + +sub rev_pos{ + +# The blast_parsed format always uses positions that are relative to the 5' of the given strand +# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with +# the n't nucleotide on the plus strand + +# This subroutine reverses the begin and end positions of positions of the minus strand so that they +# are relative to the 5' end of the plus strand + + my($beg,$end,$lng)=@_; + + my $new_end=$lng-$beg+1; + my $new_beg=$lng-$end+1; + + return($new_beg,$new_end); +} + +sub round { + + #rounds to nearest integer + + my($number) = shift; + return int($number + .5); + +} + + +sub rev{ + + #reverses the order of nucleotides in a sequence + + my($sequence)=@_; + + my $rev=reverse $sequence; + + return $rev; +} + +sub com{ + + #the complementary of a sequence + + my($sequence)=@_; + + $sequence=~tr/acgtuACGTU/TGCAATGCAA/; + + return $sequence; +} + +sub revcom{ + + #reverse complement + + my($sequence)=@_; + + my $revcom=rev(com($sequence)); + + return $revcom; +} + + +sub max2 { + + #max of two numbers + + my($a, $b) = @_; + return ($a>$b ? $a : $b); +} + +sub min2 { + + #min of two numbers + + my($a, $b) = @_; + return ($a<$b ? $a : $b); +} + + + +sub score_freq{ + +# scores the count of reads that map to the potential precursor +# Assumes geometric distribution as described in methods section of manuscript + + my $freq=shift; + + #parameters of known precursors and background hairpins + my $parameter_test=0.999; + my $parameter_control=0.6; + + #log_odds calculated directly to avoid underflow + my $intercept=log((1-$parameter_test)/(1-$parameter_control)); + my $slope=log($parameter_test/$parameter_control); + my $log_odds=$slope*$freq+$intercept; + + #if no strong evidence for 3' overhangs, limit the score contribution to 0 + unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} + + return $log_odds; +} + + + +##sub score_mfe{ + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + +## my $mfe=shift; + + #numerical value, minimum 1 +## my $mfe_adj=max2(1,-$mfe); + + #parameters of known precursors and background hairpins, scale and location +## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); +## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + +## my $odds=$prob_test/$prob_background; +## my $log_odds=log($odds); + +## return $log_odds; +##} + +sub score_mfe{ +# use bignum; + +# scores the minimum free energy in kCal/mol of the potential precursor +# Assumes Gumbel distribution as described in methods section of manuscript + + my ($mfe,$mlng)=@_; + + #numerical value, minimum 1 + my $mfe_adj=max2(1,-$mfe); +my $mfe_adj1=$mfe/$mlng; + #parameters of known precursors and background hairpins, scale and location + my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; + my $ev=$e**($mfe_adj1*$c); + print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; + my $log_odds=($a/($b+$ev)); + + + my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); + my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); + + my $odds=$prob_test/$prob_background; + my $log_odds_2=log($odds); + print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; + return $log_odds; +} + + + +sub prob_gumbel_discretized{ + +# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol +# uses the subroutine that calculates the cdf to find the probabilities + + my ($var,$scale,$location)=@_; + + my $bound_lower=$var-0.5; + my $bound_upper=$var+0.5; + + my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); + my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); + + my $prob=$cdf_upper-$cdf_lower; + + return $prob; +} + + +sub cdf_gumbel{ + +# calculates the cumulative distribution function of the Gumbel distribution + + my ($var,$scale,$location)=@_; + + my $cdf=$e**(-($e**(-($var-$location)/$scale))); + + return $cdf; +} +