# HG changeset patch # User big-tiandm # Date 1406282247 14400 # Node ID 45de5e1ff4872a17739ea3d5ad19a81e96567149 # Parent c0aecae5eb03359b9178c6dbd556f2bed09c24ff Deleted selected files diff -r c0aecae5eb03 -r 45de5e1ff487 miRDeep_plant.pl --- a/miRDeep_plant.pl Fri Jul 25 05:57:20 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1544 +0,0 @@ -#!/usr/bin/perl - -use warnings; -use strict; -use Getopt::Std; - - - -################################# MIRDEEP ################################################# - -################################## USAGE ################################################## - - -my $usage= -"$0 file_signature file_structure temp_out_directory - -This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with -information on the positions of reads aligned to potential precursor sequences (signature). -It also takes as input an RNAfold output file, giving information on the sequence, structure -and mimimum free energy of the potential precursor sequences. - -Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA -sequences that should be considered for conservation purposes. -t prints out the potential -precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that -exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors -that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences -obtained through conventional cloning. -z consider the number of base pairings in the lower -stems (this option is not well tested). - --h print this usage --s fasta file with known miRNAs -#-o temp directory ,maked befor running the program. --t print filtered --u limited output (only ids) --v cut-off (default 1) --x sensitive option for Sanger sequences --y use Randfold --z consider Drosha processing -"; - - - - - -############################################################################################ - -################################### INPUT ################################################## - - -#signature file in blast_parsed format -my $file_blast_parsed=shift or die $usage; - -#structure file outputted from RNAfold -my $file_struct=shift or die $usage; - -my $tmpdir=shift or die $usage; -#options -my %options=(); -getopts("hs:tuv:xyz",\%options); - - - - - - -############################################################################################# - -############################# GLOBAL VARIABLES ############################################## - - -#parameters -my $nucleus_lng=11; - -my $score_star=3.9; -my $score_star_not=-1.3; -my $score_nucleus=7.63; -my $score_nucleus_not=-1.17; -my $score_randfold=1.37; -my $score_randfold_not=-3.624; -my $score_intercept=0.3; -my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); -my $score_min=1; -if($options{v}){$score_min=$options{v};} -if($options{x}){$score_min=-5;} - -my $e=2.718281828; - -#hashes -my %hash_desc; -my %hash_seq; -my %hash_struct; -my %hash_mfe; -my %hash_nuclei; -my %hash_mirs; -my %hash_query; -my %hash_comp; -my %hash_bp; - -#other variables -my $subject_old; -my $message_filter; -my $message_score; -my $lines; -my $out_of_bound; - - - -############################################################################################## - -################################ MAIN ###################################################### - - -#print help if that option is used -if($options{h}){die $usage;} -unless ($tmpdir=~/\/$/) {$tmpdir .="/";} -if(!(-s $tmpdir)){mkdir $tmpdir;} -$tmpdir .="TMP_DIR/"; -mkdir $tmpdir; - -#parse structure file outputted from RNAfold -parse_file_struct($file_struct); - -#if conservation is scored, the fasta file of known miRNA sequences is parsed -if($options{s}){create_hash_nuclei($options{s})}; - -#parse signature file in blast_parsed format and resolve each potential precursor -parse_file_blast_parsed($file_blast_parsed); -`rm -rf $tmpdir`; -exit; - - - - -############################################################################################## - -############################## SUBROUTINES ################################################### - - - -sub parse_file_blast_parsed{ - -# read through the signature blastparsed file, fills up a hash with information on queries -# (deep sequences) mapping to the current subject (potential precursor) and resolve each -# potential precursor in turn - - my $file_blast_parsed=shift; - - open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; - while (my $line=){ - if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ - my $query=$1; - my $query_lng=$2; - my $query_beg=$3; - my $query_end=$4; - my $subject=$5; - my $subject_lng=$6; - my $subject_beg=$7; - my $subject_end=$8; - my $e_value=$9; - my $pid=$10; - my $bitscore=$11; - my $other=$12; - - #if the new line concerns a new subject (potential precursor) then the old subject must be resolved - if($subject_old and $subject_old ne $subject){ - resolve_potential_precursor(); - } - - #resolve the strand - my $strand=find_strand($other); - - #resolve the number of reads that the deep sequence represents - my $freq=find_freq($query); - - #read information of the query (deep sequence) into hash - $hash_query{$query}{"subject_beg"}=$subject_beg; - $hash_query{$query}{"subject_end"}=$subject_end; - $hash_query{$query}{"strand"}=$strand; - $hash_query{$query}{"freq"}=$freq; - - #save the signature information - $lines.=$line; - - $subject_old=$subject; - } - } - resolve_potential_precursor(); -} - -sub resolve_potential_precursor{ - -# dissects the potential precursor in parts by filling hashes, and tests if it passes the -# initial filter and the scoring filter - -# binary variable whether the potential precursor is still viable - my $ret=1; -#print STDERR ">$subject_old\n"; - - fill_structure(); -#print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; - fill_pri(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_mature(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_star(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_loop(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - - fill_lower_flanks(); -#print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; - -# do_test_assemble(); - -# this is the actual classification - unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} - - print_results($ret); - - reset_variables(); - - return; - -} - - - -sub print_results{ - - my $ret=shift; - -# print out if the precursor is accepted and accepted precursors should be printed out -# or if the potential precursor is discarded and discarded potential precursors should -# be printed out - - if((!$options{t} and $ret) or ($options{t} and !$ret)){ - #full output - unless($options{u}){ - if($message_filter){print $message_filter;} - if($message_score){print $message_score;} - print_hash_comp(); - print $lines,"\n\n"; - return; - } - #limited output (only ids) - my $id=$hash_comp{"pri_id"}; - print "$id\n"; - } -} - - - - - - - -sub pass_threshold_score{ - -# this is the scoring - - #minimum free energy of the potential precursor -# my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); - my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); - - #count of reads that map in accordance with Dicer processing - my $score_freq=score_freq($hash_comp{"freq"}); -#print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; - - #basic score - my $score=$score_mfe+$score_freq; - - #scoring of conserved nucleus/seed (optional) - if($options{s}){ - - #if the nucleus is conserved - if(test_nucleus_conservation()){ - - #nucleus from position 2-8 - my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); - - #resolve DNA/RNA ambiguities - $nucleus=~tr/[T]/[U]/; - - #print score contribution - score_s("score_nucleus\t$score_nucleus"); - - #print the ids of known miRNAs with same nucleus - score_s("$hash_mirs{$nucleus}"); -#print STDERR "score_nucleus\t$score_nucleus\n"; - - #add to score - $score+=$score_nucleus; - - #if the nucleus is not conserved - }else{ - #print (negative) score contribution - score_s("score_nucleus\t$score_nucleus_not"); - - #add (negative) score contribution - $score+=$score_nucleus_not; - } - } - - #if the majority of potential star reads fall as expected from Dicer processing - if($hash_comp{"star_read"}){ - score_s("score_star\t$score_star"); -#print STDERR "score_star\t$score_star\n"; - $score+=$score_star; - }else{ - score_s("score_star\t$score_star_not"); -#print STDERR "score_star_not\t$score_star_not\n"; - $score+=$score_star_not; - } - - #score lower stems for potential for Drosha recognition (highly optional) - if($options{z}){ - my $stem_bp=$hash_comp{"stem_bp"}; - my $score_stem=$scores_stem[$stem_bp]; - $score+=$score_stem; - score_s("score_stem\t$score_stem"); - } - -#print STDERR "score_intercept\t$score_intercept\n"; - - $score+=$score_intercept; - - #score for randfold (optional) - if($options{y}){ - -# only calculate randfold value if it can make the difference between the potential precursor -# being accepted or discarded - if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ - - #randfold value<0.05 - if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} - - #randfold value>0.05 - else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} - } - } - - #round off values to one decimal - my $round_mfe=round($score_mfe*10)/10; - my $round_freq=round($score_freq*10)/10; - my $round=round($score*10)/10; - - #print scores - score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); - - #return 1 if the potential precursor is accepted, return 0 if discarded - unless($score>=$score_min){return 0;} - return 1; -} - -sub test_randfold{ - - #print sequence to temporary file, test randfold value, return 1 or 0 - -# print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); - my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; - open(FILE, ">$tmpfile"); - print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; - close FILE; - -# my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; - my $p1=`randfold -s $tmpfile 999 | cut -f 3`; - my $p2=`randfold -s $tmpfile 999 | cut -f 3`; - my $p_value=($p1+$p2)/2; - wait; -# system "rm $tmpfile"; - - if($p_value<=0.05){return 1;} - - return 0; -} - - -#sub print_file{ - - #print string to file - -# my($file,$string)=@_; - -# open(FILE, ">$file"); -# print FILE "$string"; -# close FILE; -#} - - -sub test_nucleus_conservation{ - - #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 - - my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); - $nucleus=~tr/[T]/[U]/; - if($hash_nuclei{$nucleus}){return 1;} - - return 0; -} - - - -sub pass_filtering_initial{ - - #test if the structure forms a plausible hairpin - unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} - - #test if >90% of reads map to the hairpin in consistence with Dicer processing - unless(pass_filtering_signature()){filter_p("signature problem");return 0;} - - return 1; - -} - - -sub pass_filtering_signature{ - - #number of reads that map in consistence with Dicer processing - my $consistent=0; - - #number of reads that map inconsistent with Dicer processing - my $inconsistent=0; - -# number of potential star reads map in good consistence with Drosha/Dicer processing -# (3' overhangs relative to mature product) - my $star_perfect=0; - -# number of potential star reads that do not map in good consistence with 3' overhang - my $star_fuzzy=0; - - - #sort queries (deep sequences) by their position on the hairpin - my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; - - foreach my $query(@queries){ - - #number of reads that the deep sequence represents - unless(defined($hash_query{$query}{"freq"})){next;} - my $query_freq=$hash_query{$query}{"freq"}; - - #test which Dicer product (if any) the deep sequence corresponds to - my $product=test_query($query); - - #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable - if($product){$consistent+=$query_freq;} - - #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable - else{$inconsistent+=$query_freq;} - - #test a potential star sequence has good 3' overhang - if($product eq "star"){ - if(test_star($query)){$star_perfect+=$query_freq;} - else{$star_fuzzy+=$query_freq;} - } - } - -# if the majority of potential star sequences map in good accordance with 3' overhang -# score for the presence of star evidence - if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} - - #total number of reads mapping to the hairpin - my $freq=$consistent+$inconsistent; - $hash_comp{"freq"}=$freq; - unless($freq>0){filter_s("read frequency too low"); return 0;} - - #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded - my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); - unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} - - #the hairpin is retained - return 1; -} - -sub test_star{ - - #test if a deep sequence maps in good consistence with 3' overhang - - my $query=shift; - - #5' begin and 3' end positions - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - - #the difference between observed and expected begin positions must be 0 or 1 - my $offset=$beg-$hash_comp{"star_beg"}; - if($offset==0 or $offset==1 or $offset==-1){return 1;} - - return 0; -} - - - -sub test_query{ - - #test if deep sequence maps in consistence with Dicer processing - - my $query=shift; - - #begin, end, strand and read count - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - my $strand=$hash_query{$query}{"strand"}; - my $freq=$hash_query{$query}{"freq"}; - - #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) - if($strand eq '-'){return 0;} - - #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end - my $fuzz_beg=2; - #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end - my $fuzz_end=2; - - #if in accordance with Dicer processing, return the type of Dicer product - if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} - if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} - if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} - - #if not in accordance, return 0 - return 0; -} - - -sub pass_filtering_structure{ - - #The potential precursor must form a hairpin with miRNA precursor-like characteristics - - #return value - my $ret=1; - - #potential mature, star, loop and lower flank parts must be identifiable - unless(test_components()){return 0;} - - #no bifurcations - unless(no_bifurcations_precursor()){$ret=0;} - - #minimum 14 base pairings in duplex - unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} - - #not more than 6 nt difference between mature and star length - unless(-60 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - #no overlap between the mature and the star sequence - if($hash_comp{"mature_arm"} eq "first"){ - ($hash_comp{"mature_end"}<$beg) or return 0; - }elsif($hash_comp{"mature_arm"} eq "second"){ - ($end<$hash_comp{"mature_beg"}) or return 0; - } - - #there should be no bifurcations and minimum one base pairing - my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); - if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ - return "first"; - }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ - return "second"; - } - return 0; -} - - -sub test_loop{ - - #tests the loop positions - - my ($beg,$end)=@_; - - #unless the begin and end positions are plausible, test negative - unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - return 1; -} - - -sub test_flanks{ - - #tests the positions of the lower flanks - - my ($beg,$end)=@_; - - #unless the begin and end positions are plausible, test negative - unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} - - return 1; -} - - -sub comp{ - - #subroutine to retrive from the 'comp' hash - - my $type=shift; - my $component=$hash_comp{$type}; - return $component; -} - - -sub find_strand_query{ - - #subroutine to find the strand for a given query - - my $query=shift; - my $strand=$hash_query{$query}{"strand"}; - return $strand; -} - - -sub find_positions_query{ - - #subroutine to find the begin and end positions for a given query - - my $query=shift; - my $beg=$hash_query{$query}{"subject_beg"}; - my $end=$hash_query{$query}{"subject_end"}; - return ($beg,$end); -} - - - -sub find_mature_query{ - - #finds the query with the highest frequency of reads and returns it - #is used to determine the positions of the potential mature sequence - - my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; - my $mature_query=$queries[0]; - return $mature_query; -} - - - - -sub reset_variables{ - - #resets the hashes for the next potential precursor - -# %hash_query=(); -# %hash_comp=(); -# %hash_bp=(); - foreach my $key (keys %hash_query) {delete($hash_query{$key});} - foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} - foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} - -# $message_filter=(); -# $message_score=(); -# $lines=(); - undef($message_filter); - undef($message_score); - undef($lines); - return; -} - - - -sub excise_seq{ - - #excise sub sequence from the potential precursor - - my($seq,$beg,$end,$strand)=@_; - - #begin can be equal to end if only one nucleotide is excised - unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} - - #rarely, permuted combinations of signature and structure cause out of bound excision errors. - #this happens once appr. every two thousand combinations - unless($beg<=length($seq)){$out_of_bound++;return 0;} - - #if on the minus strand, the reverse complement should be excised - if($strand eq "-"){$seq=revcom($seq);} - - #the blast parsed format is 1-indexed, substr is 0-indexed - my $sub_seq=substr($seq,$beg-1,$end-$beg+1); - - return $sub_seq; - -} - -sub excise_struct{ - - #excise sub structure - - my($struct,$beg,$end,$strand)=@_; - my $lng=length $struct; - - #begin can be equal to end if only one nucleotide is excised - unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} - - #rarely, permuted combinations of signature and structure cause out of bound excision errors. - #this happens once appr. every two thousand combinations - unless($beg<=length($struct)){return 0;} - - #if excising relative to minus strand, positions are reversed - if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} - - #the blast parsed format is 1-indexed, substr is 0-indexed - my $sub_struct=substr($struct,$beg-1,$end-$beg+1); - - return $sub_struct; -} - - -sub create_hash_nuclei{ - #parses a fasta file with sequences of known miRNAs considered for conservation purposes - #reads the nuclei into a hash - - my ($file) = @_; - my ($id, $desc, $sequence, $nucleus) = (); - - open (FASTA, "<$file") or die "can not open $file\n"; - while () - { - chomp; - if (/^>(\S+)(.*)/) - { - $id = $1; - $desc = $2; - $sequence = ""; - $nucleus = ""; - while (){ - chomp; - if (/^>(\S+)(.*)/){ - $nucleus = substr($sequence,1,$nucleus_lng); - $nucleus =~ tr/[T]/[U]/; - $hash_mirs{$nucleus} .="$id\t"; - $hash_nuclei{$nucleus} += 1; - - $id = $1; - $desc = $2; - $sequence = ""; - $nucleus = ""; - next; - } - $sequence .= $_; - } - } - } - $nucleus = substr($sequence,1,$nucleus_lng); - $nucleus =~ tr/[T]/[U]/; - $hash_mirs{$nucleus} .="$id\t"; - $hash_nuclei{$nucleus} += 1; - close FASTA; -} - - -sub parse_file_struct{ - #parses the output from RNAfoldand reads it into hashes - my($file) = @_; - my($id,$desc,$seq,$struct,$mfe) = (); - open (FILE_STRUCT, "<$file") or die "can not open $file\n"; - while (){ - chomp; - if (/^>(\S+)\s*(.*)/){ - $id= $1; - $desc= $2; - $seq= ""; - $struct= ""; - $mfe= ""; - while (){ - chomp; - if (/^>(\S+)\s*(.*)/){ - $hash_desc{$id} = $desc; - $hash_seq{$id} = $seq; - $hash_struct{$id} = $struct; - $hash_mfe{$id} = $mfe; - $id = $1; - $desc = $2; - $seq = ""; - $struct = ""; - $mfe = ""; - next; - } - if(/^\w/){ - tr/uU/tT/; - $seq .= $_; - next; - } - if(/((\.|\(|\))+)/){$struct .=$1;} - if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} - } - } - } - $hash_desc{$id} = $desc; - $hash_seq{$id} = $seq; - $hash_struct{$id} = $struct; - $hash_mfe{$id} = $mfe; - close FILE_STRUCT; - return; -} - - -sub score_s{ - - #this score message is appended to the end of the string of score messages outputted for the potential precursor - - my $message=shift; - $message_score.=$message."\n";; - return; -} - - - -sub score_p{ - - #this score message is appended to the beginning of the string of score messages outputted for the potential precursor - - my $message=shift; - $message_score=$message."\n".$message_score; - return; -} - - - -sub filter_s{ - - #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor - - my $message=shift; - $message_filter.=$message."\n"; - return; -} - - -sub filter_p{ - - #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor - - my $message=shift; - if(defined $message_filter){$message_filter=$message."\n".$message_filter;} - else{$message_filter=$message."\n";} - return; -} - - -sub find_freq{ - - #finds the frequency of a given read query from its id. - - my($query)=@_; - - if($query=~/x(\d+)/i){ - my $freq=$1; - return $freq; - }else{ - print STDERR "Problem with read format\n"; - return 0; - } -} - - -sub print_hash_comp{ - - #prints the 'comp' hash - - my @keys=sort keys %hash_comp; - foreach my $key(@keys){ - my $value=$hash_comp{$key}; - print "$key \t$value\n"; - } -} - - - -sub print_hash_bp{ - - #prints the 'bp' hash - - my @keys=sort {$a<=>$b} keys %hash_bp; - foreach my $key(@keys){ - my $value=$hash_bp{$key}; - print "$key\t$value\n"; - } - print "\n"; -} - - - -sub find_strand{ - - #A subroutine to find the strand, parsing different blast formats - - my($other)=@_; - - my $strand="+"; - - if($other=~/-/){ - $strand="-"; - } - - if($other=~/minus/i){ - $strand="-"; - } - return($strand); -} - - -sub contained{ - - #Is the stretch defined by the first positions contained in the stretch defined by the second? - - my($beg1,$end1,$beg2,$end2)=@_; - - testbeginend($beg1,$end1,$beg2,$end2); - - if($beg2<=$beg1 and $end1<=$end2){ - return 1; - }else{ - return 0; - } -} - - -sub testbeginend{ - - #Are the beginposition numerically smaller than the endposition for each pair? - - my($begin1,$end1,$begin2,$end2)=@_; - - unless($begin1<=$end1 and $begin2<=$end2){ - print STDERR "beg can not be larger than end for $subject_old\n"; - exit; - } -} - - -sub rev_pos{ - -# The blast_parsed format always uses positions that are relative to the 5' of the given strand -# This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with -# the n't nucleotide on the plus strand - -# This subroutine reverses the begin and end positions of positions of the minus strand so that they -# are relative to the 5' end of the plus strand - - my($beg,$end,$lng)=@_; - - my $new_end=$lng-$beg+1; - my $new_beg=$lng-$end+1; - - return($new_beg,$new_end); -} - -sub round { - - #rounds to nearest integer - - my($number) = shift; - return int($number + .5); - -} - - -sub rev{ - - #reverses the order of nucleotides in a sequence - - my($sequence)=@_; - - my $rev=reverse $sequence; - - return $rev; -} - -sub com{ - - #the complementary of a sequence - - my($sequence)=@_; - - $sequence=~tr/acgtuACGTU/TGCAATGCAA/; - - return $sequence; -} - -sub revcom{ - - #reverse complement - - my($sequence)=@_; - - my $revcom=rev(com($sequence)); - - return $revcom; -} - - -sub max2 { - - #max of two numbers - - my($a, $b) = @_; - return ($a>$b ? $a : $b); -} - -sub min2 { - - #min of two numbers - - my($a, $b) = @_; - return ($a<$b ? $a : $b); -} - - - -sub score_freq{ - -# scores the count of reads that map to the potential precursor -# Assumes geometric distribution as described in methods section of manuscript - - my $freq=shift; - - #parameters of known precursors and background hairpins - my $parameter_test=0.999; - my $parameter_control=0.6; - - #log_odds calculated directly to avoid underflow - my $intercept=log((1-$parameter_test)/(1-$parameter_control)); - my $slope=log($parameter_test/$parameter_control); - my $log_odds=$slope*$freq+$intercept; - - #if no strong evidence for 3' overhangs, limit the score contribution to 0 - unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} - - return $log_odds; -} - - - -##sub score_mfe{ - -# scores the minimum free energy in kCal/mol of the potential precursor -# Assumes Gumbel distribution as described in methods section of manuscript - -## my $mfe=shift; - - #numerical value, minimum 1 -## my $mfe_adj=max2(1,-$mfe); - - #parameters of known precursors and background hairpins, scale and location -## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); -## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); - -## my $odds=$prob_test/$prob_background; -## my $log_odds=log($odds); - -## return $log_odds; -##} - -sub score_mfe{ -# use bignum; - -# scores the minimum free energy in kCal/mol of the potential precursor -# Assumes Gumbel distribution as described in methods section of manuscript - - my ($mfe,$mlng)=@_; - - #numerical value, minimum 1 - my $mfe_adj=max2(1,-$mfe); -my $mfe_adj1=$mfe/$mlng; - #parameters of known precursors and background hairpins, scale and location - my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; - my $ev=$e**($mfe_adj1*$c); - print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; - my $log_odds=($a/($b+$ev)); - - - my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); - my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); - - my $odds=$prob_test/$prob_background; - my $log_odds_2=log($odds); - print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; - return $log_odds; -} - - - -sub prob_gumbel_discretized{ - -# discretized Gumbel distribution, probabilities within windows of 1 kCal/mol -# uses the subroutine that calculates the cdf to find the probabilities - - my ($var,$scale,$location)=@_; - - my $bound_lower=$var-0.5; - my $bound_upper=$var+0.5; - - my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); - my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); - - my $prob=$cdf_upper-$cdf_lower; - - return $prob; -} - - -sub cdf_gumbel{ - -# calculates the cumulative distribution function of the Gumbel distribution - - my ($var,$scale,$location)=@_; - - my $cdf=$e**(-($e**(-($var-$location)/$scale))); - - return $cdf; -} -