comparison miRDeep_plant.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
comparison
equal deleted inserted replaced
49:f008ab2cadc6 50:7b5a48b972e9
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5 use Getopt::Std;
6 #use RNA;
7
8
9 ################################# MIRDEEP #################################################
10
11 ################################## USAGE ##################################################
12
13
14 my $usage=
15 "$0 file_signature file_structure temp_out_directory
16
17 This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with
18 information on the positions of reads aligned to potential precursor sequences (signature).
19 It also takes as input an RNAfold output file, giving information on the sequence, structure
20 and mimimum free energy of the potential precursor sequences.
21
22 Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA
23 sequences that should be considered for conservation purposes. -t prints out the potential
24 precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that
25 exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors
26 that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences
27 obtained through conventional cloning. -z consider the number of base pairings in the lower
28 stems (this option is not well tested).
29
30 -h print this usage
31 -s fasta file with known miRNAs
32 #-o temp directory ,maked befor running the program.
33 -t print filtered
34 -u limited output (only ids)
35 -v cut-off (default 1)
36 -x sensitive option for Sanger sequences
37 -y use Randfold
38 -z consider Drosha processing
39 ";
40
41
42
43
44
45 ############################################################################################
46
47 ################################### INPUT ##################################################
48
49
50 #signature file in blast_parsed format
51 my $file_blast_parsed=shift or die $usage;
52
53 #structure file outputted from RNAfold
54 my $file_struct=shift or die $usage;
55
56 my $tmpdir=shift or die $usage;
57 #options
58 my %options=();
59 getopts("hs:tuv:xyz",\%options);
60
61
62
63
64
65
66 #############################################################################################
67
68 ############################# GLOBAL VARIABLES ##############################################
69
70
71 #parameters
72 my $nucleus_lng=11;
73
74 my $score_star=3.9;
75 my $score_star_not=-1.3;
76 my $score_nucleus=7.63;
77 my $score_nucleus_not=-1.17;
78 my $score_randfold=1.37;
79 my $score_randfold_not=-3.624;
80 my $score_intercept=0.3;
81 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
82 my $score_min=1;
83 if($options{v}){$score_min=$options{v};}
84 if($options{x}){$score_min=-5;}
85
86 my $e=2.718281828;
87
88 #hashes
89 my %hash_desc;
90 my %hash_seq;
91 my %hash_struct;
92 my %hash_mfe;
93 my %hash_nuclei;
94 my %hash_mirs;
95 my %hash_query;
96 my %hash_comp;
97 my %hash_bp;
98
99 #other variables
100 my $subject_old;
101 my $message_filter;
102 my $message_score;
103 my $lines;
104 my $out_of_bound;
105
106
107
108 ##############################################################################################
109
110 ################################ MAIN ######################################################
111
112
113 #print help if that option is used
114 if($options{h}){die $usage;}
115 unless ($tmpdir=~/\/$/) {$tmpdir .="/";}
116 if(!(-s $tmpdir)){mkdir $tmpdir;}
117 $tmpdir .="TMP_DIR/";
118 mkdir $tmpdir;
119
120 #parse structure file outputted from RNAfold
121 parse_file_struct($file_struct);
122
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed
124 if($options{s}){create_hash_nuclei($options{s})};
125
126 #parse signature file in blast_parsed format and resolve each potential precursor
127 parse_file_blast_parsed($file_blast_parsed);
128 #`rm -rf $tmpdir`;
129 exit;
130
131
132
133
134 ##############################################################################################
135
136 ############################## SUBROUTINES ###################################################
137
138
139
140 sub parse_file_blast_parsed{
141
142 # read through the signature blastparsed file, fills up a hash with information on queries
143 # (deep sequences) mapping to the current subject (potential precursor) and resolve each
144 # potential precursor in turn
145
146 my $file_blast_parsed=shift;
147
148 open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n";
149 while (my $line=<FILE_BLAST_PARSED>){
150 if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){
151 my $query=$1;
152 my $query_lng=$2;
153 my $query_beg=$3;
154 my $query_end=$4;
155 my $subject=$5;
156 my $subject_lng=$6;
157 my $subject_beg=$7;
158 my $subject_end=$8;
159 my $e_value=$9;
160 my $pid=$10;
161 my $bitscore=$11;
162 my $other=$12;
163
164 #if the new line concerns a new subject (potential precursor) then the old subject must be resolved
165 if($subject_old and $subject_old ne $subject){
166 resolve_potential_precursor();
167 }
168
169 #resolve the strand
170 my $strand=find_strand($other);
171
172 #resolve the number of reads that the deep sequence represents
173 my $freq=find_freq($query);
174
175 #read information of the query (deep sequence) into hash
176 $hash_query{$query}{"subject_beg"}=$subject_beg;
177 $hash_query{$query}{"subject_end"}=$subject_end;
178 $hash_query{$query}{"strand"}=$strand;
179 $hash_query{$query}{"freq"}=$freq;
180
181 #save the signature information
182 $lines.=$line;
183
184 $subject_old=$subject;
185 }
186 }
187 resolve_potential_precursor();
188 }
189
190 sub resolve_potential_precursor{
191
192 # dissects the potential precursor in parts by filling hashes, and tests if it passes the
193 # initial filter and the scoring filter
194
195 # binary variable whether the potential precursor is still viable
196 my $ret=1;
197 #print STDERR ">$subject_old\n";
198
199 fill_structure();
200 #print STDERR "\%hash_bp",scalar keys %hash_bp,"\n";
201 fill_pri();
202 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
203
204 fill_mature();
205 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
206
207 fill_star();
208 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
209
210 fill_loop();
211 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
212
213 fill_lower_flanks();
214 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
215
216 # do_test_assemble();
217
218 # this is the actual classification
219 unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;}
220
221 print_results($ret);
222
223 reset_variables();
224
225 return;
226
227 }
228
229
230
231 sub print_results{
232
233 my $ret=shift;
234
235 # print out if the precursor is accepted and accepted precursors should be printed out
236 # or if the potential precursor is discarded and discarded potential precursors should
237 # be printed out
238
239 if((!$options{t} and $ret) or ($options{t} and !$ret)){
240 #full output
241 unless($options{u}){
242 if($message_filter){print $message_filter;}
243 if($message_score){print $message_score;}
244 print_hash_comp();
245 print $lines,"\n\n";
246 return;
247 }
248 #limited output (only ids)
249 my $id=$hash_comp{"pri_id"};
250 print "$id\n";
251 }
252 }
253
254
255
256
257
258
259
260 sub pass_threshold_score{
261
262 # this is the scoring
263
264 #minimum free energy of the potential precursor
265 # my $score_mfe=score_mfe($hash_comp{"pri_mfe"});
266 my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"});
267
268 #count of reads that map in accordance with Dicer processing
269 my $score_freq=score_freq($hash_comp{"freq"});
270 #print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n";
271
272 #basic score
273 my $score=$score_mfe+$score_freq;
274
275 #scoring of conserved nucleus/seed (optional)
276 if($options{s}){
277
278 #if the nucleus is conserved
279 if(test_nucleus_conservation()){
280
281 #nucleus from position 2-8
282 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
283
284 #resolve DNA/RNA ambiguities
285 $nucleus=~tr/[T]/[U]/;
286
287 #print score contribution
288 score_s("score_nucleus\t$score_nucleus");
289
290 #print the ids of known miRNAs with same nucleus
291 score_s("$hash_mirs{$nucleus}");
292 #print STDERR "score_nucleus\t$score_nucleus\n";
293
294 #add to score
295 $score+=$score_nucleus;
296
297 #if the nucleus is not conserved
298 }else{
299 #print (negative) score contribution
300 score_s("score_nucleus\t$score_nucleus_not");
301
302 #add (negative) score contribution
303 $score+=$score_nucleus_not;
304 }
305 }
306
307 #if the majority of potential star reads fall as expected from Dicer processing
308 if($hash_comp{"star_read"}){
309 score_s("score_star\t$score_star");
310 #print STDERR "score_star\t$score_star\n";
311 $score+=$score_star;
312 }else{
313 score_s("score_star\t$score_star_not");
314 #print STDERR "score_star_not\t$score_star_not\n";
315 $score+=$score_star_not;
316 }
317
318 #score lower stems for potential for Drosha recognition (highly optional)
319 if($options{z}){
320 my $stem_bp=$hash_comp{"stem_bp"};
321 my $score_stem=$scores_stem[$stem_bp];
322 $score+=$score_stem;
323 score_s("score_stem\t$score_stem");
324 }
325
326 #print STDERR "score_intercept\t$score_intercept\n";
327
328 $score+=$score_intercept;
329
330 #score for randfold (optional)
331 if($options{y}){
332
333 # only calculate randfold value if it can make the difference between the potential precursor
334 # being accepted or discarded
335 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){
336
337 #randfold value<0.05
338 if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");}
339
340 #randfold value>0.05
341 else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");}
342 }
343 }
344
345 #round off values to one decimal
346 my $round_mfe=round($score_mfe*10)/10;
347 my $round_freq=round($score_freq*10)/10;
348 my $round=round($score*10)/10;
349
350 #print scores
351 score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round");
352
353 #return 1 if the potential precursor is accepted, return 0 if discarded
354 unless($score>=$score_min){return 0;}
355 return 1;
356 }
357
358 sub test_randfold{
359
360 #print sequence to temporary file, test randfold value, return 1 or 0
361
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
364 #open(FILE, ">$tmpfile");
365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
366 #close FILE;
367
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999);
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999);
373 my $p_value=($p1+$p2)/2;
374 wait;
375 # system "rm $tmpfile";
376
377 if($p_value<=0.05){return 1;}
378
379 return 0;
380 }
381
382 sub randfold_pvalue{
383 my $cpt_sup = 0;
384 my $cpt_inf = 0;
385 my $cpt_ega = 1;
386
387 my ($seq,$number_of_randomizations)=@_;
388 #my $str =$seq;
389 #my $mfe = RNA::fold($seq,$str);
390 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
391 my @rawfolds=split/\s+/,$rnafold;
392 my $str=$rawfolds[1];
393 my $mfe=$rawfolds[-1];
394 $mfe=~s/\(//;
395 $mfe=~s/\)//;
396
397 for (my $i=0;$i<$number_of_randomizations;$i++) {
398 $seq = shuffle_sequence_dinucleotide($seq);
399 #$str = $seq;
400
401 #my $rand_mfe = RNA::fold($str,$str);
402 $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
403 my @rawfolds=split/\s+/,$rnafold;
404 my $str=$rawfolds[1];
405 my $rand_mfe=$rawfolds[-1];
406 $rand_mfe=~s/\(//;
407 $rand_mfe=~s/\)//;
408
409 if ($rand_mfe < $mfe) {
410 $cpt_inf++;
411 }
412 if ($rand_mfe == $mfe) {
413 $cpt_ega++;
414 }
415 if ($rand_mfe > $mfe) {
416 $cpt_sup++;
417 }
418 }
419
420 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);
421
422 #print "$name\t$mfe\t$proba\n";
423 return $proba;
424 }
425
426 sub shuffle_sequence_dinucleotide {
427
428 my ($str) = @_;
429
430 # upper case and convert to ATGC
431 $str = uc($str);
432 $str =~ s/U/T/g;
433
434 my @nuc = ('A','T','G','C');
435 my $count_swap = 0;
436 # set maximum number of permutations
437 my $stop = length($str) * 10;
438
439 while($count_swap < $stop) {
440
441 my @pos;
442
443 # look start and end letters
444 my $firstnuc = $nuc[int(rand 4)];
445 my $thirdnuc = $nuc[int(rand 4)];
446
447 # get positions for matching nucleotides
448 for (my $i=0;$i<(length($str)-2);$i++) {
449 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
450 push (@pos,($i+1));
451 $i++;
452 }
453 }
454
455 # swap at random trinucleotides
456 my $max = scalar(@pos);
457 for (my $i=0;$i<$max;$i++) {
458 my $swap = int(rand($max));
459 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {
460 $count_swap++;
461 my $w1 = substr($str,$pos[$i],1);
462 my $w2 = substr($str,$pos[$swap],1);
463 substr($str,$pos[$i],1,$w2);
464 substr($str,$pos[$swap],1,$w1);
465 }
466 }
467 }
468 return($str);
469 }
470
471 sub test_nucleus_conservation{
472
473 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0
474
475 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
476 $nucleus=~tr/[T]/[U]/;
477 if($hash_nuclei{$nucleus}){return 1;}
478
479 return 0;
480 }
481
482
483
484 sub pass_filtering_initial{
485
486 #test if the structure forms a plausible hairpin
487 unless(pass_filtering_structure()){filter_p("structure problem"); return 0;}
488
489 #test if >90% of reads map to the hairpin in consistence with Dicer processing
490 unless(pass_filtering_signature()){filter_p("signature problem");return 0;}
491
492 return 1;
493
494 }
495
496
497 sub pass_filtering_signature{
498
499 #number of reads that map in consistence with Dicer processing
500 my $consistent=0;
501
502 #number of reads that map inconsistent with Dicer processing
503 my $inconsistent=0;
504
505 # number of potential star reads map in good consistence with Drosha/Dicer processing
506 # (3' overhangs relative to mature product)
507 my $star_perfect=0;
508
509 # number of potential star reads that do not map in good consistence with 3' overhang
510 my $star_fuzzy=0;
511
512
513 #sort queries (deep sequences) by their position on the hairpin
514 my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query;
515
516 foreach my $query(@queries){
517
518 #number of reads that the deep sequence represents
519 unless(defined($hash_query{$query}{"freq"})){next;}
520 my $query_freq=$hash_query{$query}{"freq"};
521
522 #test which Dicer product (if any) the deep sequence corresponds to
523 my $product=test_query($query);
524
525 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
526 if($product){$consistent+=$query_freq;}
527
528 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
529 else{$inconsistent+=$query_freq;}
530
531 #test a potential star sequence has good 3' overhang
532 if($product eq "star"){
533 if(test_star($query)){$star_perfect+=$query_freq;}
534 else{$star_fuzzy+=$query_freq;}
535 }
536 }
537
538 # if the majority of potential star sequences map in good accordance with 3' overhang
539 # score for the presence of star evidence
540 if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;}
541
542 #total number of reads mapping to the hairpin
543 my $freq=$consistent+$inconsistent;
544 $hash_comp{"freq"}=$freq;
545 unless($freq>0){filter_s("read frequency too low"); return 0;}
546
547 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
548 my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent);
549 unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;}
550
551 #the hairpin is retained
552 return 1;
553 }
554
555 sub test_star{
556
557 #test if a deep sequence maps in good consistence with 3' overhang
558
559 my $query=shift;
560
561 #5' begin and 3' end positions
562 my $beg=$hash_query{$query}{"subject_beg"};
563 my $end=$hash_query{$query}{"subject_end"};
564
565 #the difference between observed and expected begin positions must be 0 or 1
566 my $offset=$beg-$hash_comp{"star_beg"};
567 if($offset==0 or $offset==1 or $offset==-1){return 1;}
568
569 return 0;
570 }
571
572
573
574 sub test_query{
575
576 #test if deep sequence maps in consistence with Dicer processing
577
578 my $query=shift;
579
580 #begin, end, strand and read count
581 my $beg=$hash_query{$query}{"subject_beg"};
582 my $end=$hash_query{$query}{"subject_end"};
583 my $strand=$hash_query{$query}{"strand"};
584 my $freq=$hash_query{$query}{"freq"};
585
586 #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs)
587 if($strand eq '-'){return 0;}
588
589 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
590 my $fuzz_beg=2;
591 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
592 my $fuzz_end=2;
593
594 #if in accordance with Dicer processing, return the type of Dicer product
595 if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";}
596 if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";}
597 if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";}
598
599 #if not in accordance, return 0
600 return 0;
601 }
602
603
604 sub pass_filtering_structure{
605
606 #The potential precursor must form a hairpin with miRNA precursor-like characteristics
607
608 #return value
609 my $ret=1;
610
611 #potential mature, star, loop and lower flank parts must be identifiable
612 unless(test_components()){return 0;}
613
614 #no bifurcations
615 unless(no_bifurcations_precursor()){$ret=0;}
616
617 #minimum 14 base pairings in duplex
618 unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");}
619
620 #not more than 6 nt difference between mature and star length
621 unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") }
622
623 return $ret;
624 }
625
626
627
628
629
630
631 sub test_components{
632
633 #tests whether potential mature, star, loop and lower flank parts are identifiable
634
635 unless($hash_comp{"mature_struct"}){
636 filter_s("no mature");
637 # print STDERR "no mature\n";
638 return 0;
639 }
640
641 unless($hash_comp{"star_struct"}){
642 filter_s("no star");
643 # print STDERR "no star\n";
644 return 0;
645 }
646
647 unless($hash_comp{"loop_struct"}){
648 filter_s("no loop");
649 # print STDERR "no loop\n";
650 return 0;
651 }
652
653 unless($hash_comp{"flank_first_struct"}){
654 filter_s("no flanks");
655 #print STDERR "no flanks_first_struct\n";
656 return 0;
657 }
658
659 unless($hash_comp{"flank_second_struct"}){
660 filter_s("no flanks");
661 # print STDERR "no flanks_second_struct\n";
662 return 0;
663 }
664 return 1;
665 }
666
667
668
669
670
671 sub no_bifurcations_precursor{
672
673 #tests whether there are bifurcations in the hairpin
674
675 #assembles the potential precursor sequence and structure from the expected Dicer products
676 #this is the expected biological precursor, in contrast with 'pri_seq' that includes
677 #some genomic flanks on both sides
678
679 my $pre_struct;
680 my $pre_seq;
681 if($hash_comp{"mature_arm"} eq "first"){
682 $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"};
683 $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"};
684 }else{
685 $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"};
686 $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"};
687 }
688
689 #read into hash
690 $hash_comp{"pre_struct"}=$pre_struct;
691 $hash_comp{"pre_seq"}=$pre_seq;
692
693 #simple pattern matching checks for bifurcations
694 unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){
695 filter_s("bifurcation in precursor");
696 # print STDERR "bifurcation in precursor\n";
697 return 0;
698 }
699
700 return 1;
701 }
702
703 sub bp_precursor{
704
705 #total number of bps in the precursor
706
707 my $pre_struct=$hash_comp{"pre_struct"};
708
709 #simple pattern matching
710 my $pre_bps=0;
711 while($pre_struct=~/\(/g){
712 $pre_bps++;
713 }
714 return $pre_bps;
715 }
716
717
718 sub bp_duplex{
719
720 #total number of bps in the duplex
721
722 my $duplex_bps=0;
723 my $mature_struct=$hash_comp{"mature_struct"};
724
725 #simple pattern matching
726 while($mature_struct=~/(\(|\))/g){
727 $duplex_bps++;
728 }
729 return $duplex_bps;
730 }
731
732 sub diff_lng{
733
734 #find difference between mature and star lengths
735
736 my $mature_lng=length $hash_comp{"mature_struct"};
737 my $star_lng=length $hash_comp{"star_struct"};
738 my $diff_lng=$mature_lng-$star_lng;
739 return $diff_lng;
740 }
741
742
743
744 sub do_test_assemble{
745
746 # not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks)
747 # is identical to 'pri_struct' before disassembly into parts
748
749 my $assemble_struct;
750
751 if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){
752 if($hash_comp{"mature_arm"} eq "first"){
753 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"};
754 }else{
755 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"};
756 }
757 unless($assemble_struct eq $hash_comp{"pri_struct"}){
758 $hash_comp{"test_assemble"}=$assemble_struct;
759 print_hash_comp();
760 }
761 }
762 return;
763 }
764
765
766
767 sub fill_structure{
768
769 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired
770
771 my $struct=$hash_struct{$subject_old};
772 my $lng=length $struct;
773
774 #local stack for keeping track of basepairings
775 my @bps;
776
777 for(my $pos=1;$pos<=$lng;$pos++){
778 my $struct_pos=excise_struct($struct,$pos,$pos,"+");
779
780 if($struct_pos eq "("){
781 push(@bps,$pos);
782 }
783
784 if($struct_pos eq ")"){
785 my $pos_prev=pop(@bps);
786 $hash_bp{$pos_prev}=$pos;
787 $hash_bp{$pos}=$pos_prev;
788 }
789 }
790 return;
791 }
792
793
794
795 sub fill_star{
796
797 #fills specifics on the expected star strand into 'comp' hash ('component' hash)
798
799 #if the mature sequence is not plausible, don't look for the star arm
800 my $mature_arm=$hash_comp{"mature_arm"};
801 unless($mature_arm){$hash_comp{"star_arm"}=0; return;}
802
803 #if the star sequence is not plausible, don't fill into the hash
804 my($star_beg,$star_end)=find_star();
805 my $star_arm=arm_star($star_beg,$star_end);
806 unless($star_arm){return;}
807
808 #excise expected star sequence and structure
809 my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+");
810 my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+");
811
812 #fill into hash
813 $hash_comp{"star_beg"}=$star_beg;
814 $hash_comp{"star_end"}=$star_end;
815 $hash_comp{"star_seq"}=$star_seq;
816 $hash_comp{"star_struct"}=$star_struct;
817 $hash_comp{"star_arm"}=$star_arm;
818
819 return;
820 }
821
822
823 sub find_star{
824
825 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions
826
827 #the -2 is for the overhang
828 my $mature_beg=$hash_comp{"mature_beg"};
829 my $mature_end=$hash_comp{"mature_end"}-2;
830 my $mature_lng=$mature_end-$mature_beg+1;
831
832 #in some cases, the last nucleotide of the mature sequence does not form a base pair,
833 #and therefore does not basepair with the first nucleotide of the star sequence.
834 #In this case, the algorithm searches for the last nucleotide of the mature sequence
835 #to form a base pair. The offset is the number of nucleotides searched through.
836 my $offset_star_beg=0;
837 my $offset_beg=0;
838
839 #the offset should not be longer than the length of the mature sequence, then it
840 #means that the mature sequence does not form any base pairs
841 while(!$offset_star_beg and $offset_beg<$mature_lng){
842 if($hash_bp{$mature_end-$offset_beg}){
843 $offset_star_beg=$hash_bp{$mature_end-$offset_beg};
844 }else{
845 $offset_beg++;
846 }
847 }
848 #when defining the beginning of the star sequence, compensate for the offset
849 my $star_beg=$offset_star_beg-$offset_beg;
850
851 #same as above
852 my $offset_star_end=0;
853 my $offset_end=0;
854 while(!$offset_star_end and $offset_end<$mature_lng){
855 if($hash_bp{$mature_beg+$offset_end}){
856 $offset_star_end=$hash_bp{$mature_beg+$offset_end};
857 }else{
858 $offset_end++;
859 }
860 }
861 #the +2 is for the overhang
862 my $star_end=$offset_star_end+$offset_end+2;
863
864 return($star_beg,$star_end);
865 }
866
867
868 sub fill_pri{
869
870 #fills basic specifics on the precursor into the 'comp' hash
871
872 my $seq=$hash_seq{$subject_old};
873 my $struct=$hash_struct{$subject_old};
874 my $mfe=$hash_mfe{$subject_old};
875 my $length=length $seq;
876
877 $hash_comp{"pri_id"}=$subject_old;
878 $hash_comp{"pri_seq"}=$seq;
879 $hash_comp{"pri_struct"}=$struct;
880 $hash_comp{"pri_mfe"}=$mfe;
881 $hash_comp{"pri_beg"}=1;
882 $hash_comp{"pri_end"}=$length;
883
884 return;
885 }
886
887
888 sub fill_mature{
889
890 #fills specifics on the mature sequence into the 'comp' hash
891
892 my $mature_query=find_mature_query();
893 my($mature_beg,$mature_end)=find_positions_query($mature_query);
894 my $mature_strand=find_strand_query($mature_query);
895 my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand);
896 my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand);
897 my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand);
898
899 $hash_comp{"mature_query"}=$mature_query;
900 $hash_comp{"mature_beg"}=$mature_beg;
901 $hash_comp{"mature_end"}=$mature_end;
902 $hash_comp{"mature_strand"}=$mature_strand;
903 $hash_comp{"mature_struct"}=$mature_struct;
904 $hash_comp{"mature_seq"}=$mature_seq;
905 $hash_comp{"mature_arm"}=$mature_arm;
906
907 return;
908 }
909
910
911
912 sub fill_loop{
913
914 #fills specifics on the loop sequence into the 'comp' hash
915
916 #unless both mature and star sequences are plausible, do not look for the loop
917 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
918
919 my $loop_beg;
920 my $loop_end;
921
922 #defining the begin and end positions of the loop from the mature and star positions
923 #excision depends on whether the mature or star sequence is 5' of the loop ('first')
924 if($hash_comp{"mature_arm"} eq "first"){
925 $loop_beg=$hash_comp{"mature_end"}+1;
926 }else{
927 $loop_end=$hash_comp{"mature_beg"}-1;
928 }
929
930 if($hash_comp{"star_arm"} eq "first"){
931 $loop_beg=$hash_comp{"star_end"}+1;
932 }else{
933 $loop_end=$hash_comp{"star_beg"}-1;
934 }
935
936 #unless the positions are plausible, do not fill into hash
937 unless(test_loop($loop_beg,$loop_end)){return;}
938
939 my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+");
940 my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+");
941
942 $hash_comp{"loop_beg"}=$loop_beg;
943 $hash_comp{"loop_end"}=$loop_end;
944 $hash_comp{"loop_seq"}=$loop_seq;
945 $hash_comp{"loop_struct"}=$loop_struct;
946
947 return;
948 }
949
950
951 sub fill_lower_flanks{
952
953 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash
954
955 #unless both mature and star sequences are plausible, do not look for the flanks
956 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
957
958 my $flank_first_end;
959 my $flank_second_beg;
960
961 #defining the begin and end positions of the flanks from the mature and star positions
962 #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first')
963 if($hash_comp{"mature_arm"} eq "first"){
964 $flank_first_end=$hash_comp{"mature_beg"}-1;
965 }else{
966 $flank_second_beg=$hash_comp{"mature_end"}+1;
967 }
968
969 if($hash_comp{"star_arm"} eq "first"){
970 $flank_first_end=$hash_comp{"star_beg"}-1;
971 }else{
972 $flank_second_beg=$hash_comp{"star_end"}+1;
973 }
974
975 #unless the positions are plausible, do not fill into hash
976 unless(test_flanks($flank_first_end,$flank_second_beg)){return;}
977
978 $hash_comp{"flank_first_end"}=$flank_first_end;
979 $hash_comp{"flank_second_beg"}=$flank_second_beg;
980 $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
981 $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
982 $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
983 $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
984
985 if($options{z}){
986 fill_stems_drosha();
987 }
988
989 return;
990 }
991
992
993 sub fill_stems_drosha{
994
995 #scores the number of base pairings formed by the first ten nt of the lower stems
996 #in general, the more stems, the higher the score contribution
997 #warning: this options has not been thoroughly tested
998
999 my $flank_first_struct=$hash_comp{"flank_first_struct"};
1000 my $flank_second_struct=$hash_comp{"flank_second_struct"};
1001
1002 my $stem_first=substr($flank_first_struct,-10);
1003 my $stem_second=substr($flank_second_struct,0,10);
1004
1005 my $stem_bp_first=0;
1006 my $stem_bp_second=0;
1007
1008 #find base pairings by simple pattern matching
1009 while($stem_first=~/\(/g){
1010 $stem_bp_first++;
1011 }
1012
1013 while($stem_second=~/\)/g){
1014 $stem_bp_second++;
1015 }
1016
1017 my $stem_bp=min2($stem_bp_first,$stem_bp_second);
1018
1019 $hash_comp{"stem_first"}=$stem_first;
1020 $hash_comp{"stem_second"}=$stem_second;
1021 $hash_comp{"stem_bp_first"}=$stem_bp_first;
1022 $hash_comp{"stem_bp_second"}=$stem_bp_second;
1023 $hash_comp{"stem_bp"}=$stem_bp;
1024
1025 return;
1026 }
1027
1028
1029
1030
1031 sub arm_mature{
1032
1033 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
1034
1035 my ($beg,$end,$strand)=@_;
1036
1037 #mature and star sequences should alway be on plus strand
1038 if($strand eq "-"){return 0;}
1039
1040 #there should be no bifurcations and minimum one base pairing
1041 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
1042 if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){
1043 return "first";
1044 }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){
1045 return "second";
1046 }
1047 return 0;
1048 }
1049
1050
1051 sub arm_star{
1052
1053 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
1054
1055 my ($beg,$end)=@_;
1056
1057 #unless the begin and end positions are plausible, test negative
1058 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
1059
1060 #no overlap between the mature and the star sequence
1061 if($hash_comp{"mature_arm"} eq "first"){
1062 ($hash_comp{"mature_end"}<$beg) or return 0;
1063 }elsif($hash_comp{"mature_arm"} eq "second"){
1064 ($end<$hash_comp{"mature_beg"}) or return 0;
1065 }
1066
1067 #there should be no bifurcations and minimum one base pairing
1068 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+");
1069 if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){
1070 return "first";
1071 }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){
1072 return "second";
1073 }
1074 return 0;
1075 }
1076
1077
1078 sub test_loop{
1079
1080 #tests the loop positions
1081
1082 my ($beg,$end)=@_;
1083
1084 #unless the begin and end positions are plausible, test negative
1085 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
1086
1087 return 1;
1088 }
1089
1090
1091 sub test_flanks{
1092
1093 #tests the positions of the lower flanks
1094
1095 my ($beg,$end)=@_;
1096
1097 #unless the begin and end positions are plausible, test negative
1098 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
1099
1100 return 1;
1101 }
1102
1103
1104 sub comp{
1105
1106 #subroutine to retrive from the 'comp' hash
1107
1108 my $type=shift;
1109 my $component=$hash_comp{$type};
1110 return $component;
1111 }
1112
1113
1114 sub find_strand_query{
1115
1116 #subroutine to find the strand for a given query
1117
1118 my $query=shift;
1119 my $strand=$hash_query{$query}{"strand"};
1120 return $strand;
1121 }
1122
1123
1124 sub find_positions_query{
1125
1126 #subroutine to find the begin and end positions for a given query
1127
1128 my $query=shift;
1129 my $beg=$hash_query{$query}{"subject_beg"};
1130 my $end=$hash_query{$query}{"subject_end"};
1131 return ($beg,$end);
1132 }
1133
1134
1135
1136 sub find_mature_query{
1137
1138 #finds the query with the highest frequency of reads and returns it
1139 #is used to determine the positions of the potential mature sequence
1140
1141 my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query;
1142 my $mature_query=$queries[0];
1143 return $mature_query;
1144 }
1145
1146
1147
1148
1149 sub reset_variables{
1150
1151 #resets the hashes for the next potential precursor
1152
1153 # %hash_query=();
1154 # %hash_comp=();
1155 # %hash_bp=();
1156 foreach my $key (keys %hash_query) {delete($hash_query{$key});}
1157 foreach my $key (keys %hash_comp) {delete($hash_comp{$key});}
1158 foreach my $key (keys %hash_bp) {delete($hash_bp{$key});}
1159
1160 # $message_filter=();
1161 # $message_score=();
1162 # $lines=();
1163 undef($message_filter);
1164 undef($message_score);
1165 undef($lines);
1166 return;
1167 }
1168
1169
1170
1171 sub excise_seq{
1172
1173 #excise sub sequence from the potential precursor
1174
1175 my($seq,$beg,$end,$strand)=@_;
1176
1177 #begin can be equal to end if only one nucleotide is excised
1178 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
1179
1180 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
1181 #this happens once appr. every two thousand combinations
1182 unless($beg<=length($seq)){$out_of_bound++;return 0;}
1183
1184 #if on the minus strand, the reverse complement should be excised
1185 if($strand eq "-"){$seq=revcom($seq);}
1186
1187 #the blast parsed format is 1-indexed, substr is 0-indexed
1188 my $sub_seq=substr($seq,$beg-1,$end-$beg+1);
1189
1190 return $sub_seq;
1191
1192 }
1193
1194 sub excise_struct{
1195
1196 #excise sub structure
1197
1198 my($struct,$beg,$end,$strand)=@_;
1199 my $lng=length $struct;
1200
1201 #begin can be equal to end if only one nucleotide is excised
1202 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
1203
1204 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
1205 #this happens once appr. every two thousand combinations
1206 unless($beg<=length($struct)){return 0;}
1207
1208 #if excising relative to minus strand, positions are reversed
1209 if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}
1210
1211 #the blast parsed format is 1-indexed, substr is 0-indexed
1212 my $sub_struct=substr($struct,$beg-1,$end-$beg+1);
1213
1214 return $sub_struct;
1215 }
1216
1217
1218 sub create_hash_nuclei{
1219 #parses a fasta file with sequences of known miRNAs considered for conservation purposes
1220 #reads the nuclei into a hash
1221
1222 my ($file) = @_;
1223 my ($id, $desc, $sequence, $nucleus) = ();
1224
1225 open (FASTA, "<$file") or die "can not open $file\n";
1226 while (<FASTA>)
1227 {
1228 chomp;
1229 if (/^>(\S+)(.*)/)
1230 {
1231 $id = $1;
1232 $desc = $2;
1233 $sequence = "";
1234 $nucleus = "";
1235 while (<FASTA>){
1236 chomp;
1237 if (/^>(\S+)(.*)/){
1238 $nucleus = substr($sequence,1,$nucleus_lng);
1239 $nucleus =~ tr/[T]/[U]/;
1240 $hash_mirs{$nucleus} .="$id\t";
1241 $hash_nuclei{$nucleus} += 1;
1242
1243 $id = $1;
1244 $desc = $2;
1245 $sequence = "";
1246 $nucleus = "";
1247 next;
1248 }
1249 $sequence .= $_;
1250 }
1251 }
1252 }
1253 $nucleus = substr($sequence,1,$nucleus_lng);
1254 $nucleus =~ tr/[T]/[U]/;
1255 $hash_mirs{$nucleus} .="$id\t";
1256 $hash_nuclei{$nucleus} += 1;
1257 close FASTA;
1258 }
1259
1260
1261 sub parse_file_struct{
1262 #parses the output from RNAfoldand reads it into hashes
1263 my($file) = @_;
1264 my($id,$desc,$seq,$struct,$mfe) = ();
1265 open (FILE_STRUCT, "<$file") or die "can not open $file\n";
1266 while (<FILE_STRUCT>){
1267 chomp;
1268 if (/^>(\S+)\s*(.*)/){
1269 $id= $1;
1270 $desc= $2;
1271 $seq= "";
1272 $struct= "";
1273 $mfe= "";
1274 while (<FILE_STRUCT>){
1275 chomp;
1276 if (/^>(\S+)\s*(.*)/){
1277 $hash_desc{$id} = $desc;
1278 $hash_seq{$id} = $seq;
1279 $hash_struct{$id} = $struct;
1280 $hash_mfe{$id} = $mfe;
1281 $id = $1;
1282 $desc = $2;
1283 $seq = "";
1284 $struct = "";
1285 $mfe = "";
1286 next;
1287 }
1288 if(/^\w/){
1289 tr/uU/tT/;
1290 $seq .= $_;
1291 next;
1292 }
1293 if(/((\.|\(|\))+)/){$struct .=$1;}
1294 if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;}
1295 }
1296 }
1297 }
1298 $hash_desc{$id} = $desc;
1299 $hash_seq{$id} = $seq;
1300 $hash_struct{$id} = $struct;
1301 $hash_mfe{$id} = $mfe;
1302 close FILE_STRUCT;
1303 return;
1304 }
1305
1306
1307 sub score_s{
1308
1309 #this score message is appended to the end of the string of score messages outputted for the potential precursor
1310
1311 my $message=shift;
1312 $message_score.=$message."\n";;
1313 return;
1314 }
1315
1316
1317
1318 sub score_p{
1319
1320 #this score message is appended to the beginning of the string of score messages outputted for the potential precursor
1321
1322 my $message=shift;
1323 $message_score=$message."\n".$message_score;
1324 return;
1325 }
1326
1327
1328
1329 sub filter_s{
1330
1331 #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor
1332
1333 my $message=shift;
1334 $message_filter.=$message."\n";
1335 return;
1336 }
1337
1338
1339 sub filter_p{
1340
1341 #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor
1342
1343 my $message=shift;
1344 if(defined $message_filter){$message_filter=$message."\n".$message_filter;}
1345 else{$message_filter=$message."\n";}
1346 return;
1347 }
1348
1349
1350 sub find_freq{
1351
1352 #finds the frequency of a given read query from its id.
1353
1354 my($query)=@_;
1355
1356 if($query=~/x(\d+)/i){
1357 my $freq=$1;
1358 return $freq;
1359 }else{
1360 #print STDERR "Problem with read format\n";
1361 return 0;
1362 }
1363 }
1364
1365
1366 sub print_hash_comp{
1367
1368 #prints the 'comp' hash
1369
1370 my @keys=sort keys %hash_comp;
1371 foreach my $key(@keys){
1372 my $value=$hash_comp{$key};
1373 print "$key \t$value\n";
1374 }
1375 }
1376
1377
1378
1379 sub print_hash_bp{
1380
1381 #prints the 'bp' hash
1382
1383 my @keys=sort {$a<=>$b} keys %hash_bp;
1384 foreach my $key(@keys){
1385 my $value=$hash_bp{$key};
1386 print "$key\t$value\n";
1387 }
1388 print "\n";
1389 }
1390
1391
1392
1393 sub find_strand{
1394
1395 #A subroutine to find the strand, parsing different blast formats
1396
1397 my($other)=@_;
1398
1399 my $strand="+";
1400
1401 if($other=~/-/){
1402 $strand="-";
1403 }
1404
1405 if($other=~/minus/i){
1406 $strand="-";
1407 }
1408 return($strand);
1409 }
1410
1411
1412 sub contained{
1413
1414 #Is the stretch defined by the first positions contained in the stretch defined by the second?
1415
1416 my($beg1,$end1,$beg2,$end2)=@_;
1417
1418 testbeginend($beg1,$end1,$beg2,$end2);
1419
1420 if($beg2<=$beg1 and $end1<=$end2){
1421 return 1;
1422 }else{
1423 return 0;
1424 }
1425 }
1426
1427
1428 sub testbeginend{
1429
1430 #Are the beginposition numerically smaller than the endposition for each pair?
1431
1432 my($begin1,$end1,$begin2,$end2)=@_;
1433
1434 unless($begin1<=$end1 and $begin2<=$end2){
1435 print STDERR "beg can not be larger than end for $subject_old\n";
1436 exit;
1437 }
1438 }
1439
1440
1441 sub rev_pos{
1442
1443 # The blast_parsed format always uses positions that are relative to the 5' of the given strand
1444 # This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with
1445 # the n't nucleotide on the plus strand
1446
1447 # This subroutine reverses the begin and end positions of positions of the minus strand so that they
1448 # are relative to the 5' end of the plus strand
1449
1450 my($beg,$end,$lng)=@_;
1451
1452 my $new_end=$lng-$beg+1;
1453 my $new_beg=$lng-$end+1;
1454
1455 return($new_beg,$new_end);
1456 }
1457
1458 sub round {
1459
1460 #rounds to nearest integer
1461
1462 my($number) = shift;
1463 return int($number + .5);
1464
1465 }
1466
1467
1468 sub rev{
1469
1470 #reverses the order of nucleotides in a sequence
1471
1472 my($sequence)=@_;
1473
1474 my $rev=reverse $sequence;
1475
1476 return $rev;
1477 }
1478
1479 sub com{
1480
1481 #the complementary of a sequence
1482
1483 my($sequence)=@_;
1484
1485 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
1486
1487 return $sequence;
1488 }
1489
1490 sub revcom{
1491
1492 #reverse complement
1493
1494 my($sequence)=@_;
1495
1496 my $revcom=rev(com($sequence));
1497
1498 return $revcom;
1499 }
1500
1501
1502 sub max2 {
1503
1504 #max of two numbers
1505
1506 my($a, $b) = @_;
1507 return ($a>$b ? $a : $b);
1508 }
1509
1510 sub min2 {
1511
1512 #min of two numbers
1513
1514 my($a, $b) = @_;
1515 return ($a<$b ? $a : $b);
1516 }
1517
1518
1519
1520 sub score_freq{
1521
1522 # scores the count of reads that map to the potential precursor
1523 # Assumes geometric distribution as described in methods section of manuscript
1524
1525 my $freq=shift;
1526
1527 #parameters of known precursors and background hairpins
1528 my $parameter_test=0.999;
1529 my $parameter_control=0.6;
1530
1531 #log_odds calculated directly to avoid underflow
1532 my $intercept=log((1-$parameter_test)/(1-$parameter_control));
1533 my $slope=log($parameter_test/$parameter_control);
1534 my $log_odds=$slope*$freq+$intercept;
1535
1536 #if no strong evidence for 3' overhangs, limit the score contribution to 0
1537 unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}
1538
1539 return $log_odds;
1540 }
1541
1542
1543
1544 ##sub score_mfe{
1545
1546 # scores the minimum free energy in kCal/mol of the potential precursor
1547 # Assumes Gumbel distribution as described in methods section of manuscript
1548
1549 ## my $mfe=shift;
1550
1551 #numerical value, minimum 1
1552 ## my $mfe_adj=max2(1,-$mfe);
1553
1554 #parameters of known precursors and background hairpins, scale and location
1555 ## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
1556 ## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
1557
1558 ## my $odds=$prob_test/$prob_background;
1559 ## my $log_odds=log($odds);
1560
1561 ## return $log_odds;
1562 ##}
1563
1564 sub score_mfe{
1565 # use bignum;
1566
1567 # scores the minimum free energy in kCal/mol of the potential precursor
1568 # Assumes Gumbel distribution as described in methods section of manuscript
1569
1570 my ($mfe,$mlng)=@_;
1571
1572 #numerical value, minimum 1
1573 my $mfe_adj=max2(1,-$mfe);
1574 my $mfe_adj1=$mfe/$mlng;
1575 #parameters of known precursors and background hairpins, scale and location
1576 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
1577 my $ev=$e**($mfe_adj1*$c);
1578 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
1579 my $log_odds=($a/($b+$ev));
1580
1581
1582 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
1583 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
1584
1585 my $odds=$prob_test/$prob_background;
1586 my $log_odds_2=log($odds);
1587 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
1588 return $log_odds;
1589 }
1590
1591
1592
1593 sub prob_gumbel_discretized{
1594
1595 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
1596 # uses the subroutine that calculates the cdf to find the probabilities
1597
1598 my ($var,$scale,$location)=@_;
1599
1600 my $bound_lower=$var-0.5;
1601 my $bound_upper=$var+0.5;
1602
1603 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
1604 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);
1605
1606 my $prob=$cdf_upper-$cdf_lower;
1607
1608 return $prob;
1609 }
1610
1611
1612 sub cdf_gumbel{
1613
1614 # calculates the cumulative distribution function of the Gumbel distribution
1615
1616 my ($var,$scale,$location)=@_;
1617
1618 my $cdf=$e**(-($e**(-($var-$location)/$scale)));
1619
1620 return $cdf;
1621 }
1622