comparison miRDeep_plant.pl @ 32:b3f9565b30b4 draft

Uploaded
author big-tiandm
date Thu, 31 Jul 2014 03:07:30 -0400
parents dc5a29826c7d
children 0c4e11018934
comparison
equal deleted inserted replaced
31:7321a6f82492 32:b3f9565b30b4
1 #!/usr/bin/perl
2
3 use warnings;
4 use strict;
5 use Getopt::Std;
6
7
8
9 ################################# MIRDEEP #################################################
10
11 ################################## USAGE ##################################################
12
13
14 my $usage=
15 "$0 file_signature file_structure temp_out_directory
16
17 This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with
18 information on the positions of reads aligned to potential precursor sequences (signature).
19 It also takes as input an RNAfold output file, giving information on the sequence, structure
20 and mimimum free energy of the potential precursor sequences.
21
22 Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA
23 sequences that should be considered for conservation purposes. -t prints out the potential
24 precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that
25 exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors
26 that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences
27 obtained through conventional cloning. -z consider the number of base pairings in the lower
28 stems (this option is not well tested).
29
30 -h print this usage
31 -s fasta file with known miRNAs
32 #-o temp directory ,maked befor running the program.
33 -t print filtered
34 -u limited output (only ids)
35 -v cut-off (default 1)
36 -x sensitive option for Sanger sequences
37 -y use Randfold
38 -z consider Drosha processing
39 ";
40
41
42
43
44
45 ############################################################################################
46
47 ################################### INPUT ##################################################
48
49
50 #signature file in blast_parsed format
51 my $file_blast_parsed=shift or die $usage;
52
53 #structure file outputted from RNAfold
54 my $file_struct=shift or die $usage;
55
56 my $tmpdir=shift or die $usage;
57 #options
58 my %options=();
59 getopts("hs:tuv:xyz",\%options);
60
61
62
63
64
65
66 #############################################################################################
67
68 ############################# GLOBAL VARIABLES ##############################################
69
70
71 #parameters
72 my $nucleus_lng=11;
73
74 my $score_star=3.9;
75 my $score_star_not=-1.3;
76 my $score_nucleus=7.63;
77 my $score_nucleus_not=-1.17;
78 my $score_randfold=1.37;
79 my $score_randfold_not=-3.624;
80 my $score_intercept=0.3;
81 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
82 my $score_min=1;
83 if($options{v}){$score_min=$options{v};}
84 if($options{x}){$score_min=-5;}
85
86 my $e=2.718281828;
87
88 #hashes
89 my %hash_desc;
90 my %hash_seq;
91 my %hash_struct;
92 my %hash_mfe;
93 my %hash_nuclei;
94 my %hash_mirs;
95 my %hash_query;
96 my %hash_comp;
97 my %hash_bp;
98
99 #other variables
100 my $subject_old;
101 my $message_filter;
102 my $message_score;
103 my $lines;
104 my $out_of_bound;
105
106
107
108 ##############################################################################################
109
110 ################################ MAIN ######################################################
111
112
113 #print help if that option is used
114 if($options{h}){die $usage;}
115 unless ($tmpdir=~/\/$/) {$tmpdir .="/";}
116 if(!(-s $tmpdir)){mkdir $tmpdir;}
117 $tmpdir .="TMP_DIR/";
118 mkdir $tmpdir;
119
120 #parse structure file outputted from RNAfold
121 parse_file_struct($file_struct);
122
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed
124 if($options{s}){create_hash_nuclei($options{s})};
125
126 #parse signature file in blast_parsed format and resolve each potential precursor
127 parse_file_blast_parsed($file_blast_parsed);
128 `rm -rf $tmpdir`;
129 exit;
130
131
132
133
134 ##############################################################################################
135
136 ############################## SUBROUTINES ###################################################
137
138
139
140 sub parse_file_blast_parsed{
141
142 # read through the signature blastparsed file, fills up a hash with information on queries
143 # (deep sequences) mapping to the current subject (potential precursor) and resolve each
144 # potential precursor in turn
145
146 my $file_blast_parsed=shift;
147
148 open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n";
149 while (my $line=<FILE_BLAST_PARSED>){
150 if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){
151 my $query=$1;
152 my $query_lng=$2;
153 my $query_beg=$3;
154 my $query_end=$4;
155 my $subject=$5;
156 my $subject_lng=$6;
157 my $subject_beg=$7;
158 my $subject_end=$8;
159 my $e_value=$9;
160 my $pid=$10;
161 my $bitscore=$11;
162 my $other=$12;
163
164 #if the new line concerns a new subject (potential precursor) then the old subject must be resolved
165 if($subject_old and $subject_old ne $subject){
166 resolve_potential_precursor();
167 }
168
169 #resolve the strand
170 my $strand=find_strand($other);
171
172 #resolve the number of reads that the deep sequence represents
173 my $freq=find_freq($query);
174
175 #read information of the query (deep sequence) into hash
176 $hash_query{$query}{"subject_beg"}=$subject_beg;
177 $hash_query{$query}{"subject_end"}=$subject_end;
178 $hash_query{$query}{"strand"}=$strand;
179 $hash_query{$query}{"freq"}=$freq;
180
181 #save the signature information
182 $lines.=$line;
183
184 $subject_old=$subject;
185 }
186 }
187 resolve_potential_precursor();
188 }
189
190 sub resolve_potential_precursor{
191
192 # dissects the potential precursor in parts by filling hashes, and tests if it passes the
193 # initial filter and the scoring filter
194
195 # binary variable whether the potential precursor is still viable
196 my $ret=1;
197 #print STDERR ">$subject_old\n";
198
199 fill_structure();
200 #print STDERR "\%hash_bp",scalar keys %hash_bp,"\n";
201 fill_pri();
202 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
203
204 fill_mature();
205 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
206
207 fill_star();
208 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
209
210 fill_loop();
211 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
212
213 fill_lower_flanks();
214 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
215
216 # do_test_assemble();
217
218 # this is the actual classification
219 unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;}
220
221 print_results($ret);
222
223 reset_variables();
224
225 return;
226
227 }
228
229
230
231 sub print_results{
232
233 my $ret=shift;
234
235 # print out if the precursor is accepted and accepted precursors should be printed out
236 # or if the potential precursor is discarded and discarded potential precursors should
237 # be printed out
238
239 if((!$options{t} and $ret) or ($options{t} and !$ret)){
240 #full output
241 unless($options{u}){
242 if($message_filter){print $message_filter;}
243 if($message_score){print $message_score;}
244 print_hash_comp();
245 print $lines,"\n\n";
246 return;
247 }
248 #limited output (only ids)
249 my $id=$hash_comp{"pri_id"};
250 print "$id\n";
251 }
252 }
253
254
255
256
257
258
259
260 sub pass_threshold_score{
261
262 # this is the scoring
263
264 #minimum free energy of the potential precursor
265 # my $score_mfe=score_mfe($hash_comp{"pri_mfe"});
266 my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"});
267
268 #count of reads that map in accordance with Dicer processing
269 my $score_freq=score_freq($hash_comp{"freq"});
270 #print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n";
271
272 #basic score
273 my $score=$score_mfe+$score_freq;
274
275 #scoring of conserved nucleus/seed (optional)
276 if($options{s}){
277
278 #if the nucleus is conserved
279 if(test_nucleus_conservation()){
280
281 #nucleus from position 2-8
282 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
283
284 #resolve DNA/RNA ambiguities
285 $nucleus=~tr/[T]/[U]/;
286
287 #print score contribution
288 score_s("score_nucleus\t$score_nucleus");
289
290 #print the ids of known miRNAs with same nucleus
291 score_s("$hash_mirs{$nucleus}");
292 #print STDERR "score_nucleus\t$score_nucleus\n";
293
294 #add to score
295 $score+=$score_nucleus;
296
297 #if the nucleus is not conserved
298 }else{
299 #print (negative) score contribution
300 score_s("score_nucleus\t$score_nucleus_not");
301
302 #add (negative) score contribution
303 $score+=$score_nucleus_not;
304 }
305 }
306
307 #if the majority of potential star reads fall as expected from Dicer processing
308 if($hash_comp{"star_read"}){
309 score_s("score_star\t$score_star");
310 #print STDERR "score_star\t$score_star\n";
311 $score+=$score_star;
312 }else{
313 score_s("score_star\t$score_star_not");
314 #print STDERR "score_star_not\t$score_star_not\n";
315 $score+=$score_star_not;
316 }
317
318 #score lower stems for potential for Drosha recognition (highly optional)
319 if($options{z}){
320 my $stem_bp=$hash_comp{"stem_bp"};
321 my $score_stem=$scores_stem[$stem_bp];
322 $score+=$score_stem;
323 score_s("score_stem\t$score_stem");
324 }
325
326 #print STDERR "score_intercept\t$score_intercept\n";
327
328 $score+=$score_intercept;
329
330 #score for randfold (optional)
331 if($options{y}){
332
333 # only calculate randfold value if it can make the difference between the potential precursor
334 # being accepted or discarded
335 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){
336
337 #randfold value<0.05
338 if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");}
339
340 #randfold value>0.05
341 else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");}
342 }
343 }
344
345 #round off values to one decimal
346 my $round_mfe=round($score_mfe*10)/10;
347 my $round_freq=round($score_freq*10)/10;
348 my $round=round($score*10)/10;
349
350 #print scores
351 score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round");
352
353 #return 1 if the potential precursor is accepted, return 0 if discarded
354 unless($score>=$score_min){return 0;}
355 return 1;
356 }
357
358 sub test_randfold{
359
360 #print sequence to temporary file, test randfold value, return 1 or 0
361
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
363 my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
364 open(FILE, ">$tmpfile");
365 print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
366 close FILE;
367
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
369 my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
370 my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
371 my $p_value=($p1+$p2)/2;
372 wait;
373 # system "rm $tmpfile";
374
375 if($p_value<=0.05){return 1;}
376
377 return 0;
378 }
379
380
381 #sub print_file{
382
383 #print string to file
384
385 # my($file,$string)=@_;
386
387 # open(FILE, ">$file");
388 # print FILE "$string";
389 # close FILE;
390 #}
391
392
393 sub test_nucleus_conservation{
394
395 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0
396
397 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
398 $nucleus=~tr/[T]/[U]/;
399 if($hash_nuclei{$nucleus}){return 1;}
400
401 return 0;
402 }
403
404
405
406 sub pass_filtering_initial{
407
408 #test if the structure forms a plausible hairpin
409 unless(pass_filtering_structure()){filter_p("structure problem"); return 0;}
410
411 #test if >90% of reads map to the hairpin in consistence with Dicer processing
412 unless(pass_filtering_signature()){filter_p("signature problem");return 0;}
413
414 return 1;
415
416 }
417
418
419 sub pass_filtering_signature{
420
421 #number of reads that map in consistence with Dicer processing
422 my $consistent=0;
423
424 #number of reads that map inconsistent with Dicer processing
425 my $inconsistent=0;
426
427 # number of potential star reads map in good consistence with Drosha/Dicer processing
428 # (3' overhangs relative to mature product)
429 my $star_perfect=0;
430
431 # number of potential star reads that do not map in good consistence with 3' overhang
432 my $star_fuzzy=0;
433
434
435 #sort queries (deep sequences) by their position on the hairpin
436 my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query;
437
438 foreach my $query(@queries){
439
440 #number of reads that the deep sequence represents
441 unless(defined($hash_query{$query}{"freq"})){next;}
442 my $query_freq=$hash_query{$query}{"freq"};
443
444 #test which Dicer product (if any) the deep sequence corresponds to
445 my $product=test_query($query);
446
447 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
448 if($product){$consistent+=$query_freq;}
449
450 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
451 else{$inconsistent+=$query_freq;}
452
453 #test a potential star sequence has good 3' overhang
454 if($product eq "star"){
455 if(test_star($query)){$star_perfect+=$query_freq;}
456 else{$star_fuzzy+=$query_freq;}
457 }
458 }
459
460 # if the majority of potential star sequences map in good accordance with 3' overhang
461 # score for the presence of star evidence
462 if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;}
463
464 #total number of reads mapping to the hairpin
465 my $freq=$consistent+$inconsistent;
466 $hash_comp{"freq"}=$freq;
467 unless($freq>0){filter_s("read frequency too low"); return 0;}
468
469 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
470 my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent);
471 unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;}
472
473 #the hairpin is retained
474 return 1;
475 }
476
477 sub test_star{
478
479 #test if a deep sequence maps in good consistence with 3' overhang
480
481 my $query=shift;
482
483 #5' begin and 3' end positions
484 my $beg=$hash_query{$query}{"subject_beg"};
485 my $end=$hash_query{$query}{"subject_end"};
486
487 #the difference between observed and expected begin positions must be 0 or 1
488 my $offset=$beg-$hash_comp{"star_beg"};
489 if($offset==0 or $offset==1 or $offset==-1){return 1;}
490
491 return 0;
492 }
493
494
495
496 sub test_query{
497
498 #test if deep sequence maps in consistence with Dicer processing
499
500 my $query=shift;
501
502 #begin, end, strand and read count
503 my $beg=$hash_query{$query}{"subject_beg"};
504 my $end=$hash_query{$query}{"subject_end"};
505 my $strand=$hash_query{$query}{"strand"};
506 my $freq=$hash_query{$query}{"freq"};
507
508 #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs)
509 if($strand eq '-'){return 0;}
510
511 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
512 my $fuzz_beg=2;
513 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
514 my $fuzz_end=2;
515
516 #if in accordance with Dicer processing, return the type of Dicer product
517 if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";}
518 if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";}
519 if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";}
520
521 #if not in accordance, return 0
522 return 0;
523 }
524
525
526 sub pass_filtering_structure{
527
528 #The potential precursor must form a hairpin with miRNA precursor-like characteristics
529
530 #return value
531 my $ret=1;
532
533 #potential mature, star, loop and lower flank parts must be identifiable
534 unless(test_components()){return 0;}
535
536 #no bifurcations
537 unless(no_bifurcations_precursor()){$ret=0;}
538
539 #minimum 14 base pairings in duplex
540 unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");}
541
542 #not more than 6 nt difference between mature and star length
543 unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") }
544
545 return $ret;
546 }
547
548
549
550
551
552
553 sub test_components{
554
555 #tests whether potential mature, star, loop and lower flank parts are identifiable
556
557 unless($hash_comp{"mature_struct"}){
558 filter_s("no mature");
559 # print STDERR "no mature\n";
560 return 0;
561 }
562
563 unless($hash_comp{"star_struct"}){
564 filter_s("no star");
565 # print STDERR "no star\n";
566 return 0;
567 }
568
569 unless($hash_comp{"loop_struct"}){
570 filter_s("no loop");
571 # print STDERR "no loop\n";
572 return 0;
573 }
574
575 unless($hash_comp{"flank_first_struct"}){
576 filter_s("no flanks");
577 #print STDERR "no flanks_first_struct\n";
578 return 0;
579 }
580
581 unless($hash_comp{"flank_second_struct"}){
582 filter_s("no flanks");
583 # print STDERR "no flanks_second_struct\n";
584 return 0;
585 }
586 return 1;
587 }
588
589
590
591
592
593 sub no_bifurcations_precursor{
594
595 #tests whether there are bifurcations in the hairpin
596
597 #assembles the potential precursor sequence and structure from the expected Dicer products
598 #this is the expected biological precursor, in contrast with 'pri_seq' that includes
599 #some genomic flanks on both sides
600
601 my $pre_struct;
602 my $pre_seq;
603 if($hash_comp{"mature_arm"} eq "first"){
604 $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"};
605 $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"};
606 }else{
607 $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"};
608 $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"};
609 }
610
611 #read into hash
612 $hash_comp{"pre_struct"}=$pre_struct;
613 $hash_comp{"pre_seq"}=$pre_seq;
614
615 #simple pattern matching checks for bifurcations
616 unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){
617 filter_s("bifurcation in precursor");
618 # print STDERR "bifurcation in precursor\n";
619 return 0;
620 }
621
622 return 1;
623 }
624
625 sub bp_precursor{
626
627 #total number of bps in the precursor
628
629 my $pre_struct=$hash_comp{"pre_struct"};
630
631 #simple pattern matching
632 my $pre_bps=0;
633 while($pre_struct=~/\(/g){
634 $pre_bps++;
635 }
636 return $pre_bps;
637 }
638
639
640 sub bp_duplex{
641
642 #total number of bps in the duplex
643
644 my $duplex_bps=0;
645 my $mature_struct=$hash_comp{"mature_struct"};
646
647 #simple pattern matching
648 while($mature_struct=~/(\(|\))/g){
649 $duplex_bps++;
650 }
651 return $duplex_bps;
652 }
653
654 sub diff_lng{
655
656 #find difference between mature and star lengths
657
658 my $mature_lng=length $hash_comp{"mature_struct"};
659 my $star_lng=length $hash_comp{"star_struct"};
660 my $diff_lng=$mature_lng-$star_lng;
661 return $diff_lng;
662 }
663
664
665
666 sub do_test_assemble{
667
668 # not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks)
669 # is identical to 'pri_struct' before disassembly into parts
670
671 my $assemble_struct;
672
673 if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){
674 if($hash_comp{"mature_arm"} eq "first"){
675 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"};
676 }else{
677 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"};
678 }
679 unless($assemble_struct eq $hash_comp{"pri_struct"}){
680 $hash_comp{"test_assemble"}=$assemble_struct;
681 print_hash_comp();
682 }
683 }
684 return;
685 }
686
687
688
689 sub fill_structure{
690
691 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired
692
693 my $struct=$hash_struct{$subject_old};
694 my $lng=length $struct;
695
696 #local stack for keeping track of basepairings
697 my @bps;
698
699 for(my $pos=1;$pos<=$lng;$pos++){
700 my $struct_pos=excise_struct($struct,$pos,$pos,"+");
701
702 if($struct_pos eq "("){
703 push(@bps,$pos);
704 }
705
706 if($struct_pos eq ")"){
707 my $pos_prev=pop(@bps);
708 $hash_bp{$pos_prev}=$pos;
709 $hash_bp{$pos}=$pos_prev;
710 }
711 }
712 return;
713 }
714
715
716
717 sub fill_star{
718
719 #fills specifics on the expected star strand into 'comp' hash ('component' hash)
720
721 #if the mature sequence is not plausible, don't look for the star arm
722 my $mature_arm=$hash_comp{"mature_arm"};
723 unless($mature_arm){$hash_comp{"star_arm"}=0; return;}
724
725 #if the star sequence is not plausible, don't fill into the hash
726 my($star_beg,$star_end)=find_star();
727 my $star_arm=arm_star($star_beg,$star_end);
728 unless($star_arm){return;}
729
730 #excise expected star sequence and structure
731 my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+");
732 my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+");
733
734 #fill into hash
735 $hash_comp{"star_beg"}=$star_beg;
736 $hash_comp{"star_end"}=$star_end;
737 $hash_comp{"star_seq"}=$star_seq;
738 $hash_comp{"star_struct"}=$star_struct;
739 $hash_comp{"star_arm"}=$star_arm;
740
741 return;
742 }
743
744
745 sub find_star{
746
747 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions
748
749 #the -2 is for the overhang
750 my $mature_beg=$hash_comp{"mature_beg"};
751 my $mature_end=$hash_comp{"mature_end"}-2;
752 my $mature_lng=$mature_end-$mature_beg+1;
753
754 #in some cases, the last nucleotide of the mature sequence does not form a base pair,
755 #and therefore does not basepair with the first nucleotide of the star sequence.
756 #In this case, the algorithm searches for the last nucleotide of the mature sequence
757 #to form a base pair. The offset is the number of nucleotides searched through.
758 my $offset_star_beg=0;
759 my $offset_beg=0;
760
761 #the offset should not be longer than the length of the mature sequence, then it
762 #means that the mature sequence does not form any base pairs
763 while(!$offset_star_beg and $offset_beg<$mature_lng){
764 if($hash_bp{$mature_end-$offset_beg}){
765 $offset_star_beg=$hash_bp{$mature_end-$offset_beg};
766 }else{
767 $offset_beg++;
768 }
769 }
770 #when defining the beginning of the star sequence, compensate for the offset
771 my $star_beg=$offset_star_beg-$offset_beg;
772
773 #same as above
774 my $offset_star_end=0;
775 my $offset_end=0;
776 while(!$offset_star_end and $offset_end<$mature_lng){
777 if($hash_bp{$mature_beg+$offset_end}){
778 $offset_star_end=$hash_bp{$mature_beg+$offset_end};
779 }else{
780 $offset_end++;
781 }
782 }
783 #the +2 is for the overhang
784 my $star_end=$offset_star_end+$offset_end+2;
785
786 return($star_beg,$star_end);
787 }
788
789
790 sub fill_pri{
791
792 #fills basic specifics on the precursor into the 'comp' hash
793
794 my $seq=$hash_seq{$subject_old};
795 my $struct=$hash_struct{$subject_old};
796 my $mfe=$hash_mfe{$subject_old};
797 my $length=length $seq;
798
799 $hash_comp{"pri_id"}=$subject_old;
800 $hash_comp{"pri_seq"}=$seq;
801 $hash_comp{"pri_struct"}=$struct;
802 $hash_comp{"pri_mfe"}=$mfe;
803 $hash_comp{"pri_beg"}=1;
804 $hash_comp{"pri_end"}=$length;
805
806 return;
807 }
808
809
810 sub fill_mature{
811
812 #fills specifics on the mature sequence into the 'comp' hash
813
814 my $mature_query=find_mature_query();
815 my($mature_beg,$mature_end)=find_positions_query($mature_query);
816 my $mature_strand=find_strand_query($mature_query);
817 my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand);
818 my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand);
819 my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand);
820
821 $hash_comp{"mature_query"}=$mature_query;
822 $hash_comp{"mature_beg"}=$mature_beg;
823 $hash_comp{"mature_end"}=$mature_end;
824 $hash_comp{"mature_strand"}=$mature_strand;
825 $hash_comp{"mature_struct"}=$mature_struct;
826 $hash_comp{"mature_seq"}=$mature_seq;
827 $hash_comp{"mature_arm"}=$mature_arm;
828
829 return;
830 }
831
832
833
834 sub fill_loop{
835
836 #fills specifics on the loop sequence into the 'comp' hash
837
838 #unless both mature and star sequences are plausible, do not look for the loop
839 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
840
841 my $loop_beg;
842 my $loop_end;
843
844 #defining the begin and end positions of the loop from the mature and star positions
845 #excision depends on whether the mature or star sequence is 5' of the loop ('first')
846 if($hash_comp{"mature_arm"} eq "first"){
847 $loop_beg=$hash_comp{"mature_end"}+1;
848 }else{
849 $loop_end=$hash_comp{"mature_beg"}-1;
850 }
851
852 if($hash_comp{"star_arm"} eq "first"){
853 $loop_beg=$hash_comp{"star_end"}+1;
854 }else{
855 $loop_end=$hash_comp{"star_beg"}-1;
856 }
857
858 #unless the positions are plausible, do not fill into hash
859 unless(test_loop($loop_beg,$loop_end)){return;}
860
861 my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+");
862 my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+");
863
864 $hash_comp{"loop_beg"}=$loop_beg;
865 $hash_comp{"loop_end"}=$loop_end;
866 $hash_comp{"loop_seq"}=$loop_seq;
867 $hash_comp{"loop_struct"}=$loop_struct;
868
869 return;
870 }
871
872
873 sub fill_lower_flanks{
874
875 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash
876
877 #unless both mature and star sequences are plausible, do not look for the flanks
878 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
879
880 my $flank_first_end;
881 my $flank_second_beg;
882
883 #defining the begin and end positions of the flanks from the mature and star positions
884 #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first')
885 if($hash_comp{"mature_arm"} eq "first"){
886 $flank_first_end=$hash_comp{"mature_beg"}-1;
887 }else{
888 $flank_second_beg=$hash_comp{"mature_end"}+1;
889 }
890
891 if($hash_comp{"star_arm"} eq "first"){
892 $flank_first_end=$hash_comp{"star_beg"}-1;
893 }else{
894 $flank_second_beg=$hash_comp{"star_end"}+1;
895 }
896
897 #unless the positions are plausible, do not fill into hash
898 unless(test_flanks($flank_first_end,$flank_second_beg)){return;}
899
900 $hash_comp{"flank_first_end"}=$flank_first_end;
901 $hash_comp{"flank_second_beg"}=$flank_second_beg;
902 $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
903 $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
904 $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
905 $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
906
907 if($options{z}){
908 fill_stems_drosha();
909 }
910
911 return;
912 }
913
914
915 sub fill_stems_drosha{
916
917 #scores the number of base pairings formed by the first ten nt of the lower stems
918 #in general, the more stems, the higher the score contribution
919 #warning: this options has not been thoroughly tested
920
921 my $flank_first_struct=$hash_comp{"flank_first_struct"};
922 my $flank_second_struct=$hash_comp{"flank_second_struct"};
923
924 my $stem_first=substr($flank_first_struct,-10);
925 my $stem_second=substr($flank_second_struct,0,10);
926
927 my $stem_bp_first=0;
928 my $stem_bp_second=0;
929
930 #find base pairings by simple pattern matching
931 while($stem_first=~/\(/g){
932 $stem_bp_first++;
933 }
934
935 while($stem_second=~/\)/g){
936 $stem_bp_second++;
937 }
938
939 my $stem_bp=min2($stem_bp_first,$stem_bp_second);
940
941 $hash_comp{"stem_first"}=$stem_first;
942 $hash_comp{"stem_second"}=$stem_second;
943 $hash_comp{"stem_bp_first"}=$stem_bp_first;
944 $hash_comp{"stem_bp_second"}=$stem_bp_second;
945 $hash_comp{"stem_bp"}=$stem_bp;
946
947 return;
948 }
949
950
951
952
953 sub arm_mature{
954
955 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
956
957 my ($beg,$end,$strand)=@_;
958
959 #mature and star sequences should alway be on plus strand
960 if($strand eq "-"){return 0;}
961
962 #there should be no bifurcations and minimum one base pairing
963 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
964 if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){
965 return "first";
966 }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){
967 return "second";
968 }
969 return 0;
970 }
971
972
973 sub arm_star{
974
975 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
976
977 my ($beg,$end)=@_;
978
979 #unless the begin and end positions are plausible, test negative
980 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
981
982 #no overlap between the mature and the star sequence
983 if($hash_comp{"mature_arm"} eq "first"){
984 ($hash_comp{"mature_end"}<$beg) or return 0;
985 }elsif($hash_comp{"mature_arm"} eq "second"){
986 ($end<$hash_comp{"mature_beg"}) or return 0;
987 }
988
989 #there should be no bifurcations and minimum one base pairing
990 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+");
991 if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){
992 return "first";
993 }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){
994 return "second";
995 }
996 return 0;
997 }
998
999
1000 sub test_loop{
1001
1002 #tests the loop positions
1003
1004 my ($beg,$end)=@_;
1005
1006 #unless the begin and end positions are plausible, test negative
1007 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
1008
1009 return 1;
1010 }
1011
1012
1013 sub test_flanks{
1014
1015 #tests the positions of the lower flanks
1016
1017 my ($beg,$end)=@_;
1018
1019 #unless the begin and end positions are plausible, test negative
1020 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
1021
1022 return 1;
1023 }
1024
1025
1026 sub comp{
1027
1028 #subroutine to retrive from the 'comp' hash
1029
1030 my $type=shift;
1031 my $component=$hash_comp{$type};
1032 return $component;
1033 }
1034
1035
1036 sub find_strand_query{
1037
1038 #subroutine to find the strand for a given query
1039
1040 my $query=shift;
1041 my $strand=$hash_query{$query}{"strand"};
1042 return $strand;
1043 }
1044
1045
1046 sub find_positions_query{
1047
1048 #subroutine to find the begin and end positions for a given query
1049
1050 my $query=shift;
1051 my $beg=$hash_query{$query}{"subject_beg"};
1052 my $end=$hash_query{$query}{"subject_end"};
1053 return ($beg,$end);
1054 }
1055
1056
1057
1058 sub find_mature_query{
1059
1060 #finds the query with the highest frequency of reads and returns it
1061 #is used to determine the positions of the potential mature sequence
1062
1063 my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query;
1064 my $mature_query=$queries[0];
1065 return $mature_query;
1066 }
1067
1068
1069
1070
1071 sub reset_variables{
1072
1073 #resets the hashes for the next potential precursor
1074
1075 # %hash_query=();
1076 # %hash_comp=();
1077 # %hash_bp=();
1078 foreach my $key (keys %hash_query) {delete($hash_query{$key});}
1079 foreach my $key (keys %hash_comp) {delete($hash_comp{$key});}
1080 foreach my $key (keys %hash_bp) {delete($hash_bp{$key});}
1081
1082 # $message_filter=();
1083 # $message_score=();
1084 # $lines=();
1085 undef($message_filter);
1086 undef($message_score);
1087 undef($lines);
1088 return;
1089 }
1090
1091
1092
1093 sub excise_seq{
1094
1095 #excise sub sequence from the potential precursor
1096
1097 my($seq,$beg,$end,$strand)=@_;
1098
1099 #begin can be equal to end if only one nucleotide is excised
1100 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
1101
1102 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
1103 #this happens once appr. every two thousand combinations
1104 unless($beg<=length($seq)){$out_of_bound++;return 0;}
1105
1106 #if on the minus strand, the reverse complement should be excised
1107 if($strand eq "-"){$seq=revcom($seq);}
1108
1109 #the blast parsed format is 1-indexed, substr is 0-indexed
1110 my $sub_seq=substr($seq,$beg-1,$end-$beg+1);
1111
1112 return $sub_seq;
1113
1114 }
1115
1116 sub excise_struct{
1117
1118 #excise sub structure
1119
1120 my($struct,$beg,$end,$strand)=@_;
1121 my $lng=length $struct;
1122
1123 #begin can be equal to end if only one nucleotide is excised
1124 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
1125
1126 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
1127 #this happens once appr. every two thousand combinations
1128 unless($beg<=length($struct)){return 0;}
1129
1130 #if excising relative to minus strand, positions are reversed
1131 if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}
1132
1133 #the blast parsed format is 1-indexed, substr is 0-indexed
1134 my $sub_struct=substr($struct,$beg-1,$end-$beg+1);
1135
1136 return $sub_struct;
1137 }
1138
1139
1140 sub create_hash_nuclei{
1141 #parses a fasta file with sequences of known miRNAs considered for conservation purposes
1142 #reads the nuclei into a hash
1143
1144 my ($file) = @_;
1145 my ($id, $desc, $sequence, $nucleus) = ();
1146
1147 open (FASTA, "<$file") or die "can not open $file\n";
1148 while (<FASTA>)
1149 {
1150 chomp;
1151 if (/^>(\S+)(.*)/)
1152 {
1153 $id = $1;
1154 $desc = $2;
1155 $sequence = "";
1156 $nucleus = "";
1157 while (<FASTA>){
1158 chomp;
1159 if (/^>(\S+)(.*)/){
1160 $nucleus = substr($sequence,1,$nucleus_lng);
1161 $nucleus =~ tr/[T]/[U]/;
1162 $hash_mirs{$nucleus} .="$id\t";
1163 $hash_nuclei{$nucleus} += 1;
1164
1165 $id = $1;
1166 $desc = $2;
1167 $sequence = "";
1168 $nucleus = "";
1169 next;
1170 }
1171 $sequence .= $_;
1172 }
1173 }
1174 }
1175 $nucleus = substr($sequence,1,$nucleus_lng);
1176 $nucleus =~ tr/[T]/[U]/;
1177 $hash_mirs{$nucleus} .="$id\t";
1178 $hash_nuclei{$nucleus} += 1;
1179 close FASTA;
1180 }
1181
1182
1183 sub parse_file_struct{
1184 #parses the output from RNAfoldand reads it into hashes
1185 my($file) = @_;
1186 my($id,$desc,$seq,$struct,$mfe) = ();
1187 open (FILE_STRUCT, "<$file") or die "can not open $file\n";
1188 while (<FILE_STRUCT>){
1189 chomp;
1190 if (/^>(\S+)\s*(.*)/){
1191 $id= $1;
1192 $desc= $2;
1193 $seq= "";
1194 $struct= "";
1195 $mfe= "";
1196 while (<FILE_STRUCT>){
1197 chomp;
1198 if (/^>(\S+)\s*(.*)/){
1199 $hash_desc{$id} = $desc;
1200 $hash_seq{$id} = $seq;
1201 $hash_struct{$id} = $struct;
1202 $hash_mfe{$id} = $mfe;
1203 $id = $1;
1204 $desc = $2;
1205 $seq = "";
1206 $struct = "";
1207 $mfe = "";
1208 next;
1209 }
1210 if(/^\w/){
1211 tr/uU/tT/;
1212 $seq .= $_;
1213 next;
1214 }
1215 if(/((\.|\(|\))+)/){$struct .=$1;}
1216 if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;}
1217 }
1218 }
1219 }
1220 $hash_desc{$id} = $desc;
1221 $hash_seq{$id} = $seq;
1222 $hash_struct{$id} = $struct;
1223 $hash_mfe{$id} = $mfe;
1224 close FILE_STRUCT;
1225 return;
1226 }
1227
1228
1229 sub score_s{
1230
1231 #this score message is appended to the end of the string of score messages outputted for the potential precursor
1232
1233 my $message=shift;
1234 $message_score.=$message."\n";;
1235 return;
1236 }
1237
1238
1239
1240 sub score_p{
1241
1242 #this score message is appended to the beginning of the string of score messages outputted for the potential precursor
1243
1244 my $message=shift;
1245 $message_score=$message."\n".$message_score;
1246 return;
1247 }
1248
1249
1250
1251 sub filter_s{
1252
1253 #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor
1254
1255 my $message=shift;
1256 $message_filter.=$message."\n";
1257 return;
1258 }
1259
1260
1261 sub filter_p{
1262
1263 #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor
1264
1265 my $message=shift;
1266 if(defined $message_filter){$message_filter=$message."\n".$message_filter;}
1267 else{$message_filter=$message."\n";}
1268 return;
1269 }
1270
1271
1272 sub find_freq{
1273
1274 #finds the frequency of a given read query from its id.
1275
1276 my($query)=@_;
1277
1278 if($query=~/x(\d+)/i){
1279 my $freq=$1;
1280 return $freq;
1281 }else{
1282 print STDERR "Problem with read format\n";
1283 return 0;
1284 }
1285 }
1286
1287
1288 sub print_hash_comp{
1289
1290 #prints the 'comp' hash
1291
1292 my @keys=sort keys %hash_comp;
1293 foreach my $key(@keys){
1294 my $value=$hash_comp{$key};
1295 print "$key \t$value\n";
1296 }
1297 }
1298
1299
1300
1301 sub print_hash_bp{
1302
1303 #prints the 'bp' hash
1304
1305 my @keys=sort {$a<=>$b} keys %hash_bp;
1306 foreach my $key(@keys){
1307 my $value=$hash_bp{$key};
1308 print "$key\t$value\n";
1309 }
1310 print "\n";
1311 }
1312
1313
1314
1315 sub find_strand{
1316
1317 #A subroutine to find the strand, parsing different blast formats
1318
1319 my($other)=@_;
1320
1321 my $strand="+";
1322
1323 if($other=~/-/){
1324 $strand="-";
1325 }
1326
1327 if($other=~/minus/i){
1328 $strand="-";
1329 }
1330 return($strand);
1331 }
1332
1333
1334 sub contained{
1335
1336 #Is the stretch defined by the first positions contained in the stretch defined by the second?
1337
1338 my($beg1,$end1,$beg2,$end2)=@_;
1339
1340 testbeginend($beg1,$end1,$beg2,$end2);
1341
1342 if($beg2<=$beg1 and $end1<=$end2){
1343 return 1;
1344 }else{
1345 return 0;
1346 }
1347 }
1348
1349
1350 sub testbeginend{
1351
1352 #Are the beginposition numerically smaller than the endposition for each pair?
1353
1354 my($begin1,$end1,$begin2,$end2)=@_;
1355
1356 unless($begin1<=$end1 and $begin2<=$end2){
1357 print STDERR "beg can not be larger than end for $subject_old\n";
1358 exit;
1359 }
1360 }
1361
1362
1363 sub rev_pos{
1364
1365 # The blast_parsed format always uses positions that are relative to the 5' of the given strand
1366 # This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with
1367 # the n't nucleotide on the plus strand
1368
1369 # This subroutine reverses the begin and end positions of positions of the minus strand so that they
1370 # are relative to the 5' end of the plus strand
1371
1372 my($beg,$end,$lng)=@_;
1373
1374 my $new_end=$lng-$beg+1;
1375 my $new_beg=$lng-$end+1;
1376
1377 return($new_beg,$new_end);
1378 }
1379
1380 sub round {
1381
1382 #rounds to nearest integer
1383
1384 my($number) = shift;
1385 return int($number + .5);
1386
1387 }
1388
1389
1390 sub rev{
1391
1392 #reverses the order of nucleotides in a sequence
1393
1394 my($sequence)=@_;
1395
1396 my $rev=reverse $sequence;
1397
1398 return $rev;
1399 }
1400
1401 sub com{
1402
1403 #the complementary of a sequence
1404
1405 my($sequence)=@_;
1406
1407 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
1408
1409 return $sequence;
1410 }
1411
1412 sub revcom{
1413
1414 #reverse complement
1415
1416 my($sequence)=@_;
1417
1418 my $revcom=rev(com($sequence));
1419
1420 return $revcom;
1421 }
1422
1423
1424 sub max2 {
1425
1426 #max of two numbers
1427
1428 my($a, $b) = @_;
1429 return ($a>$b ? $a : $b);
1430 }
1431
1432 sub min2 {
1433
1434 #min of two numbers
1435
1436 my($a, $b) = @_;
1437 return ($a<$b ? $a : $b);
1438 }
1439
1440
1441
1442 sub score_freq{
1443
1444 # scores the count of reads that map to the potential precursor
1445 # Assumes geometric distribution as described in methods section of manuscript
1446
1447 my $freq=shift;
1448
1449 #parameters of known precursors and background hairpins
1450 my $parameter_test=0.999;
1451 my $parameter_control=0.6;
1452
1453 #log_odds calculated directly to avoid underflow
1454 my $intercept=log((1-$parameter_test)/(1-$parameter_control));
1455 my $slope=log($parameter_test/$parameter_control);
1456 my $log_odds=$slope*$freq+$intercept;
1457
1458 #if no strong evidence for 3' overhangs, limit the score contribution to 0
1459 unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}
1460
1461 return $log_odds;
1462 }
1463
1464
1465
1466 ##sub score_mfe{
1467
1468 # scores the minimum free energy in kCal/mol of the potential precursor
1469 # Assumes Gumbel distribution as described in methods section of manuscript
1470
1471 ## my $mfe=shift;
1472
1473 #numerical value, minimum 1
1474 ## my $mfe_adj=max2(1,-$mfe);
1475
1476 #parameters of known precursors and background hairpins, scale and location
1477 ## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
1478 ## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
1479
1480 ## my $odds=$prob_test/$prob_background;
1481 ## my $log_odds=log($odds);
1482
1483 ## return $log_odds;
1484 ##}
1485
1486 sub score_mfe{
1487 # use bignum;
1488
1489 # scores the minimum free energy in kCal/mol of the potential precursor
1490 # Assumes Gumbel distribution as described in methods section of manuscript
1491
1492 my ($mfe,$mlng)=@_;
1493
1494 #numerical value, minimum 1
1495 my $mfe_adj=max2(1,-$mfe);
1496 my $mfe_adj1=$mfe/$mlng;
1497 #parameters of known precursors and background hairpins, scale and location
1498 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
1499 my $ev=$e**($mfe_adj1*$c);
1500 print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
1501 my $log_odds=($a/($b+$ev));
1502
1503
1504 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
1505 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
1506
1507 my $odds=$prob_test/$prob_background;
1508 my $log_odds_2=log($odds);
1509 print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
1510 return $log_odds;
1511 }
1512
1513
1514
1515 sub prob_gumbel_discretized{
1516
1517 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
1518 # uses the subroutine that calculates the cdf to find the probabilities
1519
1520 my ($var,$scale,$location)=@_;
1521
1522 my $bound_lower=$var-0.5;
1523 my $bound_upper=$var+0.5;
1524
1525 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
1526 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);
1527
1528 my $prob=$cdf_upper-$cdf_lower;
1529
1530 return $prob;
1531 }
1532
1533
1534 sub cdf_gumbel{
1535
1536 # calculates the cumulative distribution function of the Gumbel distribution
1537
1538 my ($var,$scale,$location)=@_;
1539
1540 my $cdf=$e**(-($e**(-($var-$location)/$scale)));
1541
1542 return $cdf;
1543 }
1544