Mercurial > repos > big-tiandm > mirplant2
comparison miRDeep_plant.pl @ 50:7b5a48b972e9 draft
Uploaded
author | big-tiandm |
---|---|
date | Fri, 05 Dec 2014 00:11:02 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
49:f008ab2cadc6 | 50:7b5a48b972e9 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 use warnings; | |
4 use strict; | |
5 use Getopt::Std; | |
6 #use RNA; | |
7 | |
8 | |
9 ################################# MIRDEEP ################################################# | |
10 | |
11 ################################## USAGE ################################################## | |
12 | |
13 | |
14 my $usage= | |
15 "$0 file_signature file_structure temp_out_directory | |
16 | |
17 This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with | |
18 information on the positions of reads aligned to potential precursor sequences (signature). | |
19 It also takes as input an RNAfold output file, giving information on the sequence, structure | |
20 and mimimum free energy of the potential precursor sequences. | |
21 | |
22 Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA | |
23 sequences that should be considered for conservation purposes. -t prints out the potential | |
24 precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that | |
25 exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors | |
26 that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences | |
27 obtained through conventional cloning. -z consider the number of base pairings in the lower | |
28 stems (this option is not well tested). | |
29 | |
30 -h print this usage | |
31 -s fasta file with known miRNAs | |
32 #-o temp directory ,maked befor running the program. | |
33 -t print filtered | |
34 -u limited output (only ids) | |
35 -v cut-off (default 1) | |
36 -x sensitive option for Sanger sequences | |
37 -y use Randfold | |
38 -z consider Drosha processing | |
39 "; | |
40 | |
41 | |
42 | |
43 | |
44 | |
45 ############################################################################################ | |
46 | |
47 ################################### INPUT ################################################## | |
48 | |
49 | |
50 #signature file in blast_parsed format | |
51 my $file_blast_parsed=shift or die $usage; | |
52 | |
53 #structure file outputted from RNAfold | |
54 my $file_struct=shift or die $usage; | |
55 | |
56 my $tmpdir=shift or die $usage; | |
57 #options | |
58 my %options=(); | |
59 getopts("hs:tuv:xyz",\%options); | |
60 | |
61 | |
62 | |
63 | |
64 | |
65 | |
66 ############################################################################################# | |
67 | |
68 ############################# GLOBAL VARIABLES ############################################## | |
69 | |
70 | |
71 #parameters | |
72 my $nucleus_lng=11; | |
73 | |
74 my $score_star=3.9; | |
75 my $score_star_not=-1.3; | |
76 my $score_nucleus=7.63; | |
77 my $score_nucleus_not=-1.17; | |
78 my $score_randfold=1.37; | |
79 my $score_randfold_not=-3.624; | |
80 my $score_intercept=0.3; | |
81 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0); | |
82 my $score_min=1; | |
83 if($options{v}){$score_min=$options{v};} | |
84 if($options{x}){$score_min=-5;} | |
85 | |
86 my $e=2.718281828; | |
87 | |
88 #hashes | |
89 my %hash_desc; | |
90 my %hash_seq; | |
91 my %hash_struct; | |
92 my %hash_mfe; | |
93 my %hash_nuclei; | |
94 my %hash_mirs; | |
95 my %hash_query; | |
96 my %hash_comp; | |
97 my %hash_bp; | |
98 | |
99 #other variables | |
100 my $subject_old; | |
101 my $message_filter; | |
102 my $message_score; | |
103 my $lines; | |
104 my $out_of_bound; | |
105 | |
106 | |
107 | |
108 ############################################################################################## | |
109 | |
110 ################################ MAIN ###################################################### | |
111 | |
112 | |
113 #print help if that option is used | |
114 if($options{h}){die $usage;} | |
115 unless ($tmpdir=~/\/$/) {$tmpdir .="/";} | |
116 if(!(-s $tmpdir)){mkdir $tmpdir;} | |
117 $tmpdir .="TMP_DIR/"; | |
118 mkdir $tmpdir; | |
119 | |
120 #parse structure file outputted from RNAfold | |
121 parse_file_struct($file_struct); | |
122 | |
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed | |
124 if($options{s}){create_hash_nuclei($options{s})}; | |
125 | |
126 #parse signature file in blast_parsed format and resolve each potential precursor | |
127 parse_file_blast_parsed($file_blast_parsed); | |
128 #`rm -rf $tmpdir`; | |
129 exit; | |
130 | |
131 | |
132 | |
133 | |
134 ############################################################################################## | |
135 | |
136 ############################## SUBROUTINES ################################################### | |
137 | |
138 | |
139 | |
140 sub parse_file_blast_parsed{ | |
141 | |
142 # read through the signature blastparsed file, fills up a hash with information on queries | |
143 # (deep sequences) mapping to the current subject (potential precursor) and resolve each | |
144 # potential precursor in turn | |
145 | |
146 my $file_blast_parsed=shift; | |
147 | |
148 open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n"; | |
149 while (my $line=<FILE_BLAST_PARSED>){ | |
150 if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){ | |
151 my $query=$1; | |
152 my $query_lng=$2; | |
153 my $query_beg=$3; | |
154 my $query_end=$4; | |
155 my $subject=$5; | |
156 my $subject_lng=$6; | |
157 my $subject_beg=$7; | |
158 my $subject_end=$8; | |
159 my $e_value=$9; | |
160 my $pid=$10; | |
161 my $bitscore=$11; | |
162 my $other=$12; | |
163 | |
164 #if the new line concerns a new subject (potential precursor) then the old subject must be resolved | |
165 if($subject_old and $subject_old ne $subject){ | |
166 resolve_potential_precursor(); | |
167 } | |
168 | |
169 #resolve the strand | |
170 my $strand=find_strand($other); | |
171 | |
172 #resolve the number of reads that the deep sequence represents | |
173 my $freq=find_freq($query); | |
174 | |
175 #read information of the query (deep sequence) into hash | |
176 $hash_query{$query}{"subject_beg"}=$subject_beg; | |
177 $hash_query{$query}{"subject_end"}=$subject_end; | |
178 $hash_query{$query}{"strand"}=$strand; | |
179 $hash_query{$query}{"freq"}=$freq; | |
180 | |
181 #save the signature information | |
182 $lines.=$line; | |
183 | |
184 $subject_old=$subject; | |
185 } | |
186 } | |
187 resolve_potential_precursor(); | |
188 } | |
189 | |
190 sub resolve_potential_precursor{ | |
191 | |
192 # dissects the potential precursor in parts by filling hashes, and tests if it passes the | |
193 # initial filter and the scoring filter | |
194 | |
195 # binary variable whether the potential precursor is still viable | |
196 my $ret=1; | |
197 #print STDERR ">$subject_old\n"; | |
198 | |
199 fill_structure(); | |
200 #print STDERR "\%hash_bp",scalar keys %hash_bp,"\n"; | |
201 fill_pri(); | |
202 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; | |
203 | |
204 fill_mature(); | |
205 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; | |
206 | |
207 fill_star(); | |
208 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; | |
209 | |
210 fill_loop(); | |
211 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; | |
212 | |
213 fill_lower_flanks(); | |
214 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n"; | |
215 | |
216 # do_test_assemble(); | |
217 | |
218 # this is the actual classification | |
219 unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;} | |
220 | |
221 print_results($ret); | |
222 | |
223 reset_variables(); | |
224 | |
225 return; | |
226 | |
227 } | |
228 | |
229 | |
230 | |
231 sub print_results{ | |
232 | |
233 my $ret=shift; | |
234 | |
235 # print out if the precursor is accepted and accepted precursors should be printed out | |
236 # or if the potential precursor is discarded and discarded potential precursors should | |
237 # be printed out | |
238 | |
239 if((!$options{t} and $ret) or ($options{t} and !$ret)){ | |
240 #full output | |
241 unless($options{u}){ | |
242 if($message_filter){print $message_filter;} | |
243 if($message_score){print $message_score;} | |
244 print_hash_comp(); | |
245 print $lines,"\n\n"; | |
246 return; | |
247 } | |
248 #limited output (only ids) | |
249 my $id=$hash_comp{"pri_id"}; | |
250 print "$id\n"; | |
251 } | |
252 } | |
253 | |
254 | |
255 | |
256 | |
257 | |
258 | |
259 | |
260 sub pass_threshold_score{ | |
261 | |
262 # this is the scoring | |
263 | |
264 #minimum free energy of the potential precursor | |
265 # my $score_mfe=score_mfe($hash_comp{"pri_mfe"}); | |
266 my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"}); | |
267 | |
268 #count of reads that map in accordance with Dicer processing | |
269 my $score_freq=score_freq($hash_comp{"freq"}); | |
270 #print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n"; | |
271 | |
272 #basic score | |
273 my $score=$score_mfe+$score_freq; | |
274 | |
275 #scoring of conserved nucleus/seed (optional) | |
276 if($options{s}){ | |
277 | |
278 #if the nucleus is conserved | |
279 if(test_nucleus_conservation()){ | |
280 | |
281 #nucleus from position 2-8 | |
282 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); | |
283 | |
284 #resolve DNA/RNA ambiguities | |
285 $nucleus=~tr/[T]/[U]/; | |
286 | |
287 #print score contribution | |
288 score_s("score_nucleus\t$score_nucleus"); | |
289 | |
290 #print the ids of known miRNAs with same nucleus | |
291 score_s("$hash_mirs{$nucleus}"); | |
292 #print STDERR "score_nucleus\t$score_nucleus\n"; | |
293 | |
294 #add to score | |
295 $score+=$score_nucleus; | |
296 | |
297 #if the nucleus is not conserved | |
298 }else{ | |
299 #print (negative) score contribution | |
300 score_s("score_nucleus\t$score_nucleus_not"); | |
301 | |
302 #add (negative) score contribution | |
303 $score+=$score_nucleus_not; | |
304 } | |
305 } | |
306 | |
307 #if the majority of potential star reads fall as expected from Dicer processing | |
308 if($hash_comp{"star_read"}){ | |
309 score_s("score_star\t$score_star"); | |
310 #print STDERR "score_star\t$score_star\n"; | |
311 $score+=$score_star; | |
312 }else{ | |
313 score_s("score_star\t$score_star_not"); | |
314 #print STDERR "score_star_not\t$score_star_not\n"; | |
315 $score+=$score_star_not; | |
316 } | |
317 | |
318 #score lower stems for potential for Drosha recognition (highly optional) | |
319 if($options{z}){ | |
320 my $stem_bp=$hash_comp{"stem_bp"}; | |
321 my $score_stem=$scores_stem[$stem_bp]; | |
322 $score+=$score_stem; | |
323 score_s("score_stem\t$score_stem"); | |
324 } | |
325 | |
326 #print STDERR "score_intercept\t$score_intercept\n"; | |
327 | |
328 $score+=$score_intercept; | |
329 | |
330 #score for randfold (optional) | |
331 if($options{y}){ | |
332 | |
333 # only calculate randfold value if it can make the difference between the potential precursor | |
334 # being accepted or discarded | |
335 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){ | |
336 | |
337 #randfold value<0.05 | |
338 if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");} | |
339 | |
340 #randfold value>0.05 | |
341 else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");} | |
342 } | |
343 } | |
344 | |
345 #round off values to one decimal | |
346 my $round_mfe=round($score_mfe*10)/10; | |
347 my $round_freq=round($score_freq*10)/10; | |
348 my $round=round($score*10)/10; | |
349 | |
350 #print scores | |
351 score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round"); | |
352 | |
353 #return 1 if the potential precursor is accepted, return 0 if discarded | |
354 unless($score>=$score_min){return 0;} | |
355 return 1; | |
356 } | |
357 | |
358 sub test_randfold{ | |
359 | |
360 #print sequence to temporary file, test randfold value, return 1 or 0 | |
361 | |
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"}); | |
363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"}; | |
364 #open(FILE, ">$tmpfile"); | |
365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"}; | |
366 #close FILE; | |
367 | |
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`; | |
369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`; | |
370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`; | |
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999); | |
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999); | |
373 my $p_value=($p1+$p2)/2; | |
374 wait; | |
375 # system "rm $tmpfile"; | |
376 | |
377 if($p_value<=0.05){return 1;} | |
378 | |
379 return 0; | |
380 } | |
381 | |
382 sub randfold_pvalue{ | |
383 my $cpt_sup = 0; | |
384 my $cpt_inf = 0; | |
385 my $cpt_ega = 1; | |
386 | |
387 my ($seq,$number_of_randomizations)=@_; | |
388 #my $str =$seq; | |
389 #my $mfe = RNA::fold($seq,$str); | |
390 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; | |
391 my @rawfolds=split/\s+/,$rnafold; | |
392 my $str=$rawfolds[1]; | |
393 my $mfe=$rawfolds[-1]; | |
394 $mfe=~s/\(//; | |
395 $mfe=~s/\)//; | |
396 | |
397 for (my $i=0;$i<$number_of_randomizations;$i++) { | |
398 $seq = shuffle_sequence_dinucleotide($seq); | |
399 #$str = $seq; | |
400 | |
401 #my $rand_mfe = RNA::fold($str,$str); | |
402 $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`; | |
403 my @rawfolds=split/\s+/,$rnafold; | |
404 my $str=$rawfolds[1]; | |
405 my $rand_mfe=$rawfolds[-1]; | |
406 $rand_mfe=~s/\(//; | |
407 $rand_mfe=~s/\)//; | |
408 | |
409 if ($rand_mfe < $mfe) { | |
410 $cpt_inf++; | |
411 } | |
412 if ($rand_mfe == $mfe) { | |
413 $cpt_ega++; | |
414 } | |
415 if ($rand_mfe > $mfe) { | |
416 $cpt_sup++; | |
417 } | |
418 } | |
419 | |
420 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1); | |
421 | |
422 #print "$name\t$mfe\t$proba\n"; | |
423 return $proba; | |
424 } | |
425 | |
426 sub shuffle_sequence_dinucleotide { | |
427 | |
428 my ($str) = @_; | |
429 | |
430 # upper case and convert to ATGC | |
431 $str = uc($str); | |
432 $str =~ s/U/T/g; | |
433 | |
434 my @nuc = ('A','T','G','C'); | |
435 my $count_swap = 0; | |
436 # set maximum number of permutations | |
437 my $stop = length($str) * 10; | |
438 | |
439 while($count_swap < $stop) { | |
440 | |
441 my @pos; | |
442 | |
443 # look start and end letters | |
444 my $firstnuc = $nuc[int(rand 4)]; | |
445 my $thirdnuc = $nuc[int(rand 4)]; | |
446 | |
447 # get positions for matching nucleotides | |
448 for (my $i=0;$i<(length($str)-2);$i++) { | |
449 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) { | |
450 push (@pos,($i+1)); | |
451 $i++; | |
452 } | |
453 } | |
454 | |
455 # swap at random trinucleotides | |
456 my $max = scalar(@pos); | |
457 for (my $i=0;$i<$max;$i++) { | |
458 my $swap = int(rand($max)); | |
459 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) { | |
460 $count_swap++; | |
461 my $w1 = substr($str,$pos[$i],1); | |
462 my $w2 = substr($str,$pos[$swap],1); | |
463 substr($str,$pos[$i],1,$w2); | |
464 substr($str,$pos[$swap],1,$w1); | |
465 } | |
466 } | |
467 } | |
468 return($str); | |
469 } | |
470 | |
471 sub test_nucleus_conservation{ | |
472 | |
473 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0 | |
474 | |
475 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng); | |
476 $nucleus=~tr/[T]/[U]/; | |
477 if($hash_nuclei{$nucleus}){return 1;} | |
478 | |
479 return 0; | |
480 } | |
481 | |
482 | |
483 | |
484 sub pass_filtering_initial{ | |
485 | |
486 #test if the structure forms a plausible hairpin | |
487 unless(pass_filtering_structure()){filter_p("structure problem"); return 0;} | |
488 | |
489 #test if >90% of reads map to the hairpin in consistence with Dicer processing | |
490 unless(pass_filtering_signature()){filter_p("signature problem");return 0;} | |
491 | |
492 return 1; | |
493 | |
494 } | |
495 | |
496 | |
497 sub pass_filtering_signature{ | |
498 | |
499 #number of reads that map in consistence with Dicer processing | |
500 my $consistent=0; | |
501 | |
502 #number of reads that map inconsistent with Dicer processing | |
503 my $inconsistent=0; | |
504 | |
505 # number of potential star reads map in good consistence with Drosha/Dicer processing | |
506 # (3' overhangs relative to mature product) | |
507 my $star_perfect=0; | |
508 | |
509 # number of potential star reads that do not map in good consistence with 3' overhang | |
510 my $star_fuzzy=0; | |
511 | |
512 | |
513 #sort queries (deep sequences) by their position on the hairpin | |
514 my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query; | |
515 | |
516 foreach my $query(@queries){ | |
517 | |
518 #number of reads that the deep sequence represents | |
519 unless(defined($hash_query{$query}{"freq"})){next;} | |
520 my $query_freq=$hash_query{$query}{"freq"}; | |
521 | |
522 #test which Dicer product (if any) the deep sequence corresponds to | |
523 my $product=test_query($query); | |
524 | |
525 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable | |
526 if($product){$consistent+=$query_freq;} | |
527 | |
528 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable | |
529 else{$inconsistent+=$query_freq;} | |
530 | |
531 #test a potential star sequence has good 3' overhang | |
532 if($product eq "star"){ | |
533 if(test_star($query)){$star_perfect+=$query_freq;} | |
534 else{$star_fuzzy+=$query_freq;} | |
535 } | |
536 } | |
537 | |
538 # if the majority of potential star sequences map in good accordance with 3' overhang | |
539 # score for the presence of star evidence | |
540 if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;} | |
541 | |
542 #total number of reads mapping to the hairpin | |
543 my $freq=$consistent+$inconsistent; | |
544 $hash_comp{"freq"}=$freq; | |
545 unless($freq>0){filter_s("read frequency too low"); return 0;} | |
546 | |
547 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded | |
548 my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent); | |
549 unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;} | |
550 | |
551 #the hairpin is retained | |
552 return 1; | |
553 } | |
554 | |
555 sub test_star{ | |
556 | |
557 #test if a deep sequence maps in good consistence with 3' overhang | |
558 | |
559 my $query=shift; | |
560 | |
561 #5' begin and 3' end positions | |
562 my $beg=$hash_query{$query}{"subject_beg"}; | |
563 my $end=$hash_query{$query}{"subject_end"}; | |
564 | |
565 #the difference between observed and expected begin positions must be 0 or 1 | |
566 my $offset=$beg-$hash_comp{"star_beg"}; | |
567 if($offset==0 or $offset==1 or $offset==-1){return 1;} | |
568 | |
569 return 0; | |
570 } | |
571 | |
572 | |
573 | |
574 sub test_query{ | |
575 | |
576 #test if deep sequence maps in consistence with Dicer processing | |
577 | |
578 my $query=shift; | |
579 | |
580 #begin, end, strand and read count | |
581 my $beg=$hash_query{$query}{"subject_beg"}; | |
582 my $end=$hash_query{$query}{"subject_end"}; | |
583 my $strand=$hash_query{$query}{"strand"}; | |
584 my $freq=$hash_query{$query}{"freq"}; | |
585 | |
586 #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs) | |
587 if($strand eq '-'){return 0;} | |
588 | |
589 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end | |
590 my $fuzz_beg=2; | |
591 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end | |
592 my $fuzz_end=2; | |
593 | |
594 #if in accordance with Dicer processing, return the type of Dicer product | |
595 if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";} | |
596 if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";} | |
597 if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";} | |
598 | |
599 #if not in accordance, return 0 | |
600 return 0; | |
601 } | |
602 | |
603 | |
604 sub pass_filtering_structure{ | |
605 | |
606 #The potential precursor must form a hairpin with miRNA precursor-like characteristics | |
607 | |
608 #return value | |
609 my $ret=1; | |
610 | |
611 #potential mature, star, loop and lower flank parts must be identifiable | |
612 unless(test_components()){return 0;} | |
613 | |
614 #no bifurcations | |
615 unless(no_bifurcations_precursor()){$ret=0;} | |
616 | |
617 #minimum 14 base pairings in duplex | |
618 unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");} | |
619 | |
620 #not more than 6 nt difference between mature and star length | |
621 unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") } | |
622 | |
623 return $ret; | |
624 } | |
625 | |
626 | |
627 | |
628 | |
629 | |
630 | |
631 sub test_components{ | |
632 | |
633 #tests whether potential mature, star, loop and lower flank parts are identifiable | |
634 | |
635 unless($hash_comp{"mature_struct"}){ | |
636 filter_s("no mature"); | |
637 # print STDERR "no mature\n"; | |
638 return 0; | |
639 } | |
640 | |
641 unless($hash_comp{"star_struct"}){ | |
642 filter_s("no star"); | |
643 # print STDERR "no star\n"; | |
644 return 0; | |
645 } | |
646 | |
647 unless($hash_comp{"loop_struct"}){ | |
648 filter_s("no loop"); | |
649 # print STDERR "no loop\n"; | |
650 return 0; | |
651 } | |
652 | |
653 unless($hash_comp{"flank_first_struct"}){ | |
654 filter_s("no flanks"); | |
655 #print STDERR "no flanks_first_struct\n"; | |
656 return 0; | |
657 } | |
658 | |
659 unless($hash_comp{"flank_second_struct"}){ | |
660 filter_s("no flanks"); | |
661 # print STDERR "no flanks_second_struct\n"; | |
662 return 0; | |
663 } | |
664 return 1; | |
665 } | |
666 | |
667 | |
668 | |
669 | |
670 | |
671 sub no_bifurcations_precursor{ | |
672 | |
673 #tests whether there are bifurcations in the hairpin | |
674 | |
675 #assembles the potential precursor sequence and structure from the expected Dicer products | |
676 #this is the expected biological precursor, in contrast with 'pri_seq' that includes | |
677 #some genomic flanks on both sides | |
678 | |
679 my $pre_struct; | |
680 my $pre_seq; | |
681 if($hash_comp{"mature_arm"} eq "first"){ | |
682 $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}; | |
683 $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"}; | |
684 }else{ | |
685 $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}; | |
686 $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"}; | |
687 } | |
688 | |
689 #read into hash | |
690 $hash_comp{"pre_struct"}=$pre_struct; | |
691 $hash_comp{"pre_seq"}=$pre_seq; | |
692 | |
693 #simple pattern matching checks for bifurcations | |
694 unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){ | |
695 filter_s("bifurcation in precursor"); | |
696 # print STDERR "bifurcation in precursor\n"; | |
697 return 0; | |
698 } | |
699 | |
700 return 1; | |
701 } | |
702 | |
703 sub bp_precursor{ | |
704 | |
705 #total number of bps in the precursor | |
706 | |
707 my $pre_struct=$hash_comp{"pre_struct"}; | |
708 | |
709 #simple pattern matching | |
710 my $pre_bps=0; | |
711 while($pre_struct=~/\(/g){ | |
712 $pre_bps++; | |
713 } | |
714 return $pre_bps; | |
715 } | |
716 | |
717 | |
718 sub bp_duplex{ | |
719 | |
720 #total number of bps in the duplex | |
721 | |
722 my $duplex_bps=0; | |
723 my $mature_struct=$hash_comp{"mature_struct"}; | |
724 | |
725 #simple pattern matching | |
726 while($mature_struct=~/(\(|\))/g){ | |
727 $duplex_bps++; | |
728 } | |
729 return $duplex_bps; | |
730 } | |
731 | |
732 sub diff_lng{ | |
733 | |
734 #find difference between mature and star lengths | |
735 | |
736 my $mature_lng=length $hash_comp{"mature_struct"}; | |
737 my $star_lng=length $hash_comp{"star_struct"}; | |
738 my $diff_lng=$mature_lng-$star_lng; | |
739 return $diff_lng; | |
740 } | |
741 | |
742 | |
743 | |
744 sub do_test_assemble{ | |
745 | |
746 # not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks) | |
747 # is identical to 'pri_struct' before disassembly into parts | |
748 | |
749 my $assemble_struct; | |
750 | |
751 if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){ | |
752 if($hash_comp{"mature_arm"} eq "first"){ | |
753 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"}; | |
754 }else{ | |
755 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"}; | |
756 } | |
757 unless($assemble_struct eq $hash_comp{"pri_struct"}){ | |
758 $hash_comp{"test_assemble"}=$assemble_struct; | |
759 print_hash_comp(); | |
760 } | |
761 } | |
762 return; | |
763 } | |
764 | |
765 | |
766 | |
767 sub fill_structure{ | |
768 | |
769 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired | |
770 | |
771 my $struct=$hash_struct{$subject_old}; | |
772 my $lng=length $struct; | |
773 | |
774 #local stack for keeping track of basepairings | |
775 my @bps; | |
776 | |
777 for(my $pos=1;$pos<=$lng;$pos++){ | |
778 my $struct_pos=excise_struct($struct,$pos,$pos,"+"); | |
779 | |
780 if($struct_pos eq "("){ | |
781 push(@bps,$pos); | |
782 } | |
783 | |
784 if($struct_pos eq ")"){ | |
785 my $pos_prev=pop(@bps); | |
786 $hash_bp{$pos_prev}=$pos; | |
787 $hash_bp{$pos}=$pos_prev; | |
788 } | |
789 } | |
790 return; | |
791 } | |
792 | |
793 | |
794 | |
795 sub fill_star{ | |
796 | |
797 #fills specifics on the expected star strand into 'comp' hash ('component' hash) | |
798 | |
799 #if the mature sequence is not plausible, don't look for the star arm | |
800 my $mature_arm=$hash_comp{"mature_arm"}; | |
801 unless($mature_arm){$hash_comp{"star_arm"}=0; return;} | |
802 | |
803 #if the star sequence is not plausible, don't fill into the hash | |
804 my($star_beg,$star_end)=find_star(); | |
805 my $star_arm=arm_star($star_beg,$star_end); | |
806 unless($star_arm){return;} | |
807 | |
808 #excise expected star sequence and structure | |
809 my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+"); | |
810 my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+"); | |
811 | |
812 #fill into hash | |
813 $hash_comp{"star_beg"}=$star_beg; | |
814 $hash_comp{"star_end"}=$star_end; | |
815 $hash_comp{"star_seq"}=$star_seq; | |
816 $hash_comp{"star_struct"}=$star_struct; | |
817 $hash_comp{"star_arm"}=$star_arm; | |
818 | |
819 return; | |
820 } | |
821 | |
822 | |
823 sub find_star{ | |
824 | |
825 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions | |
826 | |
827 #the -2 is for the overhang | |
828 my $mature_beg=$hash_comp{"mature_beg"}; | |
829 my $mature_end=$hash_comp{"mature_end"}-2; | |
830 my $mature_lng=$mature_end-$mature_beg+1; | |
831 | |
832 #in some cases, the last nucleotide of the mature sequence does not form a base pair, | |
833 #and therefore does not basepair with the first nucleotide of the star sequence. | |
834 #In this case, the algorithm searches for the last nucleotide of the mature sequence | |
835 #to form a base pair. The offset is the number of nucleotides searched through. | |
836 my $offset_star_beg=0; | |
837 my $offset_beg=0; | |
838 | |
839 #the offset should not be longer than the length of the mature sequence, then it | |
840 #means that the mature sequence does not form any base pairs | |
841 while(!$offset_star_beg and $offset_beg<$mature_lng){ | |
842 if($hash_bp{$mature_end-$offset_beg}){ | |
843 $offset_star_beg=$hash_bp{$mature_end-$offset_beg}; | |
844 }else{ | |
845 $offset_beg++; | |
846 } | |
847 } | |
848 #when defining the beginning of the star sequence, compensate for the offset | |
849 my $star_beg=$offset_star_beg-$offset_beg; | |
850 | |
851 #same as above | |
852 my $offset_star_end=0; | |
853 my $offset_end=0; | |
854 while(!$offset_star_end and $offset_end<$mature_lng){ | |
855 if($hash_bp{$mature_beg+$offset_end}){ | |
856 $offset_star_end=$hash_bp{$mature_beg+$offset_end}; | |
857 }else{ | |
858 $offset_end++; | |
859 } | |
860 } | |
861 #the +2 is for the overhang | |
862 my $star_end=$offset_star_end+$offset_end+2; | |
863 | |
864 return($star_beg,$star_end); | |
865 } | |
866 | |
867 | |
868 sub fill_pri{ | |
869 | |
870 #fills basic specifics on the precursor into the 'comp' hash | |
871 | |
872 my $seq=$hash_seq{$subject_old}; | |
873 my $struct=$hash_struct{$subject_old}; | |
874 my $mfe=$hash_mfe{$subject_old}; | |
875 my $length=length $seq; | |
876 | |
877 $hash_comp{"pri_id"}=$subject_old; | |
878 $hash_comp{"pri_seq"}=$seq; | |
879 $hash_comp{"pri_struct"}=$struct; | |
880 $hash_comp{"pri_mfe"}=$mfe; | |
881 $hash_comp{"pri_beg"}=1; | |
882 $hash_comp{"pri_end"}=$length; | |
883 | |
884 return; | |
885 } | |
886 | |
887 | |
888 sub fill_mature{ | |
889 | |
890 #fills specifics on the mature sequence into the 'comp' hash | |
891 | |
892 my $mature_query=find_mature_query(); | |
893 my($mature_beg,$mature_end)=find_positions_query($mature_query); | |
894 my $mature_strand=find_strand_query($mature_query); | |
895 my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand); | |
896 my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand); | |
897 my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand); | |
898 | |
899 $hash_comp{"mature_query"}=$mature_query; | |
900 $hash_comp{"mature_beg"}=$mature_beg; | |
901 $hash_comp{"mature_end"}=$mature_end; | |
902 $hash_comp{"mature_strand"}=$mature_strand; | |
903 $hash_comp{"mature_struct"}=$mature_struct; | |
904 $hash_comp{"mature_seq"}=$mature_seq; | |
905 $hash_comp{"mature_arm"}=$mature_arm; | |
906 | |
907 return; | |
908 } | |
909 | |
910 | |
911 | |
912 sub fill_loop{ | |
913 | |
914 #fills specifics on the loop sequence into the 'comp' hash | |
915 | |
916 #unless both mature and star sequences are plausible, do not look for the loop | |
917 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} | |
918 | |
919 my $loop_beg; | |
920 my $loop_end; | |
921 | |
922 #defining the begin and end positions of the loop from the mature and star positions | |
923 #excision depends on whether the mature or star sequence is 5' of the loop ('first') | |
924 if($hash_comp{"mature_arm"} eq "first"){ | |
925 $loop_beg=$hash_comp{"mature_end"}+1; | |
926 }else{ | |
927 $loop_end=$hash_comp{"mature_beg"}-1; | |
928 } | |
929 | |
930 if($hash_comp{"star_arm"} eq "first"){ | |
931 $loop_beg=$hash_comp{"star_end"}+1; | |
932 }else{ | |
933 $loop_end=$hash_comp{"star_beg"}-1; | |
934 } | |
935 | |
936 #unless the positions are plausible, do not fill into hash | |
937 unless(test_loop($loop_beg,$loop_end)){return;} | |
938 | |
939 my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+"); | |
940 my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+"); | |
941 | |
942 $hash_comp{"loop_beg"}=$loop_beg; | |
943 $hash_comp{"loop_end"}=$loop_end; | |
944 $hash_comp{"loop_seq"}=$loop_seq; | |
945 $hash_comp{"loop_struct"}=$loop_struct; | |
946 | |
947 return; | |
948 } | |
949 | |
950 | |
951 sub fill_lower_flanks{ | |
952 | |
953 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash | |
954 | |
955 #unless both mature and star sequences are plausible, do not look for the flanks | |
956 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;} | |
957 | |
958 my $flank_first_end; | |
959 my $flank_second_beg; | |
960 | |
961 #defining the begin and end positions of the flanks from the mature and star positions | |
962 #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first') | |
963 if($hash_comp{"mature_arm"} eq "first"){ | |
964 $flank_first_end=$hash_comp{"mature_beg"}-1; | |
965 }else{ | |
966 $flank_second_beg=$hash_comp{"mature_end"}+1; | |
967 } | |
968 | |
969 if($hash_comp{"star_arm"} eq "first"){ | |
970 $flank_first_end=$hash_comp{"star_beg"}-1; | |
971 }else{ | |
972 $flank_second_beg=$hash_comp{"star_end"}+1; | |
973 } | |
974 | |
975 #unless the positions are plausible, do not fill into hash | |
976 unless(test_flanks($flank_first_end,$flank_second_beg)){return;} | |
977 | |
978 $hash_comp{"flank_first_end"}=$flank_first_end; | |
979 $hash_comp{"flank_second_beg"}=$flank_second_beg; | |
980 $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); | |
981 $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); | |
982 $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+"); | |
983 $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+"); | |
984 | |
985 if($options{z}){ | |
986 fill_stems_drosha(); | |
987 } | |
988 | |
989 return; | |
990 } | |
991 | |
992 | |
993 sub fill_stems_drosha{ | |
994 | |
995 #scores the number of base pairings formed by the first ten nt of the lower stems | |
996 #in general, the more stems, the higher the score contribution | |
997 #warning: this options has not been thoroughly tested | |
998 | |
999 my $flank_first_struct=$hash_comp{"flank_first_struct"}; | |
1000 my $flank_second_struct=$hash_comp{"flank_second_struct"}; | |
1001 | |
1002 my $stem_first=substr($flank_first_struct,-10); | |
1003 my $stem_second=substr($flank_second_struct,0,10); | |
1004 | |
1005 my $stem_bp_first=0; | |
1006 my $stem_bp_second=0; | |
1007 | |
1008 #find base pairings by simple pattern matching | |
1009 while($stem_first=~/\(/g){ | |
1010 $stem_bp_first++; | |
1011 } | |
1012 | |
1013 while($stem_second=~/\)/g){ | |
1014 $stem_bp_second++; | |
1015 } | |
1016 | |
1017 my $stem_bp=min2($stem_bp_first,$stem_bp_second); | |
1018 | |
1019 $hash_comp{"stem_first"}=$stem_first; | |
1020 $hash_comp{"stem_second"}=$stem_second; | |
1021 $hash_comp{"stem_bp_first"}=$stem_bp_first; | |
1022 $hash_comp{"stem_bp_second"}=$stem_bp_second; | |
1023 $hash_comp{"stem_bp"}=$stem_bp; | |
1024 | |
1025 return; | |
1026 } | |
1027 | |
1028 | |
1029 | |
1030 | |
1031 sub arm_mature{ | |
1032 | |
1033 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor | |
1034 | |
1035 my ($beg,$end,$strand)=@_; | |
1036 | |
1037 #mature and star sequences should alway be on plus strand | |
1038 if($strand eq "-"){return 0;} | |
1039 | |
1040 #there should be no bifurcations and minimum one base pairing | |
1041 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand); | |
1042 if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){ | |
1043 return "first"; | |
1044 }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){ | |
1045 return "second"; | |
1046 } | |
1047 return 0; | |
1048 } | |
1049 | |
1050 | |
1051 sub arm_star{ | |
1052 | |
1053 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor | |
1054 | |
1055 my ($beg,$end)=@_; | |
1056 | |
1057 #unless the begin and end positions are plausible, test negative | |
1058 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} | |
1059 | |
1060 #no overlap between the mature and the star sequence | |
1061 if($hash_comp{"mature_arm"} eq "first"){ | |
1062 ($hash_comp{"mature_end"}<$beg) or return 0; | |
1063 }elsif($hash_comp{"mature_arm"} eq "second"){ | |
1064 ($end<$hash_comp{"mature_beg"}) or return 0; | |
1065 } | |
1066 | |
1067 #there should be no bifurcations and minimum one base pairing | |
1068 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+"); | |
1069 if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){ | |
1070 return "first"; | |
1071 }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){ | |
1072 return "second"; | |
1073 } | |
1074 return 0; | |
1075 } | |
1076 | |
1077 | |
1078 sub test_loop{ | |
1079 | |
1080 #tests the loop positions | |
1081 | |
1082 my ($beg,$end)=@_; | |
1083 | |
1084 #unless the begin and end positions are plausible, test negative | |
1085 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} | |
1086 | |
1087 return 1; | |
1088 } | |
1089 | |
1090 | |
1091 sub test_flanks{ | |
1092 | |
1093 #tests the positions of the lower flanks | |
1094 | |
1095 my ($beg,$end)=@_; | |
1096 | |
1097 #unless the begin and end positions are plausible, test negative | |
1098 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;} | |
1099 | |
1100 return 1; | |
1101 } | |
1102 | |
1103 | |
1104 sub comp{ | |
1105 | |
1106 #subroutine to retrive from the 'comp' hash | |
1107 | |
1108 my $type=shift; | |
1109 my $component=$hash_comp{$type}; | |
1110 return $component; | |
1111 } | |
1112 | |
1113 | |
1114 sub find_strand_query{ | |
1115 | |
1116 #subroutine to find the strand for a given query | |
1117 | |
1118 my $query=shift; | |
1119 my $strand=$hash_query{$query}{"strand"}; | |
1120 return $strand; | |
1121 } | |
1122 | |
1123 | |
1124 sub find_positions_query{ | |
1125 | |
1126 #subroutine to find the begin and end positions for a given query | |
1127 | |
1128 my $query=shift; | |
1129 my $beg=$hash_query{$query}{"subject_beg"}; | |
1130 my $end=$hash_query{$query}{"subject_end"}; | |
1131 return ($beg,$end); | |
1132 } | |
1133 | |
1134 | |
1135 | |
1136 sub find_mature_query{ | |
1137 | |
1138 #finds the query with the highest frequency of reads and returns it | |
1139 #is used to determine the positions of the potential mature sequence | |
1140 | |
1141 my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query; | |
1142 my $mature_query=$queries[0]; | |
1143 return $mature_query; | |
1144 } | |
1145 | |
1146 | |
1147 | |
1148 | |
1149 sub reset_variables{ | |
1150 | |
1151 #resets the hashes for the next potential precursor | |
1152 | |
1153 # %hash_query=(); | |
1154 # %hash_comp=(); | |
1155 # %hash_bp=(); | |
1156 foreach my $key (keys %hash_query) {delete($hash_query{$key});} | |
1157 foreach my $key (keys %hash_comp) {delete($hash_comp{$key});} | |
1158 foreach my $key (keys %hash_bp) {delete($hash_bp{$key});} | |
1159 | |
1160 # $message_filter=(); | |
1161 # $message_score=(); | |
1162 # $lines=(); | |
1163 undef($message_filter); | |
1164 undef($message_score); | |
1165 undef($lines); | |
1166 return; | |
1167 } | |
1168 | |
1169 | |
1170 | |
1171 sub excise_seq{ | |
1172 | |
1173 #excise sub sequence from the potential precursor | |
1174 | |
1175 my($seq,$beg,$end,$strand)=@_; | |
1176 | |
1177 #begin can be equal to end if only one nucleotide is excised | |
1178 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} | |
1179 | |
1180 #rarely, permuted combinations of signature and structure cause out of bound excision errors. | |
1181 #this happens once appr. every two thousand combinations | |
1182 unless($beg<=length($seq)){$out_of_bound++;return 0;} | |
1183 | |
1184 #if on the minus strand, the reverse complement should be excised | |
1185 if($strand eq "-"){$seq=revcom($seq);} | |
1186 | |
1187 #the blast parsed format is 1-indexed, substr is 0-indexed | |
1188 my $sub_seq=substr($seq,$beg-1,$end-$beg+1); | |
1189 | |
1190 return $sub_seq; | |
1191 | |
1192 } | |
1193 | |
1194 sub excise_struct{ | |
1195 | |
1196 #excise sub structure | |
1197 | |
1198 my($struct,$beg,$end,$strand)=@_; | |
1199 my $lng=length $struct; | |
1200 | |
1201 #begin can be equal to end if only one nucleotide is excised | |
1202 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;} | |
1203 | |
1204 #rarely, permuted combinations of signature and structure cause out of bound excision errors. | |
1205 #this happens once appr. every two thousand combinations | |
1206 unless($beg<=length($struct)){return 0;} | |
1207 | |
1208 #if excising relative to minus strand, positions are reversed | |
1209 if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);} | |
1210 | |
1211 #the blast parsed format is 1-indexed, substr is 0-indexed | |
1212 my $sub_struct=substr($struct,$beg-1,$end-$beg+1); | |
1213 | |
1214 return $sub_struct; | |
1215 } | |
1216 | |
1217 | |
1218 sub create_hash_nuclei{ | |
1219 #parses a fasta file with sequences of known miRNAs considered for conservation purposes | |
1220 #reads the nuclei into a hash | |
1221 | |
1222 my ($file) = @_; | |
1223 my ($id, $desc, $sequence, $nucleus) = (); | |
1224 | |
1225 open (FASTA, "<$file") or die "can not open $file\n"; | |
1226 while (<FASTA>) | |
1227 { | |
1228 chomp; | |
1229 if (/^>(\S+)(.*)/) | |
1230 { | |
1231 $id = $1; | |
1232 $desc = $2; | |
1233 $sequence = ""; | |
1234 $nucleus = ""; | |
1235 while (<FASTA>){ | |
1236 chomp; | |
1237 if (/^>(\S+)(.*)/){ | |
1238 $nucleus = substr($sequence,1,$nucleus_lng); | |
1239 $nucleus =~ tr/[T]/[U]/; | |
1240 $hash_mirs{$nucleus} .="$id\t"; | |
1241 $hash_nuclei{$nucleus} += 1; | |
1242 | |
1243 $id = $1; | |
1244 $desc = $2; | |
1245 $sequence = ""; | |
1246 $nucleus = ""; | |
1247 next; | |
1248 } | |
1249 $sequence .= $_; | |
1250 } | |
1251 } | |
1252 } | |
1253 $nucleus = substr($sequence,1,$nucleus_lng); | |
1254 $nucleus =~ tr/[T]/[U]/; | |
1255 $hash_mirs{$nucleus} .="$id\t"; | |
1256 $hash_nuclei{$nucleus} += 1; | |
1257 close FASTA; | |
1258 } | |
1259 | |
1260 | |
1261 sub parse_file_struct{ | |
1262 #parses the output from RNAfoldand reads it into hashes | |
1263 my($file) = @_; | |
1264 my($id,$desc,$seq,$struct,$mfe) = (); | |
1265 open (FILE_STRUCT, "<$file") or die "can not open $file\n"; | |
1266 while (<FILE_STRUCT>){ | |
1267 chomp; | |
1268 if (/^>(\S+)\s*(.*)/){ | |
1269 $id= $1; | |
1270 $desc= $2; | |
1271 $seq= ""; | |
1272 $struct= ""; | |
1273 $mfe= ""; | |
1274 while (<FILE_STRUCT>){ | |
1275 chomp; | |
1276 if (/^>(\S+)\s*(.*)/){ | |
1277 $hash_desc{$id} = $desc; | |
1278 $hash_seq{$id} = $seq; | |
1279 $hash_struct{$id} = $struct; | |
1280 $hash_mfe{$id} = $mfe; | |
1281 $id = $1; | |
1282 $desc = $2; | |
1283 $seq = ""; | |
1284 $struct = ""; | |
1285 $mfe = ""; | |
1286 next; | |
1287 } | |
1288 if(/^\w/){ | |
1289 tr/uU/tT/; | |
1290 $seq .= $_; | |
1291 next; | |
1292 } | |
1293 if(/((\.|\(|\))+)/){$struct .=$1;} | |
1294 if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;} | |
1295 } | |
1296 } | |
1297 } | |
1298 $hash_desc{$id} = $desc; | |
1299 $hash_seq{$id} = $seq; | |
1300 $hash_struct{$id} = $struct; | |
1301 $hash_mfe{$id} = $mfe; | |
1302 close FILE_STRUCT; | |
1303 return; | |
1304 } | |
1305 | |
1306 | |
1307 sub score_s{ | |
1308 | |
1309 #this score message is appended to the end of the string of score messages outputted for the potential precursor | |
1310 | |
1311 my $message=shift; | |
1312 $message_score.=$message."\n";; | |
1313 return; | |
1314 } | |
1315 | |
1316 | |
1317 | |
1318 sub score_p{ | |
1319 | |
1320 #this score message is appended to the beginning of the string of score messages outputted for the potential precursor | |
1321 | |
1322 my $message=shift; | |
1323 $message_score=$message."\n".$message_score; | |
1324 return; | |
1325 } | |
1326 | |
1327 | |
1328 | |
1329 sub filter_s{ | |
1330 | |
1331 #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor | |
1332 | |
1333 my $message=shift; | |
1334 $message_filter.=$message."\n"; | |
1335 return; | |
1336 } | |
1337 | |
1338 | |
1339 sub filter_p{ | |
1340 | |
1341 #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor | |
1342 | |
1343 my $message=shift; | |
1344 if(defined $message_filter){$message_filter=$message."\n".$message_filter;} | |
1345 else{$message_filter=$message."\n";} | |
1346 return; | |
1347 } | |
1348 | |
1349 | |
1350 sub find_freq{ | |
1351 | |
1352 #finds the frequency of a given read query from its id. | |
1353 | |
1354 my($query)=@_; | |
1355 | |
1356 if($query=~/x(\d+)/i){ | |
1357 my $freq=$1; | |
1358 return $freq; | |
1359 }else{ | |
1360 #print STDERR "Problem with read format\n"; | |
1361 return 0; | |
1362 } | |
1363 } | |
1364 | |
1365 | |
1366 sub print_hash_comp{ | |
1367 | |
1368 #prints the 'comp' hash | |
1369 | |
1370 my @keys=sort keys %hash_comp; | |
1371 foreach my $key(@keys){ | |
1372 my $value=$hash_comp{$key}; | |
1373 print "$key \t$value\n"; | |
1374 } | |
1375 } | |
1376 | |
1377 | |
1378 | |
1379 sub print_hash_bp{ | |
1380 | |
1381 #prints the 'bp' hash | |
1382 | |
1383 my @keys=sort {$a<=>$b} keys %hash_bp; | |
1384 foreach my $key(@keys){ | |
1385 my $value=$hash_bp{$key}; | |
1386 print "$key\t$value\n"; | |
1387 } | |
1388 print "\n"; | |
1389 } | |
1390 | |
1391 | |
1392 | |
1393 sub find_strand{ | |
1394 | |
1395 #A subroutine to find the strand, parsing different blast formats | |
1396 | |
1397 my($other)=@_; | |
1398 | |
1399 my $strand="+"; | |
1400 | |
1401 if($other=~/-/){ | |
1402 $strand="-"; | |
1403 } | |
1404 | |
1405 if($other=~/minus/i){ | |
1406 $strand="-"; | |
1407 } | |
1408 return($strand); | |
1409 } | |
1410 | |
1411 | |
1412 sub contained{ | |
1413 | |
1414 #Is the stretch defined by the first positions contained in the stretch defined by the second? | |
1415 | |
1416 my($beg1,$end1,$beg2,$end2)=@_; | |
1417 | |
1418 testbeginend($beg1,$end1,$beg2,$end2); | |
1419 | |
1420 if($beg2<=$beg1 and $end1<=$end2){ | |
1421 return 1; | |
1422 }else{ | |
1423 return 0; | |
1424 } | |
1425 } | |
1426 | |
1427 | |
1428 sub testbeginend{ | |
1429 | |
1430 #Are the beginposition numerically smaller than the endposition for each pair? | |
1431 | |
1432 my($begin1,$end1,$begin2,$end2)=@_; | |
1433 | |
1434 unless($begin1<=$end1 and $begin2<=$end2){ | |
1435 print STDERR "beg can not be larger than end for $subject_old\n"; | |
1436 exit; | |
1437 } | |
1438 } | |
1439 | |
1440 | |
1441 sub rev_pos{ | |
1442 | |
1443 # The blast_parsed format always uses positions that are relative to the 5' of the given strand | |
1444 # This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with | |
1445 # the n't nucleotide on the plus strand | |
1446 | |
1447 # This subroutine reverses the begin and end positions of positions of the minus strand so that they | |
1448 # are relative to the 5' end of the plus strand | |
1449 | |
1450 my($beg,$end,$lng)=@_; | |
1451 | |
1452 my $new_end=$lng-$beg+1; | |
1453 my $new_beg=$lng-$end+1; | |
1454 | |
1455 return($new_beg,$new_end); | |
1456 } | |
1457 | |
1458 sub round { | |
1459 | |
1460 #rounds to nearest integer | |
1461 | |
1462 my($number) = shift; | |
1463 return int($number + .5); | |
1464 | |
1465 } | |
1466 | |
1467 | |
1468 sub rev{ | |
1469 | |
1470 #reverses the order of nucleotides in a sequence | |
1471 | |
1472 my($sequence)=@_; | |
1473 | |
1474 my $rev=reverse $sequence; | |
1475 | |
1476 return $rev; | |
1477 } | |
1478 | |
1479 sub com{ | |
1480 | |
1481 #the complementary of a sequence | |
1482 | |
1483 my($sequence)=@_; | |
1484 | |
1485 $sequence=~tr/acgtuACGTU/TGCAATGCAA/; | |
1486 | |
1487 return $sequence; | |
1488 } | |
1489 | |
1490 sub revcom{ | |
1491 | |
1492 #reverse complement | |
1493 | |
1494 my($sequence)=@_; | |
1495 | |
1496 my $revcom=rev(com($sequence)); | |
1497 | |
1498 return $revcom; | |
1499 } | |
1500 | |
1501 | |
1502 sub max2 { | |
1503 | |
1504 #max of two numbers | |
1505 | |
1506 my($a, $b) = @_; | |
1507 return ($a>$b ? $a : $b); | |
1508 } | |
1509 | |
1510 sub min2 { | |
1511 | |
1512 #min of two numbers | |
1513 | |
1514 my($a, $b) = @_; | |
1515 return ($a<$b ? $a : $b); | |
1516 } | |
1517 | |
1518 | |
1519 | |
1520 sub score_freq{ | |
1521 | |
1522 # scores the count of reads that map to the potential precursor | |
1523 # Assumes geometric distribution as described in methods section of manuscript | |
1524 | |
1525 my $freq=shift; | |
1526 | |
1527 #parameters of known precursors and background hairpins | |
1528 my $parameter_test=0.999; | |
1529 my $parameter_control=0.6; | |
1530 | |
1531 #log_odds calculated directly to avoid underflow | |
1532 my $intercept=log((1-$parameter_test)/(1-$parameter_control)); | |
1533 my $slope=log($parameter_test/$parameter_control); | |
1534 my $log_odds=$slope*$freq+$intercept; | |
1535 | |
1536 #if no strong evidence for 3' overhangs, limit the score contribution to 0 | |
1537 unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);} | |
1538 | |
1539 return $log_odds; | |
1540 } | |
1541 | |
1542 | |
1543 | |
1544 ##sub score_mfe{ | |
1545 | |
1546 # scores the minimum free energy in kCal/mol of the potential precursor | |
1547 # Assumes Gumbel distribution as described in methods section of manuscript | |
1548 | |
1549 ## my $mfe=shift; | |
1550 | |
1551 #numerical value, minimum 1 | |
1552 ## my $mfe_adj=max2(1,-$mfe); | |
1553 | |
1554 #parameters of known precursors and background hairpins, scale and location | |
1555 ## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); | |
1556 ## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); | |
1557 | |
1558 ## my $odds=$prob_test/$prob_background; | |
1559 ## my $log_odds=log($odds); | |
1560 | |
1561 ## return $log_odds; | |
1562 ##} | |
1563 | |
1564 sub score_mfe{ | |
1565 # use bignum; | |
1566 | |
1567 # scores the minimum free energy in kCal/mol of the potential precursor | |
1568 # Assumes Gumbel distribution as described in methods section of manuscript | |
1569 | |
1570 my ($mfe,$mlng)=@_; | |
1571 | |
1572 #numerical value, minimum 1 | |
1573 my $mfe_adj=max2(1,-$mfe); | |
1574 my $mfe_adj1=$mfe/$mlng; | |
1575 #parameters of known precursors and background hairpins, scale and location | |
1576 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834; | |
1577 my $ev=$e**($mfe_adj1*$c); | |
1578 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t"; | |
1579 my $log_odds=($a/($b+$ev)); | |
1580 | |
1581 | |
1582 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32); | |
1583 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23); | |
1584 | |
1585 my $odds=$prob_test/$prob_background; | |
1586 my $log_odds_2=log($odds); | |
1587 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n"; | |
1588 return $log_odds; | |
1589 } | |
1590 | |
1591 | |
1592 | |
1593 sub prob_gumbel_discretized{ | |
1594 | |
1595 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol | |
1596 # uses the subroutine that calculates the cdf to find the probabilities | |
1597 | |
1598 my ($var,$scale,$location)=@_; | |
1599 | |
1600 my $bound_lower=$var-0.5; | |
1601 my $bound_upper=$var+0.5; | |
1602 | |
1603 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location); | |
1604 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location); | |
1605 | |
1606 my $prob=$cdf_upper-$cdf_lower; | |
1607 | |
1608 return $prob; | |
1609 } | |
1610 | |
1611 | |
1612 sub cdf_gumbel{ | |
1613 | |
1614 # calculates the cumulative distribution function of the Gumbel distribution | |
1615 | |
1616 my ($var,$scale,$location)=@_; | |
1617 | |
1618 my $cdf=$e**(-($e**(-($var-$location)/$scale))); | |
1619 | |
1620 return $cdf; | |
1621 } | |
1622 |