50
|
1 #!/usr/bin/perl
|
|
2
|
|
3 use warnings;
|
|
4 use strict;
|
|
5 use Getopt::Std;
|
|
6 #use RNA;
|
|
7
|
|
8
|
|
9 ################################# MIRDEEP #################################################
|
|
10
|
|
11 ################################## USAGE ##################################################
|
|
12
|
|
13
|
|
14 my $usage=
|
|
15 "$0 file_signature file_structure temp_out_directory
|
|
16
|
|
17 This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with
|
|
18 information on the positions of reads aligned to potential precursor sequences (signature).
|
|
19 It also takes as input an RNAfold output file, giving information on the sequence, structure
|
|
20 and mimimum free energy of the potential precursor sequences.
|
|
21
|
|
22 Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA
|
|
23 sequences that should be considered for conservation purposes. -t prints out the potential
|
|
24 precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that
|
|
25 exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors
|
|
26 that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences
|
|
27 obtained through conventional cloning. -z consider the number of base pairings in the lower
|
|
28 stems (this option is not well tested).
|
|
29
|
|
30 -h print this usage
|
|
31 -s fasta file with known miRNAs
|
|
32 #-o temp directory ,maked befor running the program.
|
|
33 -t print filtered
|
|
34 -u limited output (only ids)
|
|
35 -v cut-off (default 1)
|
|
36 -x sensitive option for Sanger sequences
|
|
37 -y use Randfold
|
|
38 -z consider Drosha processing
|
|
39 ";
|
|
40
|
|
41
|
|
42
|
|
43
|
|
44
|
|
45 ############################################################################################
|
|
46
|
|
47 ################################### INPUT ##################################################
|
|
48
|
|
49
|
|
50 #signature file in blast_parsed format
|
|
51 my $file_blast_parsed=shift or die $usage;
|
|
52
|
|
53 #structure file outputted from RNAfold
|
|
54 my $file_struct=shift or die $usage;
|
|
55
|
|
56 my $tmpdir=shift or die $usage;
|
|
57 #options
|
|
58 my %options=();
|
|
59 getopts("hs:tuv:xyz",\%options);
|
|
60
|
|
61
|
|
62
|
|
63
|
|
64
|
|
65
|
|
66 #############################################################################################
|
|
67
|
|
68 ############################# GLOBAL VARIABLES ##############################################
|
|
69
|
|
70
|
|
71 #parameters
|
|
72 my $nucleus_lng=11;
|
|
73
|
|
74 my $score_star=3.9;
|
|
75 my $score_star_not=-1.3;
|
|
76 my $score_nucleus=7.63;
|
|
77 my $score_nucleus_not=-1.17;
|
|
78 my $score_randfold=1.37;
|
|
79 my $score_randfold_not=-3.624;
|
|
80 my $score_intercept=0.3;
|
|
81 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
|
|
82 my $score_min=1;
|
|
83 if($options{v}){$score_min=$options{v};}
|
|
84 if($options{x}){$score_min=-5;}
|
|
85
|
|
86 my $e=2.718281828;
|
|
87
|
|
88 #hashes
|
|
89 my %hash_desc;
|
|
90 my %hash_seq;
|
|
91 my %hash_struct;
|
|
92 my %hash_mfe;
|
|
93 my %hash_nuclei;
|
|
94 my %hash_mirs;
|
|
95 my %hash_query;
|
|
96 my %hash_comp;
|
|
97 my %hash_bp;
|
|
98
|
|
99 #other variables
|
|
100 my $subject_old;
|
|
101 my $message_filter;
|
|
102 my $message_score;
|
|
103 my $lines;
|
|
104 my $out_of_bound;
|
|
105
|
|
106
|
|
107
|
|
108 ##############################################################################################
|
|
109
|
|
110 ################################ MAIN ######################################################
|
|
111
|
|
112
|
|
113 #print help if that option is used
|
|
114 if($options{h}){die $usage;}
|
|
115 unless ($tmpdir=~/\/$/) {$tmpdir .="/";}
|
|
116 if(!(-s $tmpdir)){mkdir $tmpdir;}
|
|
117 $tmpdir .="TMP_DIR/";
|
|
118 mkdir $tmpdir;
|
|
119
|
|
120 #parse structure file outputted from RNAfold
|
|
121 parse_file_struct($file_struct);
|
|
122
|
|
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed
|
|
124 if($options{s}){create_hash_nuclei($options{s})};
|
|
125
|
|
126 #parse signature file in blast_parsed format and resolve each potential precursor
|
|
127 parse_file_blast_parsed($file_blast_parsed);
|
|
128 #`rm -rf $tmpdir`;
|
|
129 exit;
|
|
130
|
|
131
|
|
132
|
|
133
|
|
134 ##############################################################################################
|
|
135
|
|
136 ############################## SUBROUTINES ###################################################
|
|
137
|
|
138
|
|
139
|
|
140 sub parse_file_blast_parsed{
|
|
141
|
|
142 # read through the signature blastparsed file, fills up a hash with information on queries
|
|
143 # (deep sequences) mapping to the current subject (potential precursor) and resolve each
|
|
144 # potential precursor in turn
|
|
145
|
|
146 my $file_blast_parsed=shift;
|
|
147
|
|
148 open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n";
|
|
149 while (my $line=<FILE_BLAST_PARSED>){
|
|
150 if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){
|
|
151 my $query=$1;
|
|
152 my $query_lng=$2;
|
|
153 my $query_beg=$3;
|
|
154 my $query_end=$4;
|
|
155 my $subject=$5;
|
|
156 my $subject_lng=$6;
|
|
157 my $subject_beg=$7;
|
|
158 my $subject_end=$8;
|
|
159 my $e_value=$9;
|
|
160 my $pid=$10;
|
|
161 my $bitscore=$11;
|
|
162 my $other=$12;
|
|
163
|
|
164 #if the new line concerns a new subject (potential precursor) then the old subject must be resolved
|
|
165 if($subject_old and $subject_old ne $subject){
|
|
166 resolve_potential_precursor();
|
|
167 }
|
|
168
|
|
169 #resolve the strand
|
|
170 my $strand=find_strand($other);
|
|
171
|
|
172 #resolve the number of reads that the deep sequence represents
|
|
173 my $freq=find_freq($query);
|
|
174
|
|
175 #read information of the query (deep sequence) into hash
|
|
176 $hash_query{$query}{"subject_beg"}=$subject_beg;
|
|
177 $hash_query{$query}{"subject_end"}=$subject_end;
|
|
178 $hash_query{$query}{"strand"}=$strand;
|
|
179 $hash_query{$query}{"freq"}=$freq;
|
|
180
|
|
181 #save the signature information
|
|
182 $lines.=$line;
|
|
183
|
|
184 $subject_old=$subject;
|
|
185 }
|
|
186 }
|
|
187 resolve_potential_precursor();
|
|
188 }
|
|
189
|
|
190 sub resolve_potential_precursor{
|
|
191
|
|
192 # dissects the potential precursor in parts by filling hashes, and tests if it passes the
|
|
193 # initial filter and the scoring filter
|
|
194
|
|
195 # binary variable whether the potential precursor is still viable
|
|
196 my $ret=1;
|
|
197 #print STDERR ">$subject_old\n";
|
|
198
|
|
199 fill_structure();
|
|
200 #print STDERR "\%hash_bp",scalar keys %hash_bp,"\n";
|
|
201 fill_pri();
|
|
202 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
|
|
203
|
|
204 fill_mature();
|
|
205 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
|
|
206
|
|
207 fill_star();
|
|
208 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
|
|
209
|
|
210 fill_loop();
|
|
211 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
|
|
212
|
|
213 fill_lower_flanks();
|
|
214 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
|
|
215
|
|
216 # do_test_assemble();
|
|
217
|
|
218 # this is the actual classification
|
|
219 unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;}
|
|
220
|
|
221 print_results($ret);
|
|
222
|
|
223 reset_variables();
|
|
224
|
|
225 return;
|
|
226
|
|
227 }
|
|
228
|
|
229
|
|
230
|
|
231 sub print_results{
|
|
232
|
|
233 my $ret=shift;
|
|
234
|
|
235 # print out if the precursor is accepted and accepted precursors should be printed out
|
|
236 # or if the potential precursor is discarded and discarded potential precursors should
|
|
237 # be printed out
|
|
238
|
|
239 if((!$options{t} and $ret) or ($options{t} and !$ret)){
|
|
240 #full output
|
|
241 unless($options{u}){
|
|
242 if($message_filter){print $message_filter;}
|
|
243 if($message_score){print $message_score;}
|
|
244 print_hash_comp();
|
|
245 print $lines,"\n\n";
|
|
246 return;
|
|
247 }
|
|
248 #limited output (only ids)
|
|
249 my $id=$hash_comp{"pri_id"};
|
|
250 print "$id\n";
|
|
251 }
|
|
252 }
|
|
253
|
|
254
|
|
255
|
|
256
|
|
257
|
|
258
|
|
259
|
|
260 sub pass_threshold_score{
|
|
261
|
|
262 # this is the scoring
|
|
263
|
|
264 #minimum free energy of the potential precursor
|
|
265 # my $score_mfe=score_mfe($hash_comp{"pri_mfe"});
|
|
266 my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"});
|
|
267
|
|
268 #count of reads that map in accordance with Dicer processing
|
|
269 my $score_freq=score_freq($hash_comp{"freq"});
|
|
270 #print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n";
|
|
271
|
|
272 #basic score
|
|
273 my $score=$score_mfe+$score_freq;
|
|
274
|
|
275 #scoring of conserved nucleus/seed (optional)
|
|
276 if($options{s}){
|
|
277
|
|
278 #if the nucleus is conserved
|
|
279 if(test_nucleus_conservation()){
|
|
280
|
|
281 #nucleus from position 2-8
|
|
282 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
|
|
283
|
|
284 #resolve DNA/RNA ambiguities
|
|
285 $nucleus=~tr/[T]/[U]/;
|
|
286
|
|
287 #print score contribution
|
|
288 score_s("score_nucleus\t$score_nucleus");
|
|
289
|
|
290 #print the ids of known miRNAs with same nucleus
|
|
291 score_s("$hash_mirs{$nucleus}");
|
|
292 #print STDERR "score_nucleus\t$score_nucleus\n";
|
|
293
|
|
294 #add to score
|
|
295 $score+=$score_nucleus;
|
|
296
|
|
297 #if the nucleus is not conserved
|
|
298 }else{
|
|
299 #print (negative) score contribution
|
|
300 score_s("score_nucleus\t$score_nucleus_not");
|
|
301
|
|
302 #add (negative) score contribution
|
|
303 $score+=$score_nucleus_not;
|
|
304 }
|
|
305 }
|
|
306
|
|
307 #if the majority of potential star reads fall as expected from Dicer processing
|
|
308 if($hash_comp{"star_read"}){
|
|
309 score_s("score_star\t$score_star");
|
|
310 #print STDERR "score_star\t$score_star\n";
|
|
311 $score+=$score_star;
|
|
312 }else{
|
|
313 score_s("score_star\t$score_star_not");
|
|
314 #print STDERR "score_star_not\t$score_star_not\n";
|
|
315 $score+=$score_star_not;
|
|
316 }
|
|
317
|
|
318 #score lower stems for potential for Drosha recognition (highly optional)
|
|
319 if($options{z}){
|
|
320 my $stem_bp=$hash_comp{"stem_bp"};
|
|
321 my $score_stem=$scores_stem[$stem_bp];
|
|
322 $score+=$score_stem;
|
|
323 score_s("score_stem\t$score_stem");
|
|
324 }
|
|
325
|
|
326 #print STDERR "score_intercept\t$score_intercept\n";
|
|
327
|
|
328 $score+=$score_intercept;
|
|
329
|
|
330 #score for randfold (optional)
|
|
331 if($options{y}){
|
|
332
|
|
333 # only calculate randfold value if it can make the difference between the potential precursor
|
|
334 # being accepted or discarded
|
|
335 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){
|
|
336
|
|
337 #randfold value<0.05
|
|
338 if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");}
|
|
339
|
|
340 #randfold value>0.05
|
|
341 else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");}
|
|
342 }
|
|
343 }
|
|
344
|
|
345 #round off values to one decimal
|
|
346 my $round_mfe=round($score_mfe*10)/10;
|
|
347 my $round_freq=round($score_freq*10)/10;
|
|
348 my $round=round($score*10)/10;
|
|
349
|
|
350 #print scores
|
|
351 score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round");
|
|
352
|
|
353 #return 1 if the potential precursor is accepted, return 0 if discarded
|
|
354 unless($score>=$score_min){return 0;}
|
|
355 return 1;
|
|
356 }
|
|
357
|
|
358 sub test_randfold{
|
|
359
|
|
360 #print sequence to temporary file, test randfold value, return 1 or 0
|
|
361
|
|
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
|
|
363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
|
|
364 #open(FILE, ">$tmpfile");
|
|
365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
|
|
366 #close FILE;
|
|
367
|
|
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
|
|
369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
|
|
370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
|
|
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999);
|
|
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999);
|
|
373 my $p_value=($p1+$p2)/2;
|
|
374 wait;
|
|
375 # system "rm $tmpfile";
|
|
376
|
|
377 if($p_value<=0.05){return 1;}
|
|
378
|
|
379 return 0;
|
|
380 }
|
|
381
|
|
382 sub randfold_pvalue{
|
|
383 my $cpt_sup = 0;
|
|
384 my $cpt_inf = 0;
|
|
385 my $cpt_ega = 1;
|
|
386
|
|
387 my ($seq,$number_of_randomizations)=@_;
|
|
388 #my $str =$seq;
|
|
389 #my $mfe = RNA::fold($seq,$str);
|
|
390 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
|
|
391 my @rawfolds=split/\s+/,$rnafold;
|
|
392 my $str=$rawfolds[1];
|
|
393 my $mfe=$rawfolds[-1];
|
|
394 $mfe=~s/\(//;
|
|
395 $mfe=~s/\)//;
|
|
396
|
|
397 for (my $i=0;$i<$number_of_randomizations;$i++) {
|
|
398 $seq = shuffle_sequence_dinucleotide($seq);
|
|
399 #$str = $seq;
|
|
400
|
|
401 #my $rand_mfe = RNA::fold($str,$str);
|
|
402 $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
|
|
403 my @rawfolds=split/\s+/,$rnafold;
|
|
404 my $str=$rawfolds[1];
|
|
405 my $rand_mfe=$rawfolds[-1];
|
|
406 $rand_mfe=~s/\(//;
|
|
407 $rand_mfe=~s/\)//;
|
|
408
|
|
409 if ($rand_mfe < $mfe) {
|
|
410 $cpt_inf++;
|
|
411 }
|
|
412 if ($rand_mfe == $mfe) {
|
|
413 $cpt_ega++;
|
|
414 }
|
|
415 if ($rand_mfe > $mfe) {
|
|
416 $cpt_sup++;
|
|
417 }
|
|
418 }
|
|
419
|
|
420 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);
|
|
421
|
|
422 #print "$name\t$mfe\t$proba\n";
|
|
423 return $proba;
|
|
424 }
|
|
425
|
|
426 sub shuffle_sequence_dinucleotide {
|
|
427
|
|
428 my ($str) = @_;
|
|
429
|
|
430 # upper case and convert to ATGC
|
|
431 $str = uc($str);
|
|
432 $str =~ s/U/T/g;
|
|
433
|
|
434 my @nuc = ('A','T','G','C');
|
|
435 my $count_swap = 0;
|
|
436 # set maximum number of permutations
|
|
437 my $stop = length($str) * 10;
|
|
438
|
|
439 while($count_swap < $stop) {
|
|
440
|
|
441 my @pos;
|
|
442
|
|
443 # look start and end letters
|
|
444 my $firstnuc = $nuc[int(rand 4)];
|
|
445 my $thirdnuc = $nuc[int(rand 4)];
|
|
446
|
|
447 # get positions for matching nucleotides
|
|
448 for (my $i=0;$i<(length($str)-2);$i++) {
|
|
449 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
|
|
450 push (@pos,($i+1));
|
|
451 $i++;
|
|
452 }
|
|
453 }
|
|
454
|
|
455 # swap at random trinucleotides
|
|
456 my $max = scalar(@pos);
|
|
457 for (my $i=0;$i<$max;$i++) {
|
|
458 my $swap = int(rand($max));
|
|
459 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {
|
|
460 $count_swap++;
|
|
461 my $w1 = substr($str,$pos[$i],1);
|
|
462 my $w2 = substr($str,$pos[$swap],1);
|
|
463 substr($str,$pos[$i],1,$w2);
|
|
464 substr($str,$pos[$swap],1,$w1);
|
|
465 }
|
|
466 }
|
|
467 }
|
|
468 return($str);
|
|
469 }
|
|
470
|
|
471 sub test_nucleus_conservation{
|
|
472
|
|
473 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0
|
|
474
|
|
475 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
|
|
476 $nucleus=~tr/[T]/[U]/;
|
|
477 if($hash_nuclei{$nucleus}){return 1;}
|
|
478
|
|
479 return 0;
|
|
480 }
|
|
481
|
|
482
|
|
483
|
|
484 sub pass_filtering_initial{
|
|
485
|
|
486 #test if the structure forms a plausible hairpin
|
|
487 unless(pass_filtering_structure()){filter_p("structure problem"); return 0;}
|
|
488
|
|
489 #test if >90% of reads map to the hairpin in consistence with Dicer processing
|
|
490 unless(pass_filtering_signature()){filter_p("signature problem");return 0;}
|
|
491
|
|
492 return 1;
|
|
493
|
|
494 }
|
|
495
|
|
496
|
|
497 sub pass_filtering_signature{
|
|
498
|
|
499 #number of reads that map in consistence with Dicer processing
|
|
500 my $consistent=0;
|
|
501
|
|
502 #number of reads that map inconsistent with Dicer processing
|
|
503 my $inconsistent=0;
|
|
504
|
|
505 # number of potential star reads map in good consistence with Drosha/Dicer processing
|
|
506 # (3' overhangs relative to mature product)
|
|
507 my $star_perfect=0;
|
|
508
|
|
509 # number of potential star reads that do not map in good consistence with 3' overhang
|
|
510 my $star_fuzzy=0;
|
|
511
|
|
512
|
|
513 #sort queries (deep sequences) by their position on the hairpin
|
|
514 my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query;
|
|
515
|
|
516 foreach my $query(@queries){
|
|
517
|
|
518 #number of reads that the deep sequence represents
|
|
519 unless(defined($hash_query{$query}{"freq"})){next;}
|
|
520 my $query_freq=$hash_query{$query}{"freq"};
|
|
521
|
|
522 #test which Dicer product (if any) the deep sequence corresponds to
|
|
523 my $product=test_query($query);
|
|
524
|
|
525 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
|
|
526 if($product){$consistent+=$query_freq;}
|
|
527
|
|
528 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
|
|
529 else{$inconsistent+=$query_freq;}
|
|
530
|
|
531 #test a potential star sequence has good 3' overhang
|
|
532 if($product eq "star"){
|
|
533 if(test_star($query)){$star_perfect+=$query_freq;}
|
|
534 else{$star_fuzzy+=$query_freq;}
|
|
535 }
|
|
536 }
|
|
537
|
|
538 # if the majority of potential star sequences map in good accordance with 3' overhang
|
|
539 # score for the presence of star evidence
|
|
540 if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;}
|
|
541
|
|
542 #total number of reads mapping to the hairpin
|
|
543 my $freq=$consistent+$inconsistent;
|
|
544 $hash_comp{"freq"}=$freq;
|
|
545 unless($freq>0){filter_s("read frequency too low"); return 0;}
|
|
546
|
|
547 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
|
|
548 my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent);
|
|
549 unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;}
|
|
550
|
|
551 #the hairpin is retained
|
|
552 return 1;
|
|
553 }
|
|
554
|
|
555 sub test_star{
|
|
556
|
|
557 #test if a deep sequence maps in good consistence with 3' overhang
|
|
558
|
|
559 my $query=shift;
|
|
560
|
|
561 #5' begin and 3' end positions
|
|
562 my $beg=$hash_query{$query}{"subject_beg"};
|
|
563 my $end=$hash_query{$query}{"subject_end"};
|
|
564
|
|
565 #the difference between observed and expected begin positions must be 0 or 1
|
|
566 my $offset=$beg-$hash_comp{"star_beg"};
|
|
567 if($offset==0 or $offset==1 or $offset==-1){return 1;}
|
|
568
|
|
569 return 0;
|
|
570 }
|
|
571
|
|
572
|
|
573
|
|
574 sub test_query{
|
|
575
|
|
576 #test if deep sequence maps in consistence with Dicer processing
|
|
577
|
|
578 my $query=shift;
|
|
579
|
|
580 #begin, end, strand and read count
|
|
581 my $beg=$hash_query{$query}{"subject_beg"};
|
|
582 my $end=$hash_query{$query}{"subject_end"};
|
|
583 my $strand=$hash_query{$query}{"strand"};
|
|
584 my $freq=$hash_query{$query}{"freq"};
|
|
585
|
|
586 #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs)
|
|
587 if($strand eq '-'){return 0;}
|
|
588
|
|
589 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
|
|
590 my $fuzz_beg=2;
|
|
591 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
|
|
592 my $fuzz_end=2;
|
|
593
|
|
594 #if in accordance with Dicer processing, return the type of Dicer product
|
|
595 if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";}
|
|
596 if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";}
|
|
597 if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";}
|
|
598
|
|
599 #if not in accordance, return 0
|
|
600 return 0;
|
|
601 }
|
|
602
|
|
603
|
|
604 sub pass_filtering_structure{
|
|
605
|
|
606 #The potential precursor must form a hairpin with miRNA precursor-like characteristics
|
|
607
|
|
608 #return value
|
|
609 my $ret=1;
|
|
610
|
|
611 #potential mature, star, loop and lower flank parts must be identifiable
|
|
612 unless(test_components()){return 0;}
|
|
613
|
|
614 #no bifurcations
|
|
615 unless(no_bifurcations_precursor()){$ret=0;}
|
|
616
|
|
617 #minimum 14 base pairings in duplex
|
|
618 unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");}
|
|
619
|
|
620 #not more than 6 nt difference between mature and star length
|
|
621 unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") }
|
|
622
|
|
623 return $ret;
|
|
624 }
|
|
625
|
|
626
|
|
627
|
|
628
|
|
629
|
|
630
|
|
631 sub test_components{
|
|
632
|
|
633 #tests whether potential mature, star, loop and lower flank parts are identifiable
|
|
634
|
|
635 unless($hash_comp{"mature_struct"}){
|
|
636 filter_s("no mature");
|
|
637 # print STDERR "no mature\n";
|
|
638 return 0;
|
|
639 }
|
|
640
|
|
641 unless($hash_comp{"star_struct"}){
|
|
642 filter_s("no star");
|
|
643 # print STDERR "no star\n";
|
|
644 return 0;
|
|
645 }
|
|
646
|
|
647 unless($hash_comp{"loop_struct"}){
|
|
648 filter_s("no loop");
|
|
649 # print STDERR "no loop\n";
|
|
650 return 0;
|
|
651 }
|
|
652
|
|
653 unless($hash_comp{"flank_first_struct"}){
|
|
654 filter_s("no flanks");
|
|
655 #print STDERR "no flanks_first_struct\n";
|
|
656 return 0;
|
|
657 }
|
|
658
|
|
659 unless($hash_comp{"flank_second_struct"}){
|
|
660 filter_s("no flanks");
|
|
661 # print STDERR "no flanks_second_struct\n";
|
|
662 return 0;
|
|
663 }
|
|
664 return 1;
|
|
665 }
|
|
666
|
|
667
|
|
668
|
|
669
|
|
670
|
|
671 sub no_bifurcations_precursor{
|
|
672
|
|
673 #tests whether there are bifurcations in the hairpin
|
|
674
|
|
675 #assembles the potential precursor sequence and structure from the expected Dicer products
|
|
676 #this is the expected biological precursor, in contrast with 'pri_seq' that includes
|
|
677 #some genomic flanks on both sides
|
|
678
|
|
679 my $pre_struct;
|
|
680 my $pre_seq;
|
|
681 if($hash_comp{"mature_arm"} eq "first"){
|
|
682 $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"};
|
|
683 $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"};
|
|
684 }else{
|
|
685 $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"};
|
|
686 $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"};
|
|
687 }
|
|
688
|
|
689 #read into hash
|
|
690 $hash_comp{"pre_struct"}=$pre_struct;
|
|
691 $hash_comp{"pre_seq"}=$pre_seq;
|
|
692
|
|
693 #simple pattern matching checks for bifurcations
|
|
694 unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){
|
|
695 filter_s("bifurcation in precursor");
|
|
696 # print STDERR "bifurcation in precursor\n";
|
|
697 return 0;
|
|
698 }
|
|
699
|
|
700 return 1;
|
|
701 }
|
|
702
|
|
703 sub bp_precursor{
|
|
704
|
|
705 #total number of bps in the precursor
|
|
706
|
|
707 my $pre_struct=$hash_comp{"pre_struct"};
|
|
708
|
|
709 #simple pattern matching
|
|
710 my $pre_bps=0;
|
|
711 while($pre_struct=~/\(/g){
|
|
712 $pre_bps++;
|
|
713 }
|
|
714 return $pre_bps;
|
|
715 }
|
|
716
|
|
717
|
|
718 sub bp_duplex{
|
|
719
|
|
720 #total number of bps in the duplex
|
|
721
|
|
722 my $duplex_bps=0;
|
|
723 my $mature_struct=$hash_comp{"mature_struct"};
|
|
724
|
|
725 #simple pattern matching
|
|
726 while($mature_struct=~/(\(|\))/g){
|
|
727 $duplex_bps++;
|
|
728 }
|
|
729 return $duplex_bps;
|
|
730 }
|
|
731
|
|
732 sub diff_lng{
|
|
733
|
|
734 #find difference between mature and star lengths
|
|
735
|
|
736 my $mature_lng=length $hash_comp{"mature_struct"};
|
|
737 my $star_lng=length $hash_comp{"star_struct"};
|
|
738 my $diff_lng=$mature_lng-$star_lng;
|
|
739 return $diff_lng;
|
|
740 }
|
|
741
|
|
742
|
|
743
|
|
744 sub do_test_assemble{
|
|
745
|
|
746 # not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks)
|
|
747 # is identical to 'pri_struct' before disassembly into parts
|
|
748
|
|
749 my $assemble_struct;
|
|
750
|
|
751 if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){
|
|
752 if($hash_comp{"mature_arm"} eq "first"){
|
|
753 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"};
|
|
754 }else{
|
|
755 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"};
|
|
756 }
|
|
757 unless($assemble_struct eq $hash_comp{"pri_struct"}){
|
|
758 $hash_comp{"test_assemble"}=$assemble_struct;
|
|
759 print_hash_comp();
|
|
760 }
|
|
761 }
|
|
762 return;
|
|
763 }
|
|
764
|
|
765
|
|
766
|
|
767 sub fill_structure{
|
|
768
|
|
769 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired
|
|
770
|
|
771 my $struct=$hash_struct{$subject_old};
|
|
772 my $lng=length $struct;
|
|
773
|
|
774 #local stack for keeping track of basepairings
|
|
775 my @bps;
|
|
776
|
|
777 for(my $pos=1;$pos<=$lng;$pos++){
|
|
778 my $struct_pos=excise_struct($struct,$pos,$pos,"+");
|
|
779
|
|
780 if($struct_pos eq "("){
|
|
781 push(@bps,$pos);
|
|
782 }
|
|
783
|
|
784 if($struct_pos eq ")"){
|
|
785 my $pos_prev=pop(@bps);
|
|
786 $hash_bp{$pos_prev}=$pos;
|
|
787 $hash_bp{$pos}=$pos_prev;
|
|
788 }
|
|
789 }
|
|
790 return;
|
|
791 }
|
|
792
|
|
793
|
|
794
|
|
795 sub fill_star{
|
|
796
|
|
797 #fills specifics on the expected star strand into 'comp' hash ('component' hash)
|
|
798
|
|
799 #if the mature sequence is not plausible, don't look for the star arm
|
|
800 my $mature_arm=$hash_comp{"mature_arm"};
|
|
801 unless($mature_arm){$hash_comp{"star_arm"}=0; return;}
|
|
802
|
|
803 #if the star sequence is not plausible, don't fill into the hash
|
|
804 my($star_beg,$star_end)=find_star();
|
|
805 my $star_arm=arm_star($star_beg,$star_end);
|
|
806 unless($star_arm){return;}
|
|
807
|
|
808 #excise expected star sequence and structure
|
|
809 my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+");
|
|
810 my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+");
|
|
811
|
|
812 #fill into hash
|
|
813 $hash_comp{"star_beg"}=$star_beg;
|
|
814 $hash_comp{"star_end"}=$star_end;
|
|
815 $hash_comp{"star_seq"}=$star_seq;
|
|
816 $hash_comp{"star_struct"}=$star_struct;
|
|
817 $hash_comp{"star_arm"}=$star_arm;
|
|
818
|
|
819 return;
|
|
820 }
|
|
821
|
|
822
|
|
823 sub find_star{
|
|
824
|
|
825 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions
|
|
826
|
|
827 #the -2 is for the overhang
|
|
828 my $mature_beg=$hash_comp{"mature_beg"};
|
|
829 my $mature_end=$hash_comp{"mature_end"}-2;
|
|
830 my $mature_lng=$mature_end-$mature_beg+1;
|
|
831
|
|
832 #in some cases, the last nucleotide of the mature sequence does not form a base pair,
|
|
833 #and therefore does not basepair with the first nucleotide of the star sequence.
|
|
834 #In this case, the algorithm searches for the last nucleotide of the mature sequence
|
|
835 #to form a base pair. The offset is the number of nucleotides searched through.
|
|
836 my $offset_star_beg=0;
|
|
837 my $offset_beg=0;
|
|
838
|
|
839 #the offset should not be longer than the length of the mature sequence, then it
|
|
840 #means that the mature sequence does not form any base pairs
|
|
841 while(!$offset_star_beg and $offset_beg<$mature_lng){
|
|
842 if($hash_bp{$mature_end-$offset_beg}){
|
|
843 $offset_star_beg=$hash_bp{$mature_end-$offset_beg};
|
|
844 }else{
|
|
845 $offset_beg++;
|
|
846 }
|
|
847 }
|
|
848 #when defining the beginning of the star sequence, compensate for the offset
|
|
849 my $star_beg=$offset_star_beg-$offset_beg;
|
|
850
|
|
851 #same as above
|
|
852 my $offset_star_end=0;
|
|
853 my $offset_end=0;
|
|
854 while(!$offset_star_end and $offset_end<$mature_lng){
|
|
855 if($hash_bp{$mature_beg+$offset_end}){
|
|
856 $offset_star_end=$hash_bp{$mature_beg+$offset_end};
|
|
857 }else{
|
|
858 $offset_end++;
|
|
859 }
|
|
860 }
|
|
861 #the +2 is for the overhang
|
|
862 my $star_end=$offset_star_end+$offset_end+2;
|
|
863
|
|
864 return($star_beg,$star_end);
|
|
865 }
|
|
866
|
|
867
|
|
868 sub fill_pri{
|
|
869
|
|
870 #fills basic specifics on the precursor into the 'comp' hash
|
|
871
|
|
872 my $seq=$hash_seq{$subject_old};
|
|
873 my $struct=$hash_struct{$subject_old};
|
|
874 my $mfe=$hash_mfe{$subject_old};
|
|
875 my $length=length $seq;
|
|
876
|
|
877 $hash_comp{"pri_id"}=$subject_old;
|
|
878 $hash_comp{"pri_seq"}=$seq;
|
|
879 $hash_comp{"pri_struct"}=$struct;
|
|
880 $hash_comp{"pri_mfe"}=$mfe;
|
|
881 $hash_comp{"pri_beg"}=1;
|
|
882 $hash_comp{"pri_end"}=$length;
|
|
883
|
|
884 return;
|
|
885 }
|
|
886
|
|
887
|
|
888 sub fill_mature{
|
|
889
|
|
890 #fills specifics on the mature sequence into the 'comp' hash
|
|
891
|
|
892 my $mature_query=find_mature_query();
|
|
893 my($mature_beg,$mature_end)=find_positions_query($mature_query);
|
|
894 my $mature_strand=find_strand_query($mature_query);
|
|
895 my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand);
|
|
896 my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand);
|
|
897 my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand);
|
|
898
|
|
899 $hash_comp{"mature_query"}=$mature_query;
|
|
900 $hash_comp{"mature_beg"}=$mature_beg;
|
|
901 $hash_comp{"mature_end"}=$mature_end;
|
|
902 $hash_comp{"mature_strand"}=$mature_strand;
|
|
903 $hash_comp{"mature_struct"}=$mature_struct;
|
|
904 $hash_comp{"mature_seq"}=$mature_seq;
|
|
905 $hash_comp{"mature_arm"}=$mature_arm;
|
|
906
|
|
907 return;
|
|
908 }
|
|
909
|
|
910
|
|
911
|
|
912 sub fill_loop{
|
|
913
|
|
914 #fills specifics on the loop sequence into the 'comp' hash
|
|
915
|
|
916 #unless both mature and star sequences are plausible, do not look for the loop
|
|
917 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
|
|
918
|
|
919 my $loop_beg;
|
|
920 my $loop_end;
|
|
921
|
|
922 #defining the begin and end positions of the loop from the mature and star positions
|
|
923 #excision depends on whether the mature or star sequence is 5' of the loop ('first')
|
|
924 if($hash_comp{"mature_arm"} eq "first"){
|
|
925 $loop_beg=$hash_comp{"mature_end"}+1;
|
|
926 }else{
|
|
927 $loop_end=$hash_comp{"mature_beg"}-1;
|
|
928 }
|
|
929
|
|
930 if($hash_comp{"star_arm"} eq "first"){
|
|
931 $loop_beg=$hash_comp{"star_end"}+1;
|
|
932 }else{
|
|
933 $loop_end=$hash_comp{"star_beg"}-1;
|
|
934 }
|
|
935
|
|
936 #unless the positions are plausible, do not fill into hash
|
|
937 unless(test_loop($loop_beg,$loop_end)){return;}
|
|
938
|
|
939 my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+");
|
|
940 my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+");
|
|
941
|
|
942 $hash_comp{"loop_beg"}=$loop_beg;
|
|
943 $hash_comp{"loop_end"}=$loop_end;
|
|
944 $hash_comp{"loop_seq"}=$loop_seq;
|
|
945 $hash_comp{"loop_struct"}=$loop_struct;
|
|
946
|
|
947 return;
|
|
948 }
|
|
949
|
|
950
|
|
951 sub fill_lower_flanks{
|
|
952
|
|
953 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash
|
|
954
|
|
955 #unless both mature and star sequences are plausible, do not look for the flanks
|
|
956 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
|
|
957
|
|
958 my $flank_first_end;
|
|
959 my $flank_second_beg;
|
|
960
|
|
961 #defining the begin and end positions of the flanks from the mature and star positions
|
|
962 #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first')
|
|
963 if($hash_comp{"mature_arm"} eq "first"){
|
|
964 $flank_first_end=$hash_comp{"mature_beg"}-1;
|
|
965 }else{
|
|
966 $flank_second_beg=$hash_comp{"mature_end"}+1;
|
|
967 }
|
|
968
|
|
969 if($hash_comp{"star_arm"} eq "first"){
|
|
970 $flank_first_end=$hash_comp{"star_beg"}-1;
|
|
971 }else{
|
|
972 $flank_second_beg=$hash_comp{"star_end"}+1;
|
|
973 }
|
|
974
|
|
975 #unless the positions are plausible, do not fill into hash
|
|
976 unless(test_flanks($flank_first_end,$flank_second_beg)){return;}
|
|
977
|
|
978 $hash_comp{"flank_first_end"}=$flank_first_end;
|
|
979 $hash_comp{"flank_second_beg"}=$flank_second_beg;
|
|
980 $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
|
|
981 $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
|
|
982 $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
|
|
983 $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
|
|
984
|
|
985 if($options{z}){
|
|
986 fill_stems_drosha();
|
|
987 }
|
|
988
|
|
989 return;
|
|
990 }
|
|
991
|
|
992
|
|
993 sub fill_stems_drosha{
|
|
994
|
|
995 #scores the number of base pairings formed by the first ten nt of the lower stems
|
|
996 #in general, the more stems, the higher the score contribution
|
|
997 #warning: this options has not been thoroughly tested
|
|
998
|
|
999 my $flank_first_struct=$hash_comp{"flank_first_struct"};
|
|
1000 my $flank_second_struct=$hash_comp{"flank_second_struct"};
|
|
1001
|
|
1002 my $stem_first=substr($flank_first_struct,-10);
|
|
1003 my $stem_second=substr($flank_second_struct,0,10);
|
|
1004
|
|
1005 my $stem_bp_first=0;
|
|
1006 my $stem_bp_second=0;
|
|
1007
|
|
1008 #find base pairings by simple pattern matching
|
|
1009 while($stem_first=~/\(/g){
|
|
1010 $stem_bp_first++;
|
|
1011 }
|
|
1012
|
|
1013 while($stem_second=~/\)/g){
|
|
1014 $stem_bp_second++;
|
|
1015 }
|
|
1016
|
|
1017 my $stem_bp=min2($stem_bp_first,$stem_bp_second);
|
|
1018
|
|
1019 $hash_comp{"stem_first"}=$stem_first;
|
|
1020 $hash_comp{"stem_second"}=$stem_second;
|
|
1021 $hash_comp{"stem_bp_first"}=$stem_bp_first;
|
|
1022 $hash_comp{"stem_bp_second"}=$stem_bp_second;
|
|
1023 $hash_comp{"stem_bp"}=$stem_bp;
|
|
1024
|
|
1025 return;
|
|
1026 }
|
|
1027
|
|
1028
|
|
1029
|
|
1030
|
|
1031 sub arm_mature{
|
|
1032
|
|
1033 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
|
|
1034
|
|
1035 my ($beg,$end,$strand)=@_;
|
|
1036
|
|
1037 #mature and star sequences should alway be on plus strand
|
|
1038 if($strand eq "-"){return 0;}
|
|
1039
|
|
1040 #there should be no bifurcations and minimum one base pairing
|
|
1041 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
|
|
1042 if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){
|
|
1043 return "first";
|
|
1044 }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){
|
|
1045 return "second";
|
|
1046 }
|
|
1047 return 0;
|
|
1048 }
|
|
1049
|
|
1050
|
|
1051 sub arm_star{
|
|
1052
|
|
1053 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
|
|
1054
|
|
1055 my ($beg,$end)=@_;
|
|
1056
|
|
1057 #unless the begin and end positions are plausible, test negative
|
|
1058 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
|
|
1059
|
|
1060 #no overlap between the mature and the star sequence
|
|
1061 if($hash_comp{"mature_arm"} eq "first"){
|
|
1062 ($hash_comp{"mature_end"}<$beg) or return 0;
|
|
1063 }elsif($hash_comp{"mature_arm"} eq "second"){
|
|
1064 ($end<$hash_comp{"mature_beg"}) or return 0;
|
|
1065 }
|
|
1066
|
|
1067 #there should be no bifurcations and minimum one base pairing
|
|
1068 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+");
|
|
1069 if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){
|
|
1070 return "first";
|
|
1071 }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){
|
|
1072 return "second";
|
|
1073 }
|
|
1074 return 0;
|
|
1075 }
|
|
1076
|
|
1077
|
|
1078 sub test_loop{
|
|
1079
|
|
1080 #tests the loop positions
|
|
1081
|
|
1082 my ($beg,$end)=@_;
|
|
1083
|
|
1084 #unless the begin and end positions are plausible, test negative
|
|
1085 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
|
|
1086
|
|
1087 return 1;
|
|
1088 }
|
|
1089
|
|
1090
|
|
1091 sub test_flanks{
|
|
1092
|
|
1093 #tests the positions of the lower flanks
|
|
1094
|
|
1095 my ($beg,$end)=@_;
|
|
1096
|
|
1097 #unless the begin and end positions are plausible, test negative
|
|
1098 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
|
|
1099
|
|
1100 return 1;
|
|
1101 }
|
|
1102
|
|
1103
|
|
1104 sub comp{
|
|
1105
|
|
1106 #subroutine to retrive from the 'comp' hash
|
|
1107
|
|
1108 my $type=shift;
|
|
1109 my $component=$hash_comp{$type};
|
|
1110 return $component;
|
|
1111 }
|
|
1112
|
|
1113
|
|
1114 sub find_strand_query{
|
|
1115
|
|
1116 #subroutine to find the strand for a given query
|
|
1117
|
|
1118 my $query=shift;
|
|
1119 my $strand=$hash_query{$query}{"strand"};
|
|
1120 return $strand;
|
|
1121 }
|
|
1122
|
|
1123
|
|
1124 sub find_positions_query{
|
|
1125
|
|
1126 #subroutine to find the begin and end positions for a given query
|
|
1127
|
|
1128 my $query=shift;
|
|
1129 my $beg=$hash_query{$query}{"subject_beg"};
|
|
1130 my $end=$hash_query{$query}{"subject_end"};
|
|
1131 return ($beg,$end);
|
|
1132 }
|
|
1133
|
|
1134
|
|
1135
|
|
1136 sub find_mature_query{
|
|
1137
|
|
1138 #finds the query with the highest frequency of reads and returns it
|
|
1139 #is used to determine the positions of the potential mature sequence
|
|
1140
|
|
1141 my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query;
|
|
1142 my $mature_query=$queries[0];
|
|
1143 return $mature_query;
|
|
1144 }
|
|
1145
|
|
1146
|
|
1147
|
|
1148
|
|
1149 sub reset_variables{
|
|
1150
|
|
1151 #resets the hashes for the next potential precursor
|
|
1152
|
|
1153 # %hash_query=();
|
|
1154 # %hash_comp=();
|
|
1155 # %hash_bp=();
|
|
1156 foreach my $key (keys %hash_query) {delete($hash_query{$key});}
|
|
1157 foreach my $key (keys %hash_comp) {delete($hash_comp{$key});}
|
|
1158 foreach my $key (keys %hash_bp) {delete($hash_bp{$key});}
|
|
1159
|
|
1160 # $message_filter=();
|
|
1161 # $message_score=();
|
|
1162 # $lines=();
|
|
1163 undef($message_filter);
|
|
1164 undef($message_score);
|
|
1165 undef($lines);
|
|
1166 return;
|
|
1167 }
|
|
1168
|
|
1169
|
|
1170
|
|
1171 sub excise_seq{
|
|
1172
|
|
1173 #excise sub sequence from the potential precursor
|
|
1174
|
|
1175 my($seq,$beg,$end,$strand)=@_;
|
|
1176
|
|
1177 #begin can be equal to end if only one nucleotide is excised
|
|
1178 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
|
|
1179
|
|
1180 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
|
|
1181 #this happens once appr. every two thousand combinations
|
|
1182 unless($beg<=length($seq)){$out_of_bound++;return 0;}
|
|
1183
|
|
1184 #if on the minus strand, the reverse complement should be excised
|
|
1185 if($strand eq "-"){$seq=revcom($seq);}
|
|
1186
|
|
1187 #the blast parsed format is 1-indexed, substr is 0-indexed
|
|
1188 my $sub_seq=substr($seq,$beg-1,$end-$beg+1);
|
|
1189
|
|
1190 return $sub_seq;
|
|
1191
|
|
1192 }
|
|
1193
|
|
1194 sub excise_struct{
|
|
1195
|
|
1196 #excise sub structure
|
|
1197
|
|
1198 my($struct,$beg,$end,$strand)=@_;
|
|
1199 my $lng=length $struct;
|
|
1200
|
|
1201 #begin can be equal to end if only one nucleotide is excised
|
|
1202 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
|
|
1203
|
|
1204 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
|
|
1205 #this happens once appr. every two thousand combinations
|
|
1206 unless($beg<=length($struct)){return 0;}
|
|
1207
|
|
1208 #if excising relative to minus strand, positions are reversed
|
|
1209 if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}
|
|
1210
|
|
1211 #the blast parsed format is 1-indexed, substr is 0-indexed
|
|
1212 my $sub_struct=substr($struct,$beg-1,$end-$beg+1);
|
|
1213
|
|
1214 return $sub_struct;
|
|
1215 }
|
|
1216
|
|
1217
|
|
1218 sub create_hash_nuclei{
|
|
1219 #parses a fasta file with sequences of known miRNAs considered for conservation purposes
|
|
1220 #reads the nuclei into a hash
|
|
1221
|
|
1222 my ($file) = @_;
|
|
1223 my ($id, $desc, $sequence, $nucleus) = ();
|
|
1224
|
|
1225 open (FASTA, "<$file") or die "can not open $file\n";
|
|
1226 while (<FASTA>)
|
|
1227 {
|
|
1228 chomp;
|
|
1229 if (/^>(\S+)(.*)/)
|
|
1230 {
|
|
1231 $id = $1;
|
|
1232 $desc = $2;
|
|
1233 $sequence = "";
|
|
1234 $nucleus = "";
|
|
1235 while (<FASTA>){
|
|
1236 chomp;
|
|
1237 if (/^>(\S+)(.*)/){
|
|
1238 $nucleus = substr($sequence,1,$nucleus_lng);
|
|
1239 $nucleus =~ tr/[T]/[U]/;
|
|
1240 $hash_mirs{$nucleus} .="$id\t";
|
|
1241 $hash_nuclei{$nucleus} += 1;
|
|
1242
|
|
1243 $id = $1;
|
|
1244 $desc = $2;
|
|
1245 $sequence = "";
|
|
1246 $nucleus = "";
|
|
1247 next;
|
|
1248 }
|
|
1249 $sequence .= $_;
|
|
1250 }
|
|
1251 }
|
|
1252 }
|
|
1253 $nucleus = substr($sequence,1,$nucleus_lng);
|
|
1254 $nucleus =~ tr/[T]/[U]/;
|
|
1255 $hash_mirs{$nucleus} .="$id\t";
|
|
1256 $hash_nuclei{$nucleus} += 1;
|
|
1257 close FASTA;
|
|
1258 }
|
|
1259
|
|
1260
|
|
1261 sub parse_file_struct{
|
|
1262 #parses the output from RNAfoldand reads it into hashes
|
|
1263 my($file) = @_;
|
|
1264 my($id,$desc,$seq,$struct,$mfe) = ();
|
|
1265 open (FILE_STRUCT, "<$file") or die "can not open $file\n";
|
|
1266 while (<FILE_STRUCT>){
|
|
1267 chomp;
|
|
1268 if (/^>(\S+)\s*(.*)/){
|
|
1269 $id= $1;
|
|
1270 $desc= $2;
|
|
1271 $seq= "";
|
|
1272 $struct= "";
|
|
1273 $mfe= "";
|
|
1274 while (<FILE_STRUCT>){
|
|
1275 chomp;
|
|
1276 if (/^>(\S+)\s*(.*)/){
|
|
1277 $hash_desc{$id} = $desc;
|
|
1278 $hash_seq{$id} = $seq;
|
|
1279 $hash_struct{$id} = $struct;
|
|
1280 $hash_mfe{$id} = $mfe;
|
|
1281 $id = $1;
|
|
1282 $desc = $2;
|
|
1283 $seq = "";
|
|
1284 $struct = "";
|
|
1285 $mfe = "";
|
|
1286 next;
|
|
1287 }
|
|
1288 if(/^\w/){
|
|
1289 tr/uU/tT/;
|
|
1290 $seq .= $_;
|
|
1291 next;
|
|
1292 }
|
|
1293 if(/((\.|\(|\))+)/){$struct .=$1;}
|
|
1294 if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;}
|
|
1295 }
|
|
1296 }
|
|
1297 }
|
|
1298 $hash_desc{$id} = $desc;
|
|
1299 $hash_seq{$id} = $seq;
|
|
1300 $hash_struct{$id} = $struct;
|
|
1301 $hash_mfe{$id} = $mfe;
|
|
1302 close FILE_STRUCT;
|
|
1303 return;
|
|
1304 }
|
|
1305
|
|
1306
|
|
1307 sub score_s{
|
|
1308
|
|
1309 #this score message is appended to the end of the string of score messages outputted for the potential precursor
|
|
1310
|
|
1311 my $message=shift;
|
|
1312 $message_score.=$message."\n";;
|
|
1313 return;
|
|
1314 }
|
|
1315
|
|
1316
|
|
1317
|
|
1318 sub score_p{
|
|
1319
|
|
1320 #this score message is appended to the beginning of the string of score messages outputted for the potential precursor
|
|
1321
|
|
1322 my $message=shift;
|
|
1323 $message_score=$message."\n".$message_score;
|
|
1324 return;
|
|
1325 }
|
|
1326
|
|
1327
|
|
1328
|
|
1329 sub filter_s{
|
|
1330
|
|
1331 #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor
|
|
1332
|
|
1333 my $message=shift;
|
|
1334 $message_filter.=$message."\n";
|
|
1335 return;
|
|
1336 }
|
|
1337
|
|
1338
|
|
1339 sub filter_p{
|
|
1340
|
|
1341 #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor
|
|
1342
|
|
1343 my $message=shift;
|
|
1344 if(defined $message_filter){$message_filter=$message."\n".$message_filter;}
|
|
1345 else{$message_filter=$message."\n";}
|
|
1346 return;
|
|
1347 }
|
|
1348
|
|
1349
|
|
1350 sub find_freq{
|
|
1351
|
|
1352 #finds the frequency of a given read query from its id.
|
|
1353
|
|
1354 my($query)=@_;
|
|
1355
|
|
1356 if($query=~/x(\d+)/i){
|
|
1357 my $freq=$1;
|
|
1358 return $freq;
|
|
1359 }else{
|
|
1360 #print STDERR "Problem with read format\n";
|
|
1361 return 0;
|
|
1362 }
|
|
1363 }
|
|
1364
|
|
1365
|
|
1366 sub print_hash_comp{
|
|
1367
|
|
1368 #prints the 'comp' hash
|
|
1369
|
|
1370 my @keys=sort keys %hash_comp;
|
|
1371 foreach my $key(@keys){
|
|
1372 my $value=$hash_comp{$key};
|
|
1373 print "$key \t$value\n";
|
|
1374 }
|
|
1375 }
|
|
1376
|
|
1377
|
|
1378
|
|
1379 sub print_hash_bp{
|
|
1380
|
|
1381 #prints the 'bp' hash
|
|
1382
|
|
1383 my @keys=sort {$a<=>$b} keys %hash_bp;
|
|
1384 foreach my $key(@keys){
|
|
1385 my $value=$hash_bp{$key};
|
|
1386 print "$key\t$value\n";
|
|
1387 }
|
|
1388 print "\n";
|
|
1389 }
|
|
1390
|
|
1391
|
|
1392
|
|
1393 sub find_strand{
|
|
1394
|
|
1395 #A subroutine to find the strand, parsing different blast formats
|
|
1396
|
|
1397 my($other)=@_;
|
|
1398
|
|
1399 my $strand="+";
|
|
1400
|
|
1401 if($other=~/-/){
|
|
1402 $strand="-";
|
|
1403 }
|
|
1404
|
|
1405 if($other=~/minus/i){
|
|
1406 $strand="-";
|
|
1407 }
|
|
1408 return($strand);
|
|
1409 }
|
|
1410
|
|
1411
|
|
1412 sub contained{
|
|
1413
|
|
1414 #Is the stretch defined by the first positions contained in the stretch defined by the second?
|
|
1415
|
|
1416 my($beg1,$end1,$beg2,$end2)=@_;
|
|
1417
|
|
1418 testbeginend($beg1,$end1,$beg2,$end2);
|
|
1419
|
|
1420 if($beg2<=$beg1 and $end1<=$end2){
|
|
1421 return 1;
|
|
1422 }else{
|
|
1423 return 0;
|
|
1424 }
|
|
1425 }
|
|
1426
|
|
1427
|
|
1428 sub testbeginend{
|
|
1429
|
|
1430 #Are the beginposition numerically smaller than the endposition for each pair?
|
|
1431
|
|
1432 my($begin1,$end1,$begin2,$end2)=@_;
|
|
1433
|
|
1434 unless($begin1<=$end1 and $begin2<=$end2){
|
|
1435 print STDERR "beg can not be larger than end for $subject_old\n";
|
|
1436 exit;
|
|
1437 }
|
|
1438 }
|
|
1439
|
|
1440
|
|
1441 sub rev_pos{
|
|
1442
|
|
1443 # The blast_parsed format always uses positions that are relative to the 5' of the given strand
|
|
1444 # This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with
|
|
1445 # the n't nucleotide on the plus strand
|
|
1446
|
|
1447 # This subroutine reverses the begin and end positions of positions of the minus strand so that they
|
|
1448 # are relative to the 5' end of the plus strand
|
|
1449
|
|
1450 my($beg,$end,$lng)=@_;
|
|
1451
|
|
1452 my $new_end=$lng-$beg+1;
|
|
1453 my $new_beg=$lng-$end+1;
|
|
1454
|
|
1455 return($new_beg,$new_end);
|
|
1456 }
|
|
1457
|
|
1458 sub round {
|
|
1459
|
|
1460 #rounds to nearest integer
|
|
1461
|
|
1462 my($number) = shift;
|
|
1463 return int($number + .5);
|
|
1464
|
|
1465 }
|
|
1466
|
|
1467
|
|
1468 sub rev{
|
|
1469
|
|
1470 #reverses the order of nucleotides in a sequence
|
|
1471
|
|
1472 my($sequence)=@_;
|
|
1473
|
|
1474 my $rev=reverse $sequence;
|
|
1475
|
|
1476 return $rev;
|
|
1477 }
|
|
1478
|
|
1479 sub com{
|
|
1480
|
|
1481 #the complementary of a sequence
|
|
1482
|
|
1483 my($sequence)=@_;
|
|
1484
|
|
1485 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
|
|
1486
|
|
1487 return $sequence;
|
|
1488 }
|
|
1489
|
|
1490 sub revcom{
|
|
1491
|
|
1492 #reverse complement
|
|
1493
|
|
1494 my($sequence)=@_;
|
|
1495
|
|
1496 my $revcom=rev(com($sequence));
|
|
1497
|
|
1498 return $revcom;
|
|
1499 }
|
|
1500
|
|
1501
|
|
1502 sub max2 {
|
|
1503
|
|
1504 #max of two numbers
|
|
1505
|
|
1506 my($a, $b) = @_;
|
|
1507 return ($a>$b ? $a : $b);
|
|
1508 }
|
|
1509
|
|
1510 sub min2 {
|
|
1511
|
|
1512 #min of two numbers
|
|
1513
|
|
1514 my($a, $b) = @_;
|
|
1515 return ($a<$b ? $a : $b);
|
|
1516 }
|
|
1517
|
|
1518
|
|
1519
|
|
1520 sub score_freq{
|
|
1521
|
|
1522 # scores the count of reads that map to the potential precursor
|
|
1523 # Assumes geometric distribution as described in methods section of manuscript
|
|
1524
|
|
1525 my $freq=shift;
|
|
1526
|
|
1527 #parameters of known precursors and background hairpins
|
|
1528 my $parameter_test=0.999;
|
|
1529 my $parameter_control=0.6;
|
|
1530
|
|
1531 #log_odds calculated directly to avoid underflow
|
|
1532 my $intercept=log((1-$parameter_test)/(1-$parameter_control));
|
|
1533 my $slope=log($parameter_test/$parameter_control);
|
|
1534 my $log_odds=$slope*$freq+$intercept;
|
|
1535
|
|
1536 #if no strong evidence for 3' overhangs, limit the score contribution to 0
|
|
1537 unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}
|
|
1538
|
|
1539 return $log_odds;
|
|
1540 }
|
|
1541
|
|
1542
|
|
1543
|
|
1544 ##sub score_mfe{
|
|
1545
|
|
1546 # scores the minimum free energy in kCal/mol of the potential precursor
|
|
1547 # Assumes Gumbel distribution as described in methods section of manuscript
|
|
1548
|
|
1549 ## my $mfe=shift;
|
|
1550
|
|
1551 #numerical value, minimum 1
|
|
1552 ## my $mfe_adj=max2(1,-$mfe);
|
|
1553
|
|
1554 #parameters of known precursors and background hairpins, scale and location
|
|
1555 ## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
|
|
1556 ## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
|
|
1557
|
|
1558 ## my $odds=$prob_test/$prob_background;
|
|
1559 ## my $log_odds=log($odds);
|
|
1560
|
|
1561 ## return $log_odds;
|
|
1562 ##}
|
|
1563
|
|
1564 sub score_mfe{
|
|
1565 # use bignum;
|
|
1566
|
|
1567 # scores the minimum free energy in kCal/mol of the potential precursor
|
|
1568 # Assumes Gumbel distribution as described in methods section of manuscript
|
|
1569
|
|
1570 my ($mfe,$mlng)=@_;
|
|
1571
|
|
1572 #numerical value, minimum 1
|
|
1573 my $mfe_adj=max2(1,-$mfe);
|
|
1574 my $mfe_adj1=$mfe/$mlng;
|
|
1575 #parameters of known precursors and background hairpins, scale and location
|
|
1576 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
|
|
1577 my $ev=$e**($mfe_adj1*$c);
|
|
1578 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
|
|
1579 my $log_odds=($a/($b+$ev));
|
|
1580
|
|
1581
|
|
1582 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
|
|
1583 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
|
|
1584
|
|
1585 my $odds=$prob_test/$prob_background;
|
|
1586 my $log_odds_2=log($odds);
|
|
1587 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
|
|
1588 return $log_odds;
|
|
1589 }
|
|
1590
|
|
1591
|
|
1592
|
|
1593 sub prob_gumbel_discretized{
|
|
1594
|
|
1595 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
|
|
1596 # uses the subroutine that calculates the cdf to find the probabilities
|
|
1597
|
|
1598 my ($var,$scale,$location)=@_;
|
|
1599
|
|
1600 my $bound_lower=$var-0.5;
|
|
1601 my $bound_upper=$var+0.5;
|
|
1602
|
|
1603 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
|
|
1604 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);
|
|
1605
|
|
1606 my $prob=$cdf_upper-$cdf_lower;
|
|
1607
|
|
1608 return $prob;
|
|
1609 }
|
|
1610
|
|
1611
|
|
1612 sub cdf_gumbel{
|
|
1613
|
|
1614 # calculates the cumulative distribution function of the Gumbel distribution
|
|
1615
|
|
1616 my ($var,$scale,$location)=@_;
|
|
1617
|
|
1618 my $cdf=$e**(-($e**(-($var-$location)/$scale)));
|
|
1619
|
|
1620 return $cdf;
|
|
1621 }
|
|
1622
|