annotate miRDeep_plant.pl @ 50:7b5a48b972e9 draft

Uploaded
author big-tiandm
date Fri, 05 Dec 2014 00:11:02 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
50
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1 #!/usr/bin/perl
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
2
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
3 use warnings;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
4 use strict;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
5 use Getopt::Std;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
6 #use RNA;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
7
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
8
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
9 ################################# MIRDEEP #################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
10
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
11 ################################## USAGE ##################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
12
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
13
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
14 my $usage=
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
15 "$0 file_signature file_structure temp_out_directory
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
16
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
17 This is the core algorithm of miRDeep. It takes as input a file in blastparsed format with
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
18 information on the positions of reads aligned to potential precursor sequences (signature).
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
19 It also takes as input an RNAfold output file, giving information on the sequence, structure
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
20 and mimimum free energy of the potential precursor sequences.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
21
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
22 Extra arguments can be given. -s specifies a fastafile containing the known mature miRNA
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
23 sequences that should be considered for conservation purposes. -t prints out the potential
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
24 precursor sequences that do _not_ exceed the cut-off (default prints out the sequences that
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
25 exceeds the cut-off). -u gives limited output, that is only the ids of the potential precursors
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
26 that exceed the cut-off. -v varies the cut-off. -x is a sensitive option for Sanger sequences
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
27 obtained through conventional cloning. -z consider the number of base pairings in the lower
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
28 stems (this option is not well tested).
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
29
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
30 -h print this usage
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
31 -s fasta file with known miRNAs
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
32 #-o temp directory ,maked befor running the program.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
33 -t print filtered
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
34 -u limited output (only ids)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
35 -v cut-off (default 1)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
36 -x sensitive option for Sanger sequences
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
37 -y use Randfold
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
38 -z consider Drosha processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
39 ";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
40
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
41
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
42
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
43
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
44
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
45 ############################################################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
46
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
47 ################################### INPUT ##################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
48
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
49
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
50 #signature file in blast_parsed format
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
51 my $file_blast_parsed=shift or die $usage;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
52
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
53 #structure file outputted from RNAfold
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
54 my $file_struct=shift or die $usage;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
55
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
56 my $tmpdir=shift or die $usage;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
57 #options
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
58 my %options=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
59 getopts("hs:tuv:xyz",\%options);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
60
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
61
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
62
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
63
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
64
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
65
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
66 #############################################################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
67
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
68 ############################# GLOBAL VARIABLES ##############################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
69
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
70
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
71 #parameters
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
72 my $nucleus_lng=11;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
73
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
74 my $score_star=3.9;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
75 my $score_star_not=-1.3;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
76 my $score_nucleus=7.63;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
77 my $score_nucleus_not=-1.17;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
78 my $score_randfold=1.37;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
79 my $score_randfold_not=-3.624;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
80 my $score_intercept=0.3;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
81 my @scores_stem=(-3.1,-2.3,-2.2,-1.6,-1.5,0.1,0.6,0.8,0.9,0.9,0);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
82 my $score_min=1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
83 if($options{v}){$score_min=$options{v};}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
84 if($options{x}){$score_min=-5;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
85
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
86 my $e=2.718281828;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
87
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
88 #hashes
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
89 my %hash_desc;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
90 my %hash_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
91 my %hash_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
92 my %hash_mfe;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
93 my %hash_nuclei;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
94 my %hash_mirs;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
95 my %hash_query;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
96 my %hash_comp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
97 my %hash_bp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
98
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
99 #other variables
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
100 my $subject_old;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
101 my $message_filter;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
102 my $message_score;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
103 my $lines;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
104 my $out_of_bound;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
105
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
106
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
107
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
108 ##############################################################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
109
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
110 ################################ MAIN ######################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
111
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
112
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
113 #print help if that option is used
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
114 if($options{h}){die $usage;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
115 unless ($tmpdir=~/\/$/) {$tmpdir .="/";}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
116 if(!(-s $tmpdir)){mkdir $tmpdir;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
117 $tmpdir .="TMP_DIR/";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
118 mkdir $tmpdir;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
119
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
120 #parse structure file outputted from RNAfold
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
121 parse_file_struct($file_struct);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
122
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
123 #if conservation is scored, the fasta file of known miRNA sequences is parsed
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
124 if($options{s}){create_hash_nuclei($options{s})};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
125
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
126 #parse signature file in blast_parsed format and resolve each potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
127 parse_file_blast_parsed($file_blast_parsed);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
128 #`rm -rf $tmpdir`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
129 exit;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
130
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
131
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
132
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
133
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
134 ##############################################################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
135
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
136 ############################## SUBROUTINES ###################################################
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
137
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
138
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
139
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
140 sub parse_file_blast_parsed{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
141
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
142 # read through the signature blastparsed file, fills up a hash with information on queries
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
143 # (deep sequences) mapping to the current subject (potential precursor) and resolve each
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
144 # potential precursor in turn
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
145
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
146 my $file_blast_parsed=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
147
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
148 open (FILE_BLAST_PARSED, "<$file_blast_parsed") or die "can not open $file_blast_parsed\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
149 while (my $line=<FILE_BLAST_PARSED>){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
150 if($line=~/^(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\d+)\.+(\d+)\s+(\S+)\s+(\S+)\s+(\S+)\s+(.+)$/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
151 my $query=$1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
152 my $query_lng=$2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
153 my $query_beg=$3;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
154 my $query_end=$4;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
155 my $subject=$5;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
156 my $subject_lng=$6;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
157 my $subject_beg=$7;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
158 my $subject_end=$8;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
159 my $e_value=$9;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
160 my $pid=$10;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
161 my $bitscore=$11;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
162 my $other=$12;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
163
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
164 #if the new line concerns a new subject (potential precursor) then the old subject must be resolved
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
165 if($subject_old and $subject_old ne $subject){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
166 resolve_potential_precursor();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
167 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
168
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
169 #resolve the strand
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
170 my $strand=find_strand($other);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
171
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
172 #resolve the number of reads that the deep sequence represents
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
173 my $freq=find_freq($query);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
174
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
175 #read information of the query (deep sequence) into hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
176 $hash_query{$query}{"subject_beg"}=$subject_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
177 $hash_query{$query}{"subject_end"}=$subject_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
178 $hash_query{$query}{"strand"}=$strand;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
179 $hash_query{$query}{"freq"}=$freq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
180
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
181 #save the signature information
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
182 $lines.=$line;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
183
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
184 $subject_old=$subject;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
185 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
186 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
187 resolve_potential_precursor();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
188 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
189
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
190 sub resolve_potential_precursor{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
191
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
192 # dissects the potential precursor in parts by filling hashes, and tests if it passes the
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
193 # initial filter and the scoring filter
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
194
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
195 # binary variable whether the potential precursor is still viable
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
196 my $ret=1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
197 #print STDERR ">$subject_old\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
198
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
199 fill_structure();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
200 #print STDERR "\%hash_bp",scalar keys %hash_bp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
201 fill_pri();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
202 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
203
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
204 fill_mature();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
205 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
206
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
207 fill_star();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
208 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
209
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
210 fill_loop();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
211 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
212
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
213 fill_lower_flanks();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
214 #print STDERR "\%hash_comp",scalar keys %hash_comp,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
215
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
216 # do_test_assemble();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
217
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
218 # this is the actual classification
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
219 unless(pass_filtering_initial() and pass_threshold_score()){$ret=0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
220
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
221 print_results($ret);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
222
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
223 reset_variables();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
224
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
225 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
226
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
227 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
228
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
229
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
230
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
231 sub print_results{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
232
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
233 my $ret=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
234
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
235 # print out if the precursor is accepted and accepted precursors should be printed out
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
236 # or if the potential precursor is discarded and discarded potential precursors should
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
237 # be printed out
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
238
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
239 if((!$options{t} and $ret) or ($options{t} and !$ret)){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
240 #full output
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
241 unless($options{u}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
242 if($message_filter){print $message_filter;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
243 if($message_score){print $message_score;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
244 print_hash_comp();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
245 print $lines,"\n\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
246 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
247 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
248 #limited output (only ids)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
249 my $id=$hash_comp{"pri_id"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
250 print "$id\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
251 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
252 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
253
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
254
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
255
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
256
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
257
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
258
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
259
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
260 sub pass_threshold_score{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
261
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
262 # this is the scoring
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
263
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
264 #minimum free energy of the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
265 # my $score_mfe=score_mfe($hash_comp{"pri_mfe"});
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
266 my $score_mfe=score_mfe($hash_comp{"pri_mfe"},$hash_comp{"pri_end"});
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
267
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
268 #count of reads that map in accordance with Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
269 my $score_freq=score_freq($hash_comp{"freq"});
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
270 #print STDERR "score_mfe: $score_mfe\nscore_freq: $score_freq\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
271
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
272 #basic score
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
273 my $score=$score_mfe+$score_freq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
274
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
275 #scoring of conserved nucleus/seed (optional)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
276 if($options{s}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
277
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
278 #if the nucleus is conserved
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
279 if(test_nucleus_conservation()){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
280
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
281 #nucleus from position 2-8
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
282 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
283
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
284 #resolve DNA/RNA ambiguities
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
285 $nucleus=~tr/[T]/[U]/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
286
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
287 #print score contribution
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
288 score_s("score_nucleus\t$score_nucleus");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
289
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
290 #print the ids of known miRNAs with same nucleus
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
291 score_s("$hash_mirs{$nucleus}");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
292 #print STDERR "score_nucleus\t$score_nucleus\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
293
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
294 #add to score
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
295 $score+=$score_nucleus;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
296
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
297 #if the nucleus is not conserved
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
298 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
299 #print (negative) score contribution
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
300 score_s("score_nucleus\t$score_nucleus_not");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
301
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
302 #add (negative) score contribution
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
303 $score+=$score_nucleus_not;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
304 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
305 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
306
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
307 #if the majority of potential star reads fall as expected from Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
308 if($hash_comp{"star_read"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
309 score_s("score_star\t$score_star");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
310 #print STDERR "score_star\t$score_star\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
311 $score+=$score_star;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
312 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
313 score_s("score_star\t$score_star_not");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
314 #print STDERR "score_star_not\t$score_star_not\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
315 $score+=$score_star_not;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
316 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
317
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
318 #score lower stems for potential for Drosha recognition (highly optional)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
319 if($options{z}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
320 my $stem_bp=$hash_comp{"stem_bp"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
321 my $score_stem=$scores_stem[$stem_bp];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
322 $score+=$score_stem;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
323 score_s("score_stem\t$score_stem");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
324 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
325
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
326 #print STDERR "score_intercept\t$score_intercept\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
327
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
328 $score+=$score_intercept;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
329
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
330 #score for randfold (optional)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
331 if($options{y}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
332
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
333 # only calculate randfold value if it can make the difference between the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
334 # being accepted or discarded
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
335 if($score+$score_randfold>=$score_min and $score+$score_randfold_not<=$score_min){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
336
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
337 #randfold value<0.05
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
338 if(test_randfold()){$score+=$score_randfold;score_s("score_randfold\t$score_randfold");}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
339
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
340 #randfold value>0.05
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
341 else{$score+=$score_randfold_not;score_s("score_randfold\t$score_randfold_not");}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
342 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
343 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
344
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
345 #round off values to one decimal
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
346 my $round_mfe=round($score_mfe*10)/10;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
347 my $round_freq=round($score_freq*10)/10;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
348 my $round=round($score*10)/10;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
349
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
350 #print scores
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
351 score_s("score_mfe\t$round_mfe\nscore_freq\t$round_freq\nscore\t$round");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
352
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
353 #return 1 if the potential precursor is accepted, return 0 if discarded
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
354 unless($score>=$score_min){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
355 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
356 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
357
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
358 sub test_randfold{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
359
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
360 #print sequence to temporary file, test randfold value, return 1 or 0
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
361
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
362 # print_file("pri_seq.fa",">pri_seq\n".$hash_comp{"pri_seq"});
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
363 #my $tmpfile=$tmpdir.$hash_comp{"pri_id"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
364 #open(FILE, ">$tmpfile");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
365 #print FILE ">pri_seq\n",$hash_comp{"pri_seq"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
366 #close FILE;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
367
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
368 # my $p_value=`randfold -s $tmpfile 999 | cut -f 3`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
369 #my $p1=`randfold -s $tmpfile 999 | cut -f 3`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
370 #my $p2=`randfold -s $tmpfile 999 | cut -f 3`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
371 my $p1=&randfold_pvalue($hash_comp{"pri_seq"},999);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
372 my $p2=&randfold_pvalue($hash_comp{"pri_seq"},999);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
373 my $p_value=($p1+$p2)/2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
374 wait;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
375 # system "rm $tmpfile";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
376
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
377 if($p_value<=0.05){return 1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
378
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
379 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
380 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
381
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
382 sub randfold_pvalue{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
383 my $cpt_sup = 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
384 my $cpt_inf = 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
385 my $cpt_ega = 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
386
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
387 my ($seq,$number_of_randomizations)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
388 #my $str =$seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
389 #my $mfe = RNA::fold($seq,$str);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
390 my $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
391 my @rawfolds=split/\s+/,$rnafold;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
392 my $str=$rawfolds[1];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
393 my $mfe=$rawfolds[-1];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
394 $mfe=~s/\(//;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
395 $mfe=~s/\)//;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
396
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
397 for (my $i=0;$i<$number_of_randomizations;$i++) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
398 $seq = shuffle_sequence_dinucleotide($seq);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
399 #$str = $seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
400
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
401 #my $rand_mfe = RNA::fold($str,$str);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
402 $rnafold=`perl -e 'print "$seq"' | RNAfold --noPS`;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
403 my @rawfolds=split/\s+/,$rnafold;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
404 my $str=$rawfolds[1];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
405 my $rand_mfe=$rawfolds[-1];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
406 $rand_mfe=~s/\(//;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
407 $rand_mfe=~s/\)//;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
408
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
409 if ($rand_mfe < $mfe) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
410 $cpt_inf++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
411 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
412 if ($rand_mfe == $mfe) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
413 $cpt_ega++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
414 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
415 if ($rand_mfe > $mfe) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
416 $cpt_sup++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
417 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
418 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
419
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
420 my $proba = ($cpt_ega + $cpt_inf) / ($number_of_randomizations + 1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
421
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
422 #print "$name\t$mfe\t$proba\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
423 return $proba;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
424 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
425
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
426 sub shuffle_sequence_dinucleotide {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
427
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
428 my ($str) = @_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
429
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
430 # upper case and convert to ATGC
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
431 $str = uc($str);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
432 $str =~ s/U/T/g;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
433
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
434 my @nuc = ('A','T','G','C');
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
435 my $count_swap = 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
436 # set maximum number of permutations
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
437 my $stop = length($str) * 10;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
438
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
439 while($count_swap < $stop) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
440
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
441 my @pos;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
442
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
443 # look start and end letters
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
444 my $firstnuc = $nuc[int(rand 4)];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
445 my $thirdnuc = $nuc[int(rand 4)];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
446
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
447 # get positions for matching nucleotides
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
448 for (my $i=0;$i<(length($str)-2);$i++) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
449 if ((substr($str,$i,1) eq $firstnuc) && (substr($str,$i+2,1) eq $thirdnuc)) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
450 push (@pos,($i+1));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
451 $i++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
452 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
453 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
454
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
455 # swap at random trinucleotides
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
456 my $max = scalar(@pos);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
457 for (my $i=0;$i<$max;$i++) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
458 my $swap = int(rand($max));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
459 if ((abs($pos[$swap] - $pos[$i]) >= 3) && (substr($str,$pos[$i],1) ne substr($str,$pos[$swap],1))) {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
460 $count_swap++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
461 my $w1 = substr($str,$pos[$i],1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
462 my $w2 = substr($str,$pos[$swap],1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
463 substr($str,$pos[$i],1,$w2);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
464 substr($str,$pos[$swap],1,$w1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
465 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
466 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
467 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
468 return($str);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
469 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
470
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
471 sub test_nucleus_conservation{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
472
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
473 #test if nucleus is identical to nucleus from known miRNA, return 1 or 0
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
474
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
475 my $nucleus=substr($hash_comp{"mature_seq"},1,$nucleus_lng);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
476 $nucleus=~tr/[T]/[U]/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
477 if($hash_nuclei{$nucleus}){return 1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
478
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
479 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
480 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
481
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
482
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
483
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
484 sub pass_filtering_initial{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
485
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
486 #test if the structure forms a plausible hairpin
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
487 unless(pass_filtering_structure()){filter_p("structure problem"); return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
488
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
489 #test if >90% of reads map to the hairpin in consistence with Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
490 unless(pass_filtering_signature()){filter_p("signature problem");return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
491
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
492 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
493
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
494 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
495
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
496
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
497 sub pass_filtering_signature{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
498
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
499 #number of reads that map in consistence with Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
500 my $consistent=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
501
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
502 #number of reads that map inconsistent with Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
503 my $inconsistent=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
504
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
505 # number of potential star reads map in good consistence with Drosha/Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
506 # (3' overhangs relative to mature product)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
507 my $star_perfect=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
508
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
509 # number of potential star reads that do not map in good consistence with 3' overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
510 my $star_fuzzy=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
511
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
512
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
513 #sort queries (deep sequences) by their position on the hairpin
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
514 my @queries=sort {$hash_query{$a}{"subject_beg"} <=> $hash_query{$b}{"subject_beg"}} keys %hash_query;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
515
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
516 foreach my $query(@queries){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
517
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
518 #number of reads that the deep sequence represents
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
519 unless(defined($hash_query{$query}{"freq"})){next;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
520 my $query_freq=$hash_query{$query}{"freq"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
521
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
522 #test which Dicer product (if any) the deep sequence corresponds to
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
523 my $product=test_query($query);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
524
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
525 #if the deep sequence corresponds to a Dicer product, add to the 'consistent' variable
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
526 if($product){$consistent+=$query_freq;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
527
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
528 #if the deep sequence do not correspond to a Dicer product, add to the 'inconsistent' variable
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
529 else{$inconsistent+=$query_freq;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
530
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
531 #test a potential star sequence has good 3' overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
532 if($product eq "star"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
533 if(test_star($query)){$star_perfect+=$query_freq;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
534 else{$star_fuzzy+=$query_freq;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
535 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
536 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
537
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
538 # if the majority of potential star sequences map in good accordance with 3' overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
539 # score for the presence of star evidence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
540 if($star_perfect>$star_fuzzy){$hash_comp{"star_read"}=1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
541
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
542 #total number of reads mapping to the hairpin
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
543 my $freq=$consistent+$inconsistent;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
544 $hash_comp{"freq"}=$freq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
545 unless($freq>0){filter_s("read frequency too low"); return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
546
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
547 #unless >90% of the reads map in consistence with Dicer processing, the hairpin is discarded
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
548 my $inconsistent_fraction=$inconsistent/($inconsistent+$consistent);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
549 unless($inconsistent_fraction<=0.1){filter_p("inconsistent\t$inconsistent\nconsistent\t$consistent"); return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
550
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
551 #the hairpin is retained
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
552 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
553 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
554
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
555 sub test_star{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
556
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
557 #test if a deep sequence maps in good consistence with 3' overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
558
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
559 my $query=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
560
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
561 #5' begin and 3' end positions
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
562 my $beg=$hash_query{$query}{"subject_beg"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
563 my $end=$hash_query{$query}{"subject_end"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
564
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
565 #the difference between observed and expected begin positions must be 0 or 1
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
566 my $offset=$beg-$hash_comp{"star_beg"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
567 if($offset==0 or $offset==1 or $offset==-1){return 1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
568
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
569 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
570 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
571
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
572
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
573
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
574 sub test_query{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
575
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
576 #test if deep sequence maps in consistence with Dicer processing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
577
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
578 my $query=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
579
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
580 #begin, end, strand and read count
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
581 my $beg=$hash_query{$query}{"subject_beg"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
582 my $end=$hash_query{$query}{"subject_end"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
583 my $strand=$hash_query{$query}{"strand"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
584 my $freq=$hash_query{$query}{"freq"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
585
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
586 #should not be on the minus strand (although this has in fact anecdotally been observed for known miRNAs)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
587 if($strand eq '-'){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
588
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
589 #the deep sequence is allowed to stretch 2 nt beyond the expected 5' end
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
590 my $fuzz_beg=2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
591 #the deep sequence is allowed to stretch 5 nt beyond the expected 3' end
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
592 my $fuzz_end=2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
593
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
594 #if in accordance with Dicer processing, return the type of Dicer product
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
595 if(contained($beg,$end,$hash_comp{"mature_beg"}-$fuzz_beg,$hash_comp{"mature_end"}+$fuzz_end)){return "mature";}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
596 if(contained($beg,$end,$hash_comp{"star_beg"}-$fuzz_beg,$hash_comp{"star_end"}+$fuzz_end)){return "star";}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
597 if(contained($beg,$end,$hash_comp{"loop_beg"}-$fuzz_beg,$hash_comp{"loop_end"}+$fuzz_end)){return "loop";}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
598
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
599 #if not in accordance, return 0
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
600 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
601 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
602
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
603
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
604 sub pass_filtering_structure{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
605
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
606 #The potential precursor must form a hairpin with miRNA precursor-like characteristics
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
607
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
608 #return value
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
609 my $ret=1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
610
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
611 #potential mature, star, loop and lower flank parts must be identifiable
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
612 unless(test_components()){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
613
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
614 #no bifurcations
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
615 unless(no_bifurcations_precursor()){$ret=0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
616
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
617 #minimum 14 base pairings in duplex
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
618 unless(bp_duplex()>=15){$ret=0;filter_s("too few pairings in duplex");}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
619
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
620 #not more than 6 nt difference between mature and star length
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
621 unless(-6<diff_lng() and diff_lng()<6){$ret=0; filter_s("too big difference between mature and star length") }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
622
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
623 return $ret;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
624 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
625
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
626
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
627
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
628
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
629
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
630
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
631 sub test_components{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
632
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
633 #tests whether potential mature, star, loop and lower flank parts are identifiable
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
634
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
635 unless($hash_comp{"mature_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
636 filter_s("no mature");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
637 # print STDERR "no mature\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
638 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
639 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
640
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
641 unless($hash_comp{"star_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
642 filter_s("no star");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
643 # print STDERR "no star\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
644 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
645 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
646
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
647 unless($hash_comp{"loop_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
648 filter_s("no loop");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
649 # print STDERR "no loop\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
650 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
651 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
652
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
653 unless($hash_comp{"flank_first_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
654 filter_s("no flanks");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
655 #print STDERR "no flanks_first_struct\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
656 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
657 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
658
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
659 unless($hash_comp{"flank_second_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
660 filter_s("no flanks");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
661 # print STDERR "no flanks_second_struct\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
662 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
663 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
664 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
665 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
666
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
667
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
668
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
669
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
670
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
671 sub no_bifurcations_precursor{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
672
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
673 #tests whether there are bifurcations in the hairpin
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
674
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
675 #assembles the potential precursor sequence and structure from the expected Dicer products
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
676 #this is the expected biological precursor, in contrast with 'pri_seq' that includes
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
677 #some genomic flanks on both sides
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
678
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
679 my $pre_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
680 my $pre_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
681 if($hash_comp{"mature_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
682 $pre_struct.=$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
683 $pre_seq.=$hash_comp{"mature_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"star_seq"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
684 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
685 $pre_struct.=$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
686 $pre_seq.=$hash_comp{"star_seq"}.$hash_comp{"loop_seq"}.$hash_comp{"mature_seq"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
687 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
688
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
689 #read into hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
690 $hash_comp{"pre_struct"}=$pre_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
691 $hash_comp{"pre_seq"}=$pre_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
692
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
693 #simple pattern matching checks for bifurcations
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
694 unless($pre_struct=~/^((\.|\()+..(\.|\))+)$/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
695 filter_s("bifurcation in precursor");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
696 # print STDERR "bifurcation in precursor\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
697 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
698 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
699
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
700 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
701 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
702
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
703 sub bp_precursor{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
704
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
705 #total number of bps in the precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
706
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
707 my $pre_struct=$hash_comp{"pre_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
708
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
709 #simple pattern matching
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
710 my $pre_bps=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
711 while($pre_struct=~/\(/g){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
712 $pre_bps++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
713 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
714 return $pre_bps;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
715 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
716
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
717
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
718 sub bp_duplex{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
719
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
720 #total number of bps in the duplex
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
721
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
722 my $duplex_bps=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
723 my $mature_struct=$hash_comp{"mature_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
724
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
725 #simple pattern matching
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
726 while($mature_struct=~/(\(|\))/g){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
727 $duplex_bps++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
728 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
729 return $duplex_bps;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
730 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
731
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
732 sub diff_lng{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
733
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
734 #find difference between mature and star lengths
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
735
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
736 my $mature_lng=length $hash_comp{"mature_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
737 my $star_lng=length $hash_comp{"star_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
738 my $diff_lng=$mature_lng-$star_lng;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
739 return $diff_lng;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
740 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
741
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
742
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
743
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
744 sub do_test_assemble{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
745
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
746 # not currently used, tests if the 'pri_struct' as assembled from the parts (Dicer products, lower flanks)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
747 # is identical to 'pri_struct' before disassembly into parts
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
748
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
749 my $assemble_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
750
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
751 if($hash_comp{"flank_first_struct"} and $hash_comp{"mature_struct"} and $hash_comp{"loop_struct"} and $hash_comp{"star_struct"} and $hash_comp{"flank_second_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
752 if($hash_comp{"mature_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
753 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"star_struct"}.$hash_comp{"flank_second_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
754 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
755 $assemble_struct.=$hash_comp{"flank_first_struct"}.$hash_comp{"star_struct"}.$hash_comp{"loop_struct"}.$hash_comp{"mature_struct"}.$hash_comp{"flank_second_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
756 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
757 unless($assemble_struct eq $hash_comp{"pri_struct"}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
758 $hash_comp{"test_assemble"}=$assemble_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
759 print_hash_comp();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
760 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
761 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
762 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
763 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
764
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
765
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
766
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
767 sub fill_structure{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
768
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
769 #reads the dot bracket structure into the 'bp' hash where each key and value are basepaired
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
770
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
771 my $struct=$hash_struct{$subject_old};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
772 my $lng=length $struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
773
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
774 #local stack for keeping track of basepairings
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
775 my @bps;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
776
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
777 for(my $pos=1;$pos<=$lng;$pos++){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
778 my $struct_pos=excise_struct($struct,$pos,$pos,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
779
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
780 if($struct_pos eq "("){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
781 push(@bps,$pos);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
782 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
783
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
784 if($struct_pos eq ")"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
785 my $pos_prev=pop(@bps);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
786 $hash_bp{$pos_prev}=$pos;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
787 $hash_bp{$pos}=$pos_prev;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
788 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
789 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
790 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
791 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
792
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
793
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
794
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
795 sub fill_star{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
796
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
797 #fills specifics on the expected star strand into 'comp' hash ('component' hash)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
798
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
799 #if the mature sequence is not plausible, don't look for the star arm
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
800 my $mature_arm=$hash_comp{"mature_arm"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
801 unless($mature_arm){$hash_comp{"star_arm"}=0; return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
802
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
803 #if the star sequence is not plausible, don't fill into the hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
804 my($star_beg,$star_end)=find_star();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
805 my $star_arm=arm_star($star_beg,$star_end);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
806 unless($star_arm){return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
807
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
808 #excise expected star sequence and structure
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
809 my $star_seq=excise_seq($hash_comp{"pri_seq"},$star_beg,$star_end,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
810 my $star_struct=excise_seq($hash_comp{"pri_struct"},$star_beg,$star_end,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
811
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
812 #fill into hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
813 $hash_comp{"star_beg"}=$star_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
814 $hash_comp{"star_end"}=$star_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
815 $hash_comp{"star_seq"}=$star_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
816 $hash_comp{"star_struct"}=$star_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
817 $hash_comp{"star_arm"}=$star_arm;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
818
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
819 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
820 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
821
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
822
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
823 sub find_star{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
824
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
825 #uses the 'bp' hash to find the expected star begin and end positions from the mature positions
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
826
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
827 #the -2 is for the overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
828 my $mature_beg=$hash_comp{"mature_beg"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
829 my $mature_end=$hash_comp{"mature_end"}-2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
830 my $mature_lng=$mature_end-$mature_beg+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
831
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
832 #in some cases, the last nucleotide of the mature sequence does not form a base pair,
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
833 #and therefore does not basepair with the first nucleotide of the star sequence.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
834 #In this case, the algorithm searches for the last nucleotide of the mature sequence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
835 #to form a base pair. The offset is the number of nucleotides searched through.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
836 my $offset_star_beg=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
837 my $offset_beg=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
838
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
839 #the offset should not be longer than the length of the mature sequence, then it
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
840 #means that the mature sequence does not form any base pairs
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
841 while(!$offset_star_beg and $offset_beg<$mature_lng){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
842 if($hash_bp{$mature_end-$offset_beg}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
843 $offset_star_beg=$hash_bp{$mature_end-$offset_beg};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
844 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
845 $offset_beg++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
846 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
847 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
848 #when defining the beginning of the star sequence, compensate for the offset
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
849 my $star_beg=$offset_star_beg-$offset_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
850
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
851 #same as above
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
852 my $offset_star_end=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
853 my $offset_end=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
854 while(!$offset_star_end and $offset_end<$mature_lng){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
855 if($hash_bp{$mature_beg+$offset_end}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
856 $offset_star_end=$hash_bp{$mature_beg+$offset_end};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
857 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
858 $offset_end++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
859 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
860 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
861 #the +2 is for the overhang
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
862 my $star_end=$offset_star_end+$offset_end+2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
863
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
864 return($star_beg,$star_end);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
865 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
866
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
867
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
868 sub fill_pri{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
869
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
870 #fills basic specifics on the precursor into the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
871
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
872 my $seq=$hash_seq{$subject_old};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
873 my $struct=$hash_struct{$subject_old};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
874 my $mfe=$hash_mfe{$subject_old};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
875 my $length=length $seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
876
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
877 $hash_comp{"pri_id"}=$subject_old;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
878 $hash_comp{"pri_seq"}=$seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
879 $hash_comp{"pri_struct"}=$struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
880 $hash_comp{"pri_mfe"}=$mfe;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
881 $hash_comp{"pri_beg"}=1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
882 $hash_comp{"pri_end"}=$length;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
883
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
884 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
885 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
886
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
887
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
888 sub fill_mature{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
889
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
890 #fills specifics on the mature sequence into the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
891
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
892 my $mature_query=find_mature_query();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
893 my($mature_beg,$mature_end)=find_positions_query($mature_query);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
894 my $mature_strand=find_strand_query($mature_query);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
895 my $mature_seq=excise_seq($hash_comp{"pri_seq"},$mature_beg,$mature_end,$mature_strand);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
896 my $mature_struct=excise_struct($hash_comp{"pri_struct"},$mature_beg,$mature_end,$mature_strand);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
897 my $mature_arm=arm_mature($mature_beg,$mature_end,$mature_strand);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
898
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
899 $hash_comp{"mature_query"}=$mature_query;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
900 $hash_comp{"mature_beg"}=$mature_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
901 $hash_comp{"mature_end"}=$mature_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
902 $hash_comp{"mature_strand"}=$mature_strand;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
903 $hash_comp{"mature_struct"}=$mature_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
904 $hash_comp{"mature_seq"}=$mature_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
905 $hash_comp{"mature_arm"}=$mature_arm;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
906
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
907 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
908 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
909
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
910
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
911
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
912 sub fill_loop{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
913
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
914 #fills specifics on the loop sequence into the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
915
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
916 #unless both mature and star sequences are plausible, do not look for the loop
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
917 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
918
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
919 my $loop_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
920 my $loop_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
921
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
922 #defining the begin and end positions of the loop from the mature and star positions
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
923 #excision depends on whether the mature or star sequence is 5' of the loop ('first')
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
924 if($hash_comp{"mature_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
925 $loop_beg=$hash_comp{"mature_end"}+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
926 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
927 $loop_end=$hash_comp{"mature_beg"}-1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
928 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
929
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
930 if($hash_comp{"star_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
931 $loop_beg=$hash_comp{"star_end"}+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
932 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
933 $loop_end=$hash_comp{"star_beg"}-1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
934 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
935
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
936 #unless the positions are plausible, do not fill into hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
937 unless(test_loop($loop_beg,$loop_end)){return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
938
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
939 my $loop_seq=excise_seq($hash_comp{"pri_seq"},$loop_beg,$loop_end,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
940 my $loop_struct=excise_struct($hash_comp{"pri_struct"},$loop_beg,$loop_end,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
941
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
942 $hash_comp{"loop_beg"}=$loop_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
943 $hash_comp{"loop_end"}=$loop_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
944 $hash_comp{"loop_seq"}=$loop_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
945 $hash_comp{"loop_struct"}=$loop_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
946
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
947 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
948 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
949
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
950
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
951 sub fill_lower_flanks{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
952
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
953 #fills specifics on the lower flanks and unpaired strands into the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
954
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
955 #unless both mature and star sequences are plausible, do not look for the flanks
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
956 unless($hash_comp{"mature_arm"} and $hash_comp{"star_arm"}){return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
957
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
958 my $flank_first_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
959 my $flank_second_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
960
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
961 #defining the begin and end positions of the flanks from the mature and star positions
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
962 #excision depends on whether the mature or star sequence is 5' in the potenitial precursor ('first')
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
963 if($hash_comp{"mature_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
964 $flank_first_end=$hash_comp{"mature_beg"}-1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
965 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
966 $flank_second_beg=$hash_comp{"mature_end"}+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
967 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
968
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
969 if($hash_comp{"star_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
970 $flank_first_end=$hash_comp{"star_beg"}-1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
971 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
972 $flank_second_beg=$hash_comp{"star_end"}+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
973 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
974
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
975 #unless the positions are plausible, do not fill into hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
976 unless(test_flanks($flank_first_end,$flank_second_beg)){return;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
977
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
978 $hash_comp{"flank_first_end"}=$flank_first_end;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
979 $hash_comp{"flank_second_beg"}=$flank_second_beg;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
980 $hash_comp{"flank_first_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
981 $hash_comp{"flank_second_seq"}=excise_seq($hash_comp{"pri_seq"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
982 $hash_comp{"flank_first_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"pri_beg"},$hash_comp{"flank_first_end"},"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
983 $hash_comp{"flank_second_struct"}=excise_struct($hash_comp{"pri_struct"},$hash_comp{"flank_second_beg"},$hash_comp{"pri_end"},"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
984
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
985 if($options{z}){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
986 fill_stems_drosha();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
987 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
988
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
989 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
990 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
991
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
992
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
993 sub fill_stems_drosha{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
994
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
995 #scores the number of base pairings formed by the first ten nt of the lower stems
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
996 #in general, the more stems, the higher the score contribution
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
997 #warning: this options has not been thoroughly tested
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
998
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
999 my $flank_first_struct=$hash_comp{"flank_first_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1000 my $flank_second_struct=$hash_comp{"flank_second_struct"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1001
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1002 my $stem_first=substr($flank_first_struct,-10);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1003 my $stem_second=substr($flank_second_struct,0,10);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1004
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1005 my $stem_bp_first=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1006 my $stem_bp_second=0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1007
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1008 #find base pairings by simple pattern matching
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1009 while($stem_first=~/\(/g){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1010 $stem_bp_first++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1011 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1012
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1013 while($stem_second=~/\)/g){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1014 $stem_bp_second++;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1015 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1016
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1017 my $stem_bp=min2($stem_bp_first,$stem_bp_second);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1018
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1019 $hash_comp{"stem_first"}=$stem_first;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1020 $hash_comp{"stem_second"}=$stem_second;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1021 $hash_comp{"stem_bp_first"}=$stem_bp_first;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1022 $hash_comp{"stem_bp_second"}=$stem_bp_second;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1023 $hash_comp{"stem_bp"}=$stem_bp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1024
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1025 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1026 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1027
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1028
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1029
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1030
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1031 sub arm_mature{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1032
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1033 #tests whether the mature sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1034
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1035 my ($beg,$end,$strand)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1036
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1037 #mature and star sequences should alway be on plus strand
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1038 if($strand eq "-"){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1039
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1040 #there should be no bifurcations and minimum one base pairing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1041 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,$strand);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1042 if(defined($struct) and $struct=~/^(\(|\.)+$/ and $struct=~/\(/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1043 return "first";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1044 }elsif(defined($struct) and $struct=~/^(\)|\.)+$/ and $struct=~/\)/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1045 return "second";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1046 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1047 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1048 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1049
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1050
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1051 sub arm_star{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1052
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1053 #tests whether the star sequence is in the 5' ('first') or 3' ('second') arm of the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1054
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1055 my ($beg,$end)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1056
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1057 #unless the begin and end positions are plausible, test negative
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1058 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1059
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1060 #no overlap between the mature and the star sequence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1061 if($hash_comp{"mature_arm"} eq "first"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1062 ($hash_comp{"mature_end"}<$beg) or return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1063 }elsif($hash_comp{"mature_arm"} eq "second"){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1064 ($end<$hash_comp{"mature_beg"}) or return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1065 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1066
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1067 #there should be no bifurcations and minimum one base pairing
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1068 my $struct=excise_seq($hash_comp{"pri_struct"},$beg,$end,"+");
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1069 if($struct=~/^(\(|\.)+$/ and $struct=~/\(/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1070 return "first";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1071 }elsif($struct=~/^(\)|\.)+$/ and $struct=~/\)/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1072 return "second";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1073 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1074 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1075 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1076
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1077
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1078 sub test_loop{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1079
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1080 #tests the loop positions
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1081
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1082 my ($beg,$end)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1083
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1084 #unless the begin and end positions are plausible, test negative
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1085 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1086
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1087 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1088 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1089
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1090
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1091 sub test_flanks{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1092
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1093 #tests the positions of the lower flanks
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1094
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1095 my ($beg,$end)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1096
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1097 #unless the begin and end positions are plausible, test negative
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1098 unless($beg>0 and $beg<=$hash_comp{"pri_end"} and $end>0 and $end<=$hash_comp{"pri_end"} and $beg<=$end){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1099
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1100 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1101 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1102
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1103
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1104 sub comp{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1105
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1106 #subroutine to retrive from the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1107
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1108 my $type=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1109 my $component=$hash_comp{$type};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1110 return $component;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1111 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1112
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1113
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1114 sub find_strand_query{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1115
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1116 #subroutine to find the strand for a given query
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1117
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1118 my $query=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1119 my $strand=$hash_query{$query}{"strand"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1120 return $strand;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1121 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1122
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1123
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1124 sub find_positions_query{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1125
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1126 #subroutine to find the begin and end positions for a given query
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1127
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1128 my $query=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1129 my $beg=$hash_query{$query}{"subject_beg"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1130 my $end=$hash_query{$query}{"subject_end"};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1131 return ($beg,$end);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1132 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1133
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1134
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1135
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1136 sub find_mature_query{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1137
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1138 #finds the query with the highest frequency of reads and returns it
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1139 #is used to determine the positions of the potential mature sequence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1140
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1141 my @queries=sort {$hash_query{$b}{"freq"} <=> $hash_query{$a}{"freq"}} keys %hash_query;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1142 my $mature_query=$queries[0];
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1143 return $mature_query;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1144 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1145
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1146
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1147
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1148
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1149 sub reset_variables{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1150
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1151 #resets the hashes for the next potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1152
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1153 # %hash_query=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1154 # %hash_comp=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1155 # %hash_bp=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1156 foreach my $key (keys %hash_query) {delete($hash_query{$key});}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1157 foreach my $key (keys %hash_comp) {delete($hash_comp{$key});}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1158 foreach my $key (keys %hash_bp) {delete($hash_bp{$key});}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1159
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1160 # $message_filter=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1161 # $message_score=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1162 # $lines=();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1163 undef($message_filter);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1164 undef($message_score);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1165 undef($lines);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1166 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1167 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1168
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1169
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1170
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1171 sub excise_seq{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1172
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1173 #excise sub sequence from the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1174
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1175 my($seq,$beg,$end,$strand)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1176
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1177 #begin can be equal to end if only one nucleotide is excised
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1178 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1179
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1180 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1181 #this happens once appr. every two thousand combinations
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1182 unless($beg<=length($seq)){$out_of_bound++;return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1183
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1184 #if on the minus strand, the reverse complement should be excised
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1185 if($strand eq "-"){$seq=revcom($seq);}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1186
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1187 #the blast parsed format is 1-indexed, substr is 0-indexed
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1188 my $sub_seq=substr($seq,$beg-1,$end-$beg+1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1189
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1190 return $sub_seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1191
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1192 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1193
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1194 sub excise_struct{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1195
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1196 #excise sub structure
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1197
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1198 my($struct,$beg,$end,$strand)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1199 my $lng=length $struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1200
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1201 #begin can be equal to end if only one nucleotide is excised
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1202 unless($beg<=$end){print STDERR "begin can not be smaller than end for $subject_old\n";exit;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1203
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1204 #rarely, permuted combinations of signature and structure cause out of bound excision errors.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1205 #this happens once appr. every two thousand combinations
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1206 unless($beg<=length($struct)){return 0;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1207
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1208 #if excising relative to minus strand, positions are reversed
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1209 if($strand eq "-"){($beg,$end)=rev_pos($beg,$end,$lng);}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1210
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1211 #the blast parsed format is 1-indexed, substr is 0-indexed
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1212 my $sub_struct=substr($struct,$beg-1,$end-$beg+1);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1213
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1214 return $sub_struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1215 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1216
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1217
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1218 sub create_hash_nuclei{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1219 #parses a fasta file with sequences of known miRNAs considered for conservation purposes
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1220 #reads the nuclei into a hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1221
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1222 my ($file) = @_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1223 my ($id, $desc, $sequence, $nucleus) = ();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1224
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1225 open (FASTA, "<$file") or die "can not open $file\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1226 while (<FASTA>)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1227 {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1228 chomp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1229 if (/^>(\S+)(.*)/)
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1230 {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1231 $id = $1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1232 $desc = $2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1233 $sequence = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1234 $nucleus = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1235 while (<FASTA>){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1236 chomp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1237 if (/^>(\S+)(.*)/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1238 $nucleus = substr($sequence,1,$nucleus_lng);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1239 $nucleus =~ tr/[T]/[U]/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1240 $hash_mirs{$nucleus} .="$id\t";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1241 $hash_nuclei{$nucleus} += 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1242
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1243 $id = $1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1244 $desc = $2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1245 $sequence = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1246 $nucleus = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1247 next;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1248 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1249 $sequence .= $_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1250 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1251 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1252 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1253 $nucleus = substr($sequence,1,$nucleus_lng);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1254 $nucleus =~ tr/[T]/[U]/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1255 $hash_mirs{$nucleus} .="$id\t";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1256 $hash_nuclei{$nucleus} += 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1257 close FASTA;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1258 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1259
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1260
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1261 sub parse_file_struct{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1262 #parses the output from RNAfoldand reads it into hashes
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1263 my($file) = @_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1264 my($id,$desc,$seq,$struct,$mfe) = ();
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1265 open (FILE_STRUCT, "<$file") or die "can not open $file\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1266 while (<FILE_STRUCT>){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1267 chomp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1268 if (/^>(\S+)\s*(.*)/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1269 $id= $1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1270 $desc= $2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1271 $seq= "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1272 $struct= "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1273 $mfe= "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1274 while (<FILE_STRUCT>){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1275 chomp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1276 if (/^>(\S+)\s*(.*)/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1277 $hash_desc{$id} = $desc;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1278 $hash_seq{$id} = $seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1279 $hash_struct{$id} = $struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1280 $hash_mfe{$id} = $mfe;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1281 $id = $1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1282 $desc = $2;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1283 $seq = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1284 $struct = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1285 $mfe = "";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1286 next;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1287 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1288 if(/^\w/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1289 tr/uU/tT/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1290 $seq .= $_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1291 next;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1292 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1293 if(/((\.|\(|\))+)/){$struct .=$1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1294 if(/\((\s*-\d+\.\d+)\)/){$mfe = $1;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1295 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1296 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1297 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1298 $hash_desc{$id} = $desc;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1299 $hash_seq{$id} = $seq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1300 $hash_struct{$id} = $struct;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1301 $hash_mfe{$id} = $mfe;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1302 close FILE_STRUCT;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1303 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1304 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1305
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1306
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1307 sub score_s{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1308
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1309 #this score message is appended to the end of the string of score messages outputted for the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1310
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1311 my $message=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1312 $message_score.=$message."\n";;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1313 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1314 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1315
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1316
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1317
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1318 sub score_p{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1319
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1320 #this score message is appended to the beginning of the string of score messages outputted for the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1321
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1322 my $message=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1323 $message_score=$message."\n".$message_score;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1324 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1325 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1326
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1327
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1328
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1329 sub filter_s{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1330
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1331 #this filtering message is appended to the end of the string of filtering messages outputted for the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1332
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1333 my $message=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1334 $message_filter.=$message."\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1335 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1336 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1337
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1338
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1339 sub filter_p{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1340
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1341 #this filtering message is appended to the beginning of the string of filtering messages outputted for the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1342
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1343 my $message=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1344 if(defined $message_filter){$message_filter=$message."\n".$message_filter;}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1345 else{$message_filter=$message."\n";}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1346 return;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1347 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1348
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1349
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1350 sub find_freq{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1351
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1352 #finds the frequency of a given read query from its id.
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1353
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1354 my($query)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1355
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1356 if($query=~/x(\d+)/i){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1357 my $freq=$1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1358 return $freq;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1359 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1360 #print STDERR "Problem with read format\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1361 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1362 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1363 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1364
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1365
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1366 sub print_hash_comp{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1367
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1368 #prints the 'comp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1369
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1370 my @keys=sort keys %hash_comp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1371 foreach my $key(@keys){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1372 my $value=$hash_comp{$key};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1373 print "$key \t$value\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1374 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1375 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1376
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1377
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1378
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1379 sub print_hash_bp{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1380
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1381 #prints the 'bp' hash
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1382
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1383 my @keys=sort {$a<=>$b} keys %hash_bp;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1384 foreach my $key(@keys){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1385 my $value=$hash_bp{$key};
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1386 print "$key\t$value\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1387 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1388 print "\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1389 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1390
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1391
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1392
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1393 sub find_strand{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1394
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1395 #A subroutine to find the strand, parsing different blast formats
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1396
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1397 my($other)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1398
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1399 my $strand="+";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1400
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1401 if($other=~/-/){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1402 $strand="-";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1403 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1404
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1405 if($other=~/minus/i){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1406 $strand="-";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1407 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1408 return($strand);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1409 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1410
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1411
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1412 sub contained{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1413
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1414 #Is the stretch defined by the first positions contained in the stretch defined by the second?
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1415
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1416 my($beg1,$end1,$beg2,$end2)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1417
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1418 testbeginend($beg1,$end1,$beg2,$end2);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1419
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1420 if($beg2<=$beg1 and $end1<=$end2){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1421 return 1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1422 }else{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1423 return 0;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1424 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1425 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1426
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1427
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1428 sub testbeginend{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1429
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1430 #Are the beginposition numerically smaller than the endposition for each pair?
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1431
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1432 my($begin1,$end1,$begin2,$end2)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1433
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1434 unless($begin1<=$end1 and $begin2<=$end2){
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1435 print STDERR "beg can not be larger than end for $subject_old\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1436 exit;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1437 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1438 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1439
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1440
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1441 sub rev_pos{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1442
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1443 # The blast_parsed format always uses positions that are relative to the 5' of the given strand
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1444 # This means that for a sequence of length n, the first nucleotide on the minus strand base pairs with
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1445 # the n't nucleotide on the plus strand
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1446
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1447 # This subroutine reverses the begin and end positions of positions of the minus strand so that they
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1448 # are relative to the 5' end of the plus strand
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1449
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1450 my($beg,$end,$lng)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1451
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1452 my $new_end=$lng-$beg+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1453 my $new_beg=$lng-$end+1;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1454
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1455 return($new_beg,$new_end);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1456 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1457
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1458 sub round {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1459
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1460 #rounds to nearest integer
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1461
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1462 my($number) = shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1463 return int($number + .5);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1464
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1465 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1466
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1467
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1468 sub rev{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1469
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1470 #reverses the order of nucleotides in a sequence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1471
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1472 my($sequence)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1473
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1474 my $rev=reverse $sequence;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1475
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1476 return $rev;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1477 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1478
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1479 sub com{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1480
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1481 #the complementary of a sequence
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1482
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1483 my($sequence)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1484
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1485 $sequence=~tr/acgtuACGTU/TGCAATGCAA/;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1486
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1487 return $sequence;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1488 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1489
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1490 sub revcom{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1491
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1492 #reverse complement
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1493
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1494 my($sequence)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1495
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1496 my $revcom=rev(com($sequence));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1497
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1498 return $revcom;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1499 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1500
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1501
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1502 sub max2 {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1503
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1504 #max of two numbers
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1505
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1506 my($a, $b) = @_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1507 return ($a>$b ? $a : $b);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1508 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1509
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1510 sub min2 {
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1511
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1512 #min of two numbers
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1513
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1514 my($a, $b) = @_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1515 return ($a<$b ? $a : $b);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1516 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1517
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1518
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1519
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1520 sub score_freq{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1521
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1522 # scores the count of reads that map to the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1523 # Assumes geometric distribution as described in methods section of manuscript
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1524
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1525 my $freq=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1526
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1527 #parameters of known precursors and background hairpins
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1528 my $parameter_test=0.999;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1529 my $parameter_control=0.6;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1530
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1531 #log_odds calculated directly to avoid underflow
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1532 my $intercept=log((1-$parameter_test)/(1-$parameter_control));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1533 my $slope=log($parameter_test/$parameter_control);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1534 my $log_odds=$slope*$freq+$intercept;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1535
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1536 #if no strong evidence for 3' overhangs, limit the score contribution to 0
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1537 unless($options{x} or $hash_comp{"star_read"}){$log_odds=min2($log_odds,0);}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1538
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1539 return $log_odds;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1540 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1541
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1542
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1543
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1544 ##sub score_mfe{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1545
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1546 # scores the minimum free energy in kCal/mol of the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1547 # Assumes Gumbel distribution as described in methods section of manuscript
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1548
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1549 ## my $mfe=shift;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1550
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1551 #numerical value, minimum 1
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1552 ## my $mfe_adj=max2(1,-$mfe);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1553
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1554 #parameters of known precursors and background hairpins, scale and location
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1555 ## my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1556 ## my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1557
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1558 ## my $odds=$prob_test/$prob_background;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1559 ## my $log_odds=log($odds);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1560
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1561 ## return $log_odds;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1562 ##}
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1563
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1564 sub score_mfe{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1565 # use bignum;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1566
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1567 # scores the minimum free energy in kCal/mol of the potential precursor
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1568 # Assumes Gumbel distribution as described in methods section of manuscript
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1569
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1570 my ($mfe,$mlng)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1571
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1572 #numerical value, minimum 1
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1573 my $mfe_adj=max2(1,-$mfe);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1574 my $mfe_adj1=$mfe/$mlng;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1575 #parameters of known precursors and background hairpins, scale and location
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1576 my $a=1.339e-12;my $b=2.778e-13;my $c=45.834;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1577 my $ev=$e**($mfe_adj1*$c);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1578 #print STDERR "\n***",$ev,"**\t",$ev+$b,"\t";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1579 my $log_odds=($a/($b+$ev));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1580
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1581
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1582 my $prob_test=prob_gumbel_discretized($mfe_adj,5.5,32);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1583 my $prob_background=prob_gumbel_discretized($mfe_adj,4.8,23);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1584
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1585 my $odds=$prob_test/$prob_background;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1586 my $log_odds_2=log($odds);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1587 #print STDERR "log_odds :",$log_odds,"\t",$log_odds_2,"\n";
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1588 return $log_odds;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1589 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1590
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1591
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1592
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1593 sub prob_gumbel_discretized{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1594
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1595 # discretized Gumbel distribution, probabilities within windows of 1 kCal/mol
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1596 # uses the subroutine that calculates the cdf to find the probabilities
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1597
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1598 my ($var,$scale,$location)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1599
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1600 my $bound_lower=$var-0.5;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1601 my $bound_upper=$var+0.5;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1602
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1603 my $cdf_lower=cdf_gumbel($bound_lower,$scale,$location);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1604 my $cdf_upper=cdf_gumbel($bound_upper,$scale,$location);
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1605
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1606 my $prob=$cdf_upper-$cdf_lower;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1607
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1608 return $prob;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1609 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1610
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1611
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1612 sub cdf_gumbel{
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1613
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1614 # calculates the cumulative distribution function of the Gumbel distribution
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1615
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1616 my ($var,$scale,$location)=@_;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1617
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1618 my $cdf=$e**(-($e**(-($var-$location)/$scale)));
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1619
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1620 return $cdf;
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1621 }
7b5a48b972e9 Uploaded
big-tiandm
parents:
diff changeset
1622