Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison orthologs/ucsb_hamster/lib/run_genewise.pm @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 package run_genewise; | |
2 use strict; | |
3 $ENV{'WISECONFIGDIR'} = "/home/osiris/galaxy-dist/tools/osiris/orthologs/ucsb_hamster/lib/wisecfg"; | |
4 # this module runs genewise on a DNA sequence and a protein sequence | |
5 # and then allows to parse this result. | |
6 # the constructor creates an object containing a reference to an array | |
7 # containing the file content | |
8 1; | |
9 sub new { | |
10 my $self_tmp = []; | |
11 my $self; | |
12 my ($class, $dna, $prot, $path) = @_; | |
13 if (!defined $path) { | |
14 $path = '/tmp'; | |
15 } | |
16 $dna =~ s/R/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
17 $dna =~ s/S/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
18 $dna =~ s/W/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
19 $dna =~ s/D/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
20 $dna =~ s/K/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
21 $dna =~ s/Y/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
22 $dna =~ s/B/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
23 $dna =~ s/V/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
24 $dna =~ s/M/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence | |
25 | |
26 # the file names | |
27 my $protname = 'protein'; | |
28 my $dnaname = 'dna'; | |
29 | |
30 ## print the two sequences to default path /tmp/ | |
31 open (DNA, ">$path/dna.fa") or die "could not open $path/dna.fa for writing\n"; | |
32 print DNA ">$dnaname\n$dna"; | |
33 close DNA; | |
34 open (PROTEIN, ">$path/prot.fa") or die "could not open $path/prot.fa for writing\n"; | |
35 print PROTEIN ">$protname\n$prot"; | |
36 close PROTEIN; | |
37 | |
38 ## run genewise on the two sequences | |
39 `echo \$WISECONFIGDIR`; | |
40 | |
41 # $self_tmp = [`.\/genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; | |
42 #THO--For Galaxy run Genewise in the path | |
43 $self_tmp = [`genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`]; | |
44 for (my $i = 0; $i < @$self_tmp; $i++) { | |
45 $self_tmp->[$i] =~ s/\s{1,}$//; | |
46 } | |
47 $self->{gw} = $self_tmp; | |
48 $self->{nt_seq} = $dna; | |
49 $self->{prot_seq} = $prot; | |
50 $self->{protname} = $protname; | |
51 $self->{dnaname} = $dnaname; | |
52 $self->{gw_count} = @$self_tmp; | |
53 $self->{get_indel} = 1; ## per default the indel-part is recovererd, rather than masked by 'N'. See code for details | |
54 $self->{indels} = _GetIndels($self_tmp); | |
55 bless ($self, $class); | |
56 return $self;} | |
57 ################# | |
58 ## sub score extract the score for the alignment | |
59 sub score { | |
60 my $self = shift; | |
61 my $score; | |
62 for (my $i = 0; $i < $self->{gw_count}; $i ++) { | |
63 if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) { | |
64 $score = $1; | |
65 last; | |
66 } | |
67 } | |
68 return ($score); | |
69 } | |
70 ################## | |
71 sub protein { | |
72 my $self = shift; | |
73 my $gw = $self->{gw}; | |
74 my $prot = ''; | |
75 for (my $i = 0; $i < @$gw; $i++) { | |
76 if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts | |
77 my $count = 1; | |
78 while ($gw->[$i+$count] ne '//') { | |
79 my $protpart = $gw->[$i+$count]; | |
80 chomp $protpart; | |
81 $prot .= $protpart; | |
82 $count ++; | |
83 } | |
84 } | |
85 elsif (length $prot > 0) { | |
86 last; | |
87 } | |
88 } | |
89 return($prot); | |
90 } | |
91 ################## | |
92 sub translation { | |
93 my $self = shift; | |
94 my $finish = 0; | |
95 my $translated_seq = ''; | |
96 my @transtmp; | |
97 | |
98 ## step 1: extract the relevant info from the genewise output | |
99 | |
100 for (my $i = 0; $i < $self->{gw_count}; $i++) { | |
101 if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts | |
102 while ($self->{gw}->[$i] !~ '//') { | |
103 push @transtmp, $self->{gw}->[$i]; | |
104 $i++; | |
105 } | |
106 last; # end the for loop since nothing left to be done | |
107 } | |
108 } | |
109 | |
110 ## step two: get the sequences | |
111 my $count = -1; | |
112 my $trans; | |
113 for (my $i = 0; $i < @transtmp; $i++) { | |
114 if ($transtmp[$i] =~ />/) { | |
115 $count++; | |
116 $trans->[$count]->{seq} = ''; # initialize | |
117 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { | |
118 $trans->[$count]->{start} = $1; | |
119 $trans->[$count]->{end} = $2; | |
120 } | |
121 } | |
122 else { | |
123 $trans->[$count]->{seq} .= $transtmp[$i]; | |
124 } | |
125 } | |
126 | |
127 ## step 3: connect the fragments | |
128 if (@$trans == 1) { | |
129 $translated_seq = $trans->[0]->{seq}; | |
130 } | |
131 else { | |
132 for (my $i = 0; $i < @$trans; $i++) { | |
133 $translated_seq .= $trans->[$i]->{seq}; | |
134 if ($i < (@$trans - 1)) { | |
135 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; | |
136 $translated_seq .= 'X'; | |
137 } | |
138 } | |
139 } | |
140 return($translated_seq); | |
141 } | |
142 | |
143 ################## | |
144 sub codons { | |
145 my $self = shift; | |
146 my $finish = 0; | |
147 my $codon_seq = ''; | |
148 my @transtmp; | |
149 | |
150 ## step 1: extract the relevant info from the genewise output | |
151 for (my $i = 0; $i < $self->{gw_count}; $i++) { | |
152 if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts | |
153 while ($self->{gw}->[$i] !~ '//') { | |
154 push @transtmp, $self->{gw}->[$i]; | |
155 $i++; | |
156 } | |
157 last; # end the for loop since nothing left to be done | |
158 } | |
159 } | |
160 | |
161 ## step two: get the sequences | |
162 my $count = -1; | |
163 my $trans; | |
164 for (my $i = 0; $i < @transtmp; $i++) { | |
165 if ($transtmp[$i] =~ />/) { | |
166 $count++; | |
167 $trans->[$count]->{seq} = ''; # initialize | |
168 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) { | |
169 $trans->[$count]->{start} = $1; | |
170 $trans->[$count]->{end} = $2; | |
171 } | |
172 } | |
173 else { | |
174 $transtmp[$i] =~ tr/a-z/A-Z/; | |
175 $trans->[$count]->{seq} .= $transtmp[$i]; | |
176 } | |
177 } | |
178 | |
179 ## step 3: connect the fragments | |
180 if ( @$trans == 1) { | |
181 $codon_seq = $trans->[0]->{seq}; | |
182 } | |
183 else { | |
184 for (my $i = 0; $i < @$trans; $i++) { | |
185 $codon_seq .= $trans->[$i]->{seq}; | |
186 if ($i < (@$trans - 1)) { | |
187 my $indel = ''; | |
188 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1; | |
189 | |
190 ## now decide whether the nts that did not got translated are masked by | |
191 ## 'N' or whether they will be represented as lower case letters | |
192 if ($self->{get_indel}) { | |
193 $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing); | |
194 $indel =~ tr/A-Z/a-z/; | |
195 } | |
196 else { | |
197 $indel = 'N' x $missing; | |
198 } | |
199 ## now append gap characters until the frame is recovered. Not that the gap | |
200 ## characters are added to the end of the indel-part. Thus, the codons are | |
201 ## not considered. | |
202 while (length($indel)%3 != 0) { | |
203 $indel .= '-'; | |
204 } | |
205 | |
206 $codon_seq .= $indel; | |
207 } | |
208 } | |
209 } | |
210 return ($codon_seq); | |
211 } | |
212 ########################### | |
213 sub protein_borders { | |
214 my $self = shift; | |
215 my $gw = $self->{gw}; | |
216 for (my $i = 0; $i < @$gw; $i++) { | |
217 if ($gw->[$i] =~ /Bits.*introns$/) { | |
218 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; | |
219 return($start, $end); | |
220 } | |
221 else { | |
222 die "no protein-start and end could not be determnined. Check genewise command\n"; | |
223 } | |
224 } | |
225 } | |
226 ########################## | |
227 sub cdna_borders { | |
228 my $self = shift; | |
229 my $gw = $self->{gw}; | |
230 for (my $i = 0; $i < @$gw; $i++) { | |
231 if ($gw->[$i] =~ /Bits.*introns$/) { | |
232 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/; | |
233 return($start, $end); | |
234 } | |
235 else { | |
236 die "no cdna-start and end could not be determnined. Check genewise command\n"; | |
237 } | |
238 } | |
239 } | |
240 ########################## | |
241 sub _GetIndels { | |
242 my $gw = shift; | |
243 my $indel; | |
244 for (my $i = 0; $i < @$gw; $i++) { | |
245 if ($gw->[$i] =~ /Bits/) { | |
246 $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/; | |
247 return($indel); | |
248 } | |
249 } | |
250 } |