comparison orthologs/ucsb_hamster/lib/run_genewise.pm @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 package run_genewise;
2 use strict;
3 $ENV{'WISECONFIGDIR'} = "/home/osiris/galaxy-dist/tools/osiris/orthologs/ucsb_hamster/lib/wisecfg";
4 # this module runs genewise on a DNA sequence and a protein sequence
5 # and then allows to parse this result.
6 # the constructor creates an object containing a reference to an array
7 # containing the file content
8 1;
9 sub new {
10 my $self_tmp = [];
11 my $self;
12 my ($class, $dna, $prot, $path) = @_;
13 if (!defined $path) {
14 $path = '/tmp';
15 }
16 $dna =~ s/R/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
17 $dna =~ s/S/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
18 $dna =~ s/W/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
19 $dna =~ s/D/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
20 $dna =~ s/K/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
21 $dna =~ s/Y/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
22 $dna =~ s/B/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
23 $dna =~ s/V/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
24 $dna =~ s/M/N/g; #Added by THO -- genewise crashed with 'R' in dna sequence
25
26 # the file names
27 my $protname = 'protein';
28 my $dnaname = 'dna';
29
30 ## print the two sequences to default path /tmp/
31 open (DNA, ">$path/dna.fa") or die "could not open $path/dna.fa for writing\n";
32 print DNA ">$dnaname\n$dna";
33 close DNA;
34 open (PROTEIN, ">$path/prot.fa") or die "could not open $path/prot.fa for writing\n";
35 print PROTEIN ">$protname\n$prot";
36 close PROTEIN;
37
38 ## run genewise on the two sequences
39 `echo \$WISECONFIGDIR`;
40
41 # $self_tmp = [`.\/genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`];
42 #THO--For Galaxy run Genewise in the path
43 $self_tmp = [`genewise -trans -cdna -pep -sum $path/prot.fa $path/dna.fa`];
44 for (my $i = 0; $i < @$self_tmp; $i++) {
45 $self_tmp->[$i] =~ s/\s{1,}$//;
46 }
47 $self->{gw} = $self_tmp;
48 $self->{nt_seq} = $dna;
49 $self->{prot_seq} = $prot;
50 $self->{protname} = $protname;
51 $self->{dnaname} = $dnaname;
52 $self->{gw_count} = @$self_tmp;
53 $self->{get_indel} = 1; ## per default the indel-part is recovererd, rather than masked by 'N'. See code for details
54 $self->{indels} = _GetIndels($self_tmp);
55 bless ($self, $class);
56 return $self;}
57 #################
58 ## sub score extract the score for the alignment
59 sub score {
60 my $self = shift;
61 my $score;
62 for (my $i = 0; $i < $self->{gw_count}; $i ++) {
63 if ($self->{gw}->[$i] =~ /^(\d{1,}\.{0,1}\d{0,}).*/) {
64 $score = $1;
65 last;
66 }
67 }
68 return ($score);
69 }
70 ##################
71 sub protein {
72 my $self = shift;
73 my $gw = $self->{gw};
74 my $prot = '';
75 for (my $i = 0; $i < @$gw; $i++) {
76 if ($gw->[$i] =~ />.*\.pep/) { #the protein seq starts
77 my $count = 1;
78 while ($gw->[$i+$count] ne '//') {
79 my $protpart = $gw->[$i+$count];
80 chomp $protpart;
81 $prot .= $protpart;
82 $count ++;
83 }
84 }
85 elsif (length $prot > 0) {
86 last;
87 }
88 }
89 return($prot);
90 }
91 ##################
92 sub translation {
93 my $self = shift;
94 my $finish = 0;
95 my $translated_seq = '';
96 my @transtmp;
97
98 ## step 1: extract the relevant info from the genewise output
99
100 for (my $i = 0; $i < $self->{gw_count}; $i++) {
101 if ($self->{gw}->[$i] =~ />.*.tr/) {# a translated bit starts
102 while ($self->{gw}->[$i] !~ '//') {
103 push @transtmp, $self->{gw}->[$i];
104 $i++;
105 }
106 last; # end the for loop since nothing left to be done
107 }
108 }
109
110 ## step two: get the sequences
111 my $count = -1;
112 my $trans;
113 for (my $i = 0; $i < @transtmp; $i++) {
114 if ($transtmp[$i] =~ />/) {
115 $count++;
116 $trans->[$count]->{seq} = ''; # initialize
117 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) {
118 $trans->[$count]->{start} = $1;
119 $trans->[$count]->{end} = $2;
120 }
121 }
122 else {
123 $trans->[$count]->{seq} .= $transtmp[$i];
124 }
125 }
126
127 ## step 3: connect the fragments
128 if (@$trans == 1) {
129 $translated_seq = $trans->[0]->{seq};
130 }
131 else {
132 for (my $i = 0; $i < @$trans; $i++) {
133 $translated_seq .= $trans->[$i]->{seq};
134 if ($i < (@$trans - 1)) {
135 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1;
136 $translated_seq .= 'X';
137 }
138 }
139 }
140 return($translated_seq);
141 }
142
143 ##################
144 sub codons {
145 my $self = shift;
146 my $finish = 0;
147 my $codon_seq = '';
148 my @transtmp;
149
150 ## step 1: extract the relevant info from the genewise output
151 for (my $i = 0; $i < $self->{gw_count}; $i++) {
152 if ($self->{gw}->[$i] =~ />.*sp$/) {# the codons set starts
153 while ($self->{gw}->[$i] !~ '//') {
154 push @transtmp, $self->{gw}->[$i];
155 $i++;
156 }
157 last; # end the for loop since nothing left to be done
158 }
159 }
160
161 ## step two: get the sequences
162 my $count = -1;
163 my $trans;
164 for (my $i = 0; $i < @transtmp; $i++) {
165 if ($transtmp[$i] =~ />/) {
166 $count++;
167 $trans->[$count]->{seq} = ''; # initialize
168 if ($transtmp[$i] =~ /.*\[([0-9]{1,}):([0-9]{1,})\].*/) {
169 $trans->[$count]->{start} = $1;
170 $trans->[$count]->{end} = $2;
171 }
172 }
173 else {
174 $transtmp[$i] =~ tr/a-z/A-Z/;
175 $trans->[$count]->{seq} .= $transtmp[$i];
176 }
177 }
178
179 ## step 3: connect the fragments
180 if ( @$trans == 1) {
181 $codon_seq = $trans->[0]->{seq};
182 }
183 else {
184 for (my $i = 0; $i < @$trans; $i++) {
185 $codon_seq .= $trans->[$i]->{seq};
186 if ($i < (@$trans - 1)) {
187 my $indel = '';
188 my $missing = $trans->[$i+1]->{start} - $trans->[$i]->{end} -1;
189
190 ## now decide whether the nts that did not got translated are masked by
191 ## 'N' or whether they will be represented as lower case letters
192 if ($self->{get_indel}) {
193 $indel = substr($self->{nt_seq}, $trans->[$i]->{end}, $missing);
194 $indel =~ tr/A-Z/a-z/;
195 }
196 else {
197 $indel = 'N' x $missing;
198 }
199 ## now append gap characters until the frame is recovered. Not that the gap
200 ## characters are added to the end of the indel-part. Thus, the codons are
201 ## not considered.
202 while (length($indel)%3 != 0) {
203 $indel .= '-';
204 }
205
206 $codon_seq .= $indel;
207 }
208 }
209 }
210 return ($codon_seq);
211 }
212 ###########################
213 sub protein_borders {
214 my $self = shift;
215 my $gw = $self->{gw};
216 for (my $i = 0; $i < @$gw; $i++) {
217 if ($gw->[$i] =~ /Bits.*introns$/) {
218 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{protname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/;
219 return($start, $end);
220 }
221 else {
222 die "no protein-start and end could not be determnined. Check genewise command\n";
223 }
224 }
225 }
226 ##########################
227 sub cdna_borders {
228 my $self = shift;
229 my $gw = $self->{gw};
230 for (my $i = 0; $i < @$gw; $i++) {
231 if ($gw->[$i] =~ /Bits.*introns$/) {
232 my ($start, $end) = $gw->[$i+1] =~ /.*$self->{dnaname}\s{1,}([0-9]{1,})\s{1,}([0-9]{1,}).*/;
233 return($start, $end);
234 }
235 else {
236 die "no cdna-start and end could not be determnined. Check genewise command\n";
237 }
238 }
239 }
240 ##########################
241 sub _GetIndels {
242 my $gw = shift;
243 my $indel;
244 for (my $i = 0; $i < @$gw; $i++) {
245 if ($gw->[$i] =~ /Bits/) {
246 $indel = $gw->[$i+1] =~ /.*([0-9]{1,})/;
247 return($indel);
248 }
249 }
250 }