Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison orthologs/ucsb_hamster/translate.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use FindBin; | |
4 use lib "$FindBin::Bin/lib"; | |
5 use Getopt::Long; | |
6 use Bio::Perl; | |
7 use Bio::SeqIO; | |
8 use File::Copy; | |
9 | |
10 # PROGRAMNAME: translate.pl | |
11 | |
12 # AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at | |
13 | |
14 # PROGRAM DESCRIPTION: | |
15 | |
16 # DATE: Tue May 12 14:03:34 CEST 2009 | |
17 | |
18 | |
19 # DATE LAST MODIFIED: | |
20 ######################## start main ############################# | |
21 my $help; | |
22 my @out; | |
23 my @estout; | |
24 my $infile; | |
25 my $trunc = 1; | |
26 my $outfile = "translate_tc.out"; | |
27 my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are | |
28 ## too long, then one can run into troubles with the parsing of the hmmsearch results. | |
29 open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n"; | |
30 print LOG "##### Translation of the ESTs########"; | |
31 ######### | |
32 my $usage = "Name:\n\ttranslate.pl\n | |
33 Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n | |
34 Description:\n\tThis program takes a batch fasta-file with DNA | |
35 \tsequences as an input and translates the individual DNA sequences in | |
36 \tall six reading frames. | |
37 \t-infile: provide the relative or absolute path of the infile\n | |
38 \t-outfile: provide the realtive or absolute path of the | |
39 \toutfile. Default is: translate_tc.out\n | |
40 \ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below). | |
41 \t-h: prints this help-message\n | |
42 NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n | |
43 NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included."; | |
44 ########## | |
45 | |
46 GetOptions ( | |
47 "h" => \$help, | |
48 "infile=s" => \$infile, | |
49 "outfile=s" => \$outfile, | |
50 "trunc=s" => \$trunc); | |
51 if ($help) { | |
52 print "$usage"; | |
53 exit; | |
54 } | |
55 if (-e "$outfile") { | |
56 print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n"; | |
57 my $newname = "$outfile.old"; | |
58 rename($outfile, $newname); | |
59 } | |
60 ##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT | |
61 ##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO | |
62 #below replaces read_all_sequences function to declare that sequence is DNA | |
63 #original line next | |
64 #my @seq_object = read_all_sequences($infile, 'fasta'); | |
65 | |
66 my $tempseqio; | |
67 $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta'); | |
68 my @seq_object; | |
69 | |
70 while( my $seq = $tempseqio->next_seq() ) { | |
71 $seq->alphabet('dna'); | |
72 push(@seq_object,$seq); | |
73 } | |
74 #End THO changes | |
75 | |
76 | |
77 ## determine whether the seq-ids are unique given the chosen value for $trunc | |
78 my ($message, $cont, $check) = &checkIds(); | |
79 if ($cont == 1) { | |
80 ## the check for unique identifiers has failed and the programm is exiting | |
81 print LOG "$message\n"; | |
82 exit; | |
83 } | |
84 else { | |
85 print LOG "All sequence identifier are unique!\n"; | |
86 if ($check == 2) { | |
87 my $newname = "$infile.original"; | |
88 rename($infile, $newname); | |
89 print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n"; | |
90 } | |
91 for (my $j = 0; $j < @seq_object; $j++) { | |
92 my $finalid = $seq_object[$j]->{finalid}; | |
93 my $estseq = $seq_object[$j]->seq; | |
94 my $inid = $seq_object[$j]->display_id; | |
95 my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]); | |
96 for (my $i = 0; $i < @all_trans; $i++) { | |
97 my $count = $i+1; | |
98 my $id = $all_trans[$i]->display_id; | |
99 my $seq = $all_trans[$i]->seq; | |
100 $id =~ s/$inid/$finalid/; | |
101 $id =~ s/-[0-9][RF]/_RF$count.0/; | |
102 push @out, ">$id\n$seq"; | |
103 } | |
104 push @estout, ">$finalid\n$estseq"; | |
105 if ($j%100 == 0) { | |
106 print "$j Sequences processed\n"; | |
107 open (OUT, ">>$outfile") or die "failed to open outfile\n"; | |
108 print OUT join "\n", @out; | |
109 print OUT "\n"; | |
110 @out = qw(); | |
111 close OUT; | |
112 if ($check == 2) { | |
113 ## part of the description was added to the seq-id | |
114 open (OUT, ">>$infile"); | |
115 print OUT join "\n", @estout; | |
116 print OUT "\n"; | |
117 @estout = qw(); | |
118 } | |
119 } | |
120 } | |
121 open (OUT, ">>$outfile") or die "failed to open outfile\n"; | |
122 print OUT join "\n", @out; | |
123 print OUT "\n"; | |
124 @out = qw(); | |
125 close OUT; | |
126 if ($check == 2) { | |
127 ## part of the description was added to the seq-id | |
128 open (OUT, ">>$infile"); | |
129 print OUT join "\n", @estout; | |
130 print OUT "\n"; | |
131 close OUT; | |
132 @estout = qw(); | |
133 } | |
134 } | |
135 close LOG; | |
136 exit; | |
137 ########################## start sub ################ | |
138 sub checkIds { | |
139 my $message; | |
140 my $check = 1; | |
141 my $cont = 1; | |
142 my $counter; | |
143 ## Everything up to the first whitespace | |
144 ## in the fasta header will be taken as sequence id by bioperl. If this | |
145 ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer | |
146 ## be unique. This will be checked and if necessary the id will not be truncated | |
147 ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1) | |
148 ## for $check == 1, the complete id will be checked | |
149 ## for $check == 2, the first 20 characters of the concatenated id and description | |
150 ## will be checked | |
151 if ($trunc == 1) { | |
152 $check = 0; | |
153 } | |
154 | |
155 while ($check < 3 and $cont == 1) { | |
156 $cont = 0; | |
157 for (my $i=0; $i < @seq_object; $i++) { | |
158 my $id = $seq_object[$i]->display_id; | |
159 $id =~ s/(.{0,$limit}).*/$1/; | |
160 if ($check == 0) { | |
161 $id =~ s/|.*//; | |
162 } | |
163 elsif ($check == 2) { | |
164 print "Check 2: ".$seq_object[$i]->display_id." \n"; | |
165 $id = $id . '_' . $seq_object[$i]->desc; | |
166 $id =~ s/(.{0,$limit}).*/$1/; | |
167 } | |
168 if (defined $counter->{$id}) { | |
169 if ($check == 0) { | |
170 $message = "trying next without truncating the id"; | |
171 } | |
172 elsif ($check == 1) { | |
173 $message = 'trying next to include sequence description'; | |
174 } | |
175 else { | |
176 $message = "Sequence identifier are not unique, using the first 20 characters. Aborting..."; | |
177 } | |
178 print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; | |
179 print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; | |
180 $check ++; | |
181 $cont = 1; | |
182 $counter = undef; | |
183 last; | |
184 } | |
185 else { | |
186 $counter->{$id} = 1; | |
187 $seq_object[$i]->{finalid} = $id; | |
188 } | |
189 } | |
190 } | |
191 ## return the value of $cont. If this is 1, then the sequence id check has failed. | |
192 return($message, $cont, $check); | |
193 } |