comparison orthologs/ucsb_hamster/translate.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 use strict;
3 use FindBin;
4 use lib "$FindBin::Bin/lib";
5 use Getopt::Long;
6 use Bio::Perl;
7 use Bio::SeqIO;
8 use File::Copy;
9
10 # PROGRAMNAME: translate.pl
11
12 # AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at
13
14 # PROGRAM DESCRIPTION:
15
16 # DATE: Tue May 12 14:03:34 CEST 2009
17
18
19 # DATE LAST MODIFIED:
20 ######################## start main #############################
21 my $help;
22 my @out;
23 my @estout;
24 my $infile;
25 my $trunc = 1;
26 my $outfile = "translate_tc.out";
27 my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are
28 ## too long, then one can run into troubles with the parsing of the hmmsearch results.
29 open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n";
30 print LOG "##### Translation of the ESTs########";
31 #########
32 my $usage = "Name:\n\ttranslate.pl\n
33 Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n
34 Description:\n\tThis program takes a batch fasta-file with DNA
35 \tsequences as an input and translates the individual DNA sequences in
36 \tall six reading frames.
37 \t-infile: provide the relative or absolute path of the infile\n
38 \t-outfile: provide the realtive or absolute path of the
39 \toutfile. Default is: translate_tc.out\n
40 \ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below).
41 \t-h: prints this help-message\n
42 NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n
43 NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included.";
44 ##########
45
46 GetOptions (
47 "h" => \$help,
48 "infile=s" => \$infile,
49 "outfile=s" => \$outfile,
50 "trunc=s" => \$trunc);
51 if ($help) {
52 print "$usage";
53 exit;
54 }
55 if (-e "$outfile") {
56 print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n";
57 my $newname = "$outfile.old";
58 rename($outfile, $newname);
59 }
60 ##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT
61 ##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO
62 #below replaces read_all_sequences function to declare that sequence is DNA
63 #original line next
64 #my @seq_object = read_all_sequences($infile, 'fasta');
65
66 my $tempseqio;
67 $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta');
68 my @seq_object;
69
70 while( my $seq = $tempseqio->next_seq() ) {
71 $seq->alphabet('dna');
72 push(@seq_object,$seq);
73 }
74 #End THO changes
75
76
77 ## determine whether the seq-ids are unique given the chosen value for $trunc
78 my ($message, $cont, $check) = &checkIds();
79 if ($cont == 1) {
80 ## the check for unique identifiers has failed and the programm is exiting
81 print LOG "$message\n";
82 exit;
83 }
84 else {
85 print LOG "All sequence identifier are unique!\n";
86 if ($check == 2) {
87 my $newname = "$infile.original";
88 rename($infile, $newname);
89 print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n";
90 }
91 for (my $j = 0; $j < @seq_object; $j++) {
92 my $finalid = $seq_object[$j]->{finalid};
93 my $estseq = $seq_object[$j]->seq;
94 my $inid = $seq_object[$j]->display_id;
95 my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]);
96 for (my $i = 0; $i < @all_trans; $i++) {
97 my $count = $i+1;
98 my $id = $all_trans[$i]->display_id;
99 my $seq = $all_trans[$i]->seq;
100 $id =~ s/$inid/$finalid/;
101 $id =~ s/-[0-9][RF]/_RF$count.0/;
102 push @out, ">$id\n$seq";
103 }
104 push @estout, ">$finalid\n$estseq";
105 if ($j%100 == 0) {
106 print "$j Sequences processed\n";
107 open (OUT, ">>$outfile") or die "failed to open outfile\n";
108 print OUT join "\n", @out;
109 print OUT "\n";
110 @out = qw();
111 close OUT;
112 if ($check == 2) {
113 ## part of the description was added to the seq-id
114 open (OUT, ">>$infile");
115 print OUT join "\n", @estout;
116 print OUT "\n";
117 @estout = qw();
118 }
119 }
120 }
121 open (OUT, ">>$outfile") or die "failed to open outfile\n";
122 print OUT join "\n", @out;
123 print OUT "\n";
124 @out = qw();
125 close OUT;
126 if ($check == 2) {
127 ## part of the description was added to the seq-id
128 open (OUT, ">>$infile");
129 print OUT join "\n", @estout;
130 print OUT "\n";
131 close OUT;
132 @estout = qw();
133 }
134 }
135 close LOG;
136 exit;
137 ########################## start sub ################
138 sub checkIds {
139 my $message;
140 my $check = 1;
141 my $cont = 1;
142 my $counter;
143 ## Everything up to the first whitespace
144 ## in the fasta header will be taken as sequence id by bioperl. If this
145 ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer
146 ## be unique. This will be checked and if necessary the id will not be truncated
147 ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1)
148 ## for $check == 1, the complete id will be checked
149 ## for $check == 2, the first 20 characters of the concatenated id and description
150 ## will be checked
151 if ($trunc == 1) {
152 $check = 0;
153 }
154
155 while ($check < 3 and $cont == 1) {
156 $cont = 0;
157 for (my $i=0; $i < @seq_object; $i++) {
158 my $id = $seq_object[$i]->display_id;
159 $id =~ s/(.{0,$limit}).*/$1/;
160 if ($check == 0) {
161 $id =~ s/|.*//;
162 }
163 elsif ($check == 2) {
164 print "Check 2: ".$seq_object[$i]->display_id." \n";
165 $id = $id . '_' . $seq_object[$i]->desc;
166 $id =~ s/(.{0,$limit}).*/$1/;
167 }
168 if (defined $counter->{$id}) {
169 if ($check == 0) {
170 $message = "trying next without truncating the id";
171 }
172 elsif ($check == 1) {
173 $message = 'trying next to include sequence description';
174 }
175 else {
176 $message = "Sequence identifier are not unique, using the first 20 characters. Aborting...";
177 }
178 print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
179 print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
180 $check ++;
181 $cont = 1;
182 $counter = undef;
183 last;
184 }
185 else {
186 $counter->{$id} = 1;
187 $seq_object[$i]->{finalid} = $id;
188 }
189 }
190 }
191 ## return the value of $cont. If this is 1, then the sequence id check has failed.
192 return($message, $cont, $check);
193 }