Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
diff orthologs/ucsb_hamster/translate.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/orthologs/ucsb_hamster/translate.pl Tue Mar 11 12:19:13 2014 -0700 @@ -0,0 +1,193 @@ +#!/usr/bin/perl -w +use strict; +use FindBin; +use lib "$FindBin::Bin/lib"; +use Getopt::Long; +use Bio::Perl; +use Bio::SeqIO; +use File::Copy; + +# PROGRAMNAME: translate.pl + +# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at + +# PROGRAM DESCRIPTION: + +# DATE: Tue May 12 14:03:34 CEST 2009 + + +# DATE LAST MODIFIED: +######################## start main ############################# +my $help; +my @out; +my @estout; +my $infile; +my $trunc = 1; +my $outfile = "translate_tc.out"; +my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are +## too long, then one can run into troubles with the parsing of the hmmsearch results. +open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n"; +print LOG "##### Translation of the ESTs########"; +######### +my $usage = "Name:\n\ttranslate.pl\n +Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n +Description:\n\tThis program takes a batch fasta-file with DNA +\tsequences as an input and translates the individual DNA sequences in +\tall six reading frames. +\t-infile: provide the relative or absolute path of the infile\n +\t-outfile: provide the realtive or absolute path of the +\toutfile. Default is: translate_tc.out\n +\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below). +\t-h: prints this help-message\n +NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n +NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included."; +########## + +GetOptions ( + "h" => \$help, + "infile=s" => \$infile, + "outfile=s" => \$outfile, + "trunc=s" => \$trunc); +if ($help) { + print "$usage"; + exit; +} +if (-e "$outfile") { + print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n"; + my $newname = "$outfile.old"; + rename($outfile, $newname); +} +##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT +##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO +#below replaces read_all_sequences function to declare that sequence is DNA +#original line next +#my @seq_object = read_all_sequences($infile, 'fasta'); + + my $tempseqio; + $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta'); + my @seq_object; + + while( my $seq = $tempseqio->next_seq() ) { + $seq->alphabet('dna'); + push(@seq_object,$seq); + } +#End THO changes + + +## determine whether the seq-ids are unique given the chosen value for $trunc +my ($message, $cont, $check) = &checkIds(); +if ($cont == 1) { + ## the check for unique identifiers has failed and the programm is exiting + print LOG "$message\n"; + exit; +} +else { + print LOG "All sequence identifier are unique!\n"; + if ($check == 2) { + my $newname = "$infile.original"; + rename($infile, $newname); + print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n"; + } + for (my $j = 0; $j < @seq_object; $j++) { + my $finalid = $seq_object[$j]->{finalid}; + my $estseq = $seq_object[$j]->seq; + my $inid = $seq_object[$j]->display_id; + my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]); + for (my $i = 0; $i < @all_trans; $i++) { + my $count = $i+1; + my $id = $all_trans[$i]->display_id; + my $seq = $all_trans[$i]->seq; + $id =~ s/$inid/$finalid/; + $id =~ s/-[0-9][RF]/_RF$count.0/; + push @out, ">$id\n$seq"; + } + push @estout, ">$finalid\n$estseq"; + if ($j%100 == 0) { + print "$j Sequences processed\n"; + open (OUT, ">>$outfile") or die "failed to open outfile\n"; + print OUT join "\n", @out; + print OUT "\n"; + @out = qw(); + close OUT; + if ($check == 2) { + ## part of the description was added to the seq-id + open (OUT, ">>$infile"); + print OUT join "\n", @estout; + print OUT "\n"; + @estout = qw(); + } + } + } + open (OUT, ">>$outfile") or die "failed to open outfile\n"; + print OUT join "\n", @out; + print OUT "\n"; + @out = qw(); + close OUT; + if ($check == 2) { + ## part of the description was added to the seq-id + open (OUT, ">>$infile"); + print OUT join "\n", @estout; + print OUT "\n"; + close OUT; + @estout = qw(); + } +} +close LOG; +exit; +########################## start sub ################ +sub checkIds { + my $message; + my $check = 1; + my $cont = 1; + my $counter; + ## Everything up to the first whitespace + ## in the fasta header will be taken as sequence id by bioperl. If this + ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer + ## be unique. This will be checked and if necessary the id will not be truncated + ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1) + ## for $check == 1, the complete id will be checked + ## for $check == 2, the first 20 characters of the concatenated id and description + ## will be checked + if ($trunc == 1) { + $check = 0; + } + + while ($check < 3 and $cont == 1) { + $cont = 0; + for (my $i=0; $i < @seq_object; $i++) { + my $id = $seq_object[$i]->display_id; + $id =~ s/(.{0,$limit}).*/$1/; + if ($check == 0) { + $id =~ s/|.*//; + } + elsif ($check == 2) { +print "Check 2: ".$seq_object[$i]->display_id." \n"; + $id = $id . '_' . $seq_object[$i]->desc; + $id =~ s/(.{0,$limit}).*/$1/; + } + if (defined $counter->{$id}) { + if ($check == 0) { + $message = "trying next without truncating the id"; + } + elsif ($check == 1) { + $message = 'trying next to include sequence description'; + } + else { + $message = "Sequence identifier are not unique, using the first 20 characters. Aborting..."; + } + print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; + print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n"; + $check ++; + $cont = 1; + $counter = undef; + last; + } + else { + $counter->{$id} = 1; + $seq_object[$i]->{finalid} = $id; + } + } + } + ## return the value of $cont. If this is 1, then the sequence id check has failed. + return($message, $cont, $check); +}