diff orthologs/ucsb_hamster/translate.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/orthologs/ucsb_hamster/translate.pl	Tue Mar 11 12:19:13 2014 -0700
@@ -0,0 +1,193 @@
+#!/usr/bin/perl -w 
+use strict;
+use FindBin;
+use lib "$FindBin::Bin/lib";
+use Getopt::Long;
+use Bio::Perl;
+use Bio::SeqIO;
+use File::Copy;
+
+# PROGRAMNAME: translate.pl
+
+# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at
+
+# PROGRAM DESCRIPTION:
+
+# DATE: Tue May 12 14:03:34 CEST 2009
+
+
+# DATE LAST MODIFIED:
+######################## start main #############################
+my $help;
+my @out;
+my @estout;
+my $infile;
+my $trunc = 1;
+my $outfile = "translate_tc.out";
+my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are
+## too long, then one can run into troubles with the parsing of the hmmsearch results.
+open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n";
+print LOG "##### Translation of the ESTs########";
+######### 
+my $usage = "Name:\n\ttranslate.pl\n
+Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n
+Description:\n\tThis program takes a batch fasta-file with DNA
+\tsequences as an input and translates the individual DNA sequences in
+\tall six reading frames.
+\t-infile: provide the relative or absolute path of the infile\n
+\t-outfile: provide the realtive or absolute path of the
+\toutfile. Default is: translate_tc.out\n
+\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below).
+\t-h: prints this help-message\n
+NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n
+NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included.";
+##########
+
+GetOptions (
+    "h" => \$help,
+    "infile=s" => \$infile,
+    "outfile=s" => \$outfile,
+    "trunc=s" => \$trunc);
+if ($help) {
+	print "$usage";
+	exit;
+}
+if (-e "$outfile") {
+    print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n";
+    my $newname = "$outfile.old";
+    rename($outfile, $newname);
+}
+##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT 
+##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO
+#below replaces read_all_sequences function to declare that sequence is DNA
+#original line next
+#my @seq_object = read_all_sequences($infile, 'fasta');
+
+   my $tempseqio;
+   $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta');
+   my @seq_object;
+
+   while( my $seq = $tempseqio->next_seq() ) {
+	$seq->alphabet('dna');
+	push(@seq_object,$seq);
+   }
+#End THO changes
+
+
+## determine whether the seq-ids are unique given the chosen value for $trunc
+my ($message, $cont, $check) = &checkIds();
+if ($cont == 1) {
+    ## the check for unique identifiers has failed and the programm is exiting
+    print LOG "$message\n";
+	exit;
+}
+else {
+    print LOG "All sequence identifier are unique!\n";
+    if ($check == 2) {
+	my $newname = "$infile.original";
+	rename($infile, $newname);
+	print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n";
+    }
+    for (my $j = 0; $j < @seq_object; $j++) { 
+	my $finalid = $seq_object[$j]->{finalid};
+	my $estseq = $seq_object[$j]->seq;
+	my $inid = $seq_object[$j]->display_id;
+	my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]);
+	for (my $i = 0; $i < @all_trans; $i++) {
+	    my $count = $i+1;
+	    my $id = $all_trans[$i]->display_id;
+	    my $seq = $all_trans[$i]->seq;
+	    $id =~ s/$inid/$finalid/;
+	    $id =~ s/-[0-9][RF]/_RF$count.0/;
+	    push @out, ">$id\n$seq";
+	}
+	push @estout, ">$finalid\n$estseq";
+	if ($j%100 == 0) {
+	    print "$j Sequences processed\n";
+	    open (OUT, ">>$outfile") or die "failed to open outfile\n";
+	    print OUT join "\n", @out;
+	    print OUT "\n";
+	    @out = qw();
+	    close OUT;
+	    if ($check == 2) {
+		## part of the description was added to the seq-id
+		open (OUT, ">>$infile");
+		print OUT join "\n", @estout;
+		print OUT "\n";
+		@estout = qw();
+	    }
+	}
+    }
+    open (OUT, ">>$outfile") or die "failed to open outfile\n";
+    print OUT join "\n", @out;
+    print OUT "\n";
+    @out = qw();
+    close OUT;
+    if ($check == 2) {
+	## part of the description was added to the seq-id
+	open (OUT, ">>$infile");
+	print OUT join "\n", @estout;
+	print OUT "\n";
+	close OUT;
+	@estout = qw();
+    }
+}
+close LOG;
+exit;
+########################## start sub ################
+sub checkIds {
+    my $message;
+    my $check = 1;
+    my $cont = 1;
+    my $counter;
+    ## Everything up to the first whitespace
+    ## in the fasta header will be taken as sequence id by bioperl. If this
+    ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer
+    ## be unique. This will be checked and if necessary the id will not be truncated
+    ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1)
+    ## for $check == 1, the complete id will be checked
+    ## for $check == 2, the first 20 characters of the concatenated id and description
+    ## will be checked
+    if ($trunc == 1) {
+	$check = 0;
+    }
+    
+    while ($check < 3 and $cont == 1) {
+	$cont = 0;
+	for (my $i=0; $i < @seq_object; $i++) {
+	    my $id = $seq_object[$i]->display_id;
+	    $id =~ s/(.{0,$limit}).*/$1/;
+	    if ($check == 0) {
+		$id =~ s/|.*//;
+	    }
+	    elsif ($check == 2) {
+print "Check 2: ".$seq_object[$i]->display_id." \n";
+		$id = $id . '_' . $seq_object[$i]->desc;
+		$id =~ s/(.{0,$limit}).*/$1/;
+	    }
+	    if (defined $counter->{$id}) {
+		if ($check == 0) {
+		    $message = "trying next without truncating the id";
+		}
+		elsif ($check == 1) {
+		    $message = 'trying next to include sequence description';
+		}
+		else {
+		    $message = "Sequence identifier are not unique, using the first 20 characters. Aborting...";
+		}
+		print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
+		print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
+		$check ++;
+		$cont = 1;
+		$counter = undef;
+		last;
+	    }
+	    else {
+		$counter->{$id} = 1;
+		$seq_object[$i]->{finalid} = $id;
+	    }
+	}
+    }
+    ## return the value of $cont. If this is 1, then the sequence id check has failed. 
+    return($message, $cont, $check);
+}