view orthologs/ucsb_hamster/translate.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
line wrap: on
line source

#!/usr/bin/perl -w 
use strict;
use FindBin;
use lib "$FindBin::Bin/lib";
use Getopt::Long;
use Bio::Perl;
use Bio::SeqIO;
use File::Copy;

# PROGRAMNAME: translate.pl

# AUTHOR: INGO EBERSBERGER, ingo.ebersberger@univie.ac.at

# PROGRAM DESCRIPTION:

# DATE: Tue May 12 14:03:34 CEST 2009


# DATE LAST MODIFIED:
######################## start main #############################
my $help;
my @out;
my @estout;
my $infile;
my $trunc = 1;
my $outfile = "translate_tc.out";
my $limit = 20; ## this sets the maximum length for the sequence identifier. If sequence identifier are
## too long, then one can run into troubles with the parsing of the hmmsearch results.
open (LOG, ">>hamstrsearch.log") or warn "could not open logfile for writing\n";
print LOG "##### Translation of the ESTs########";
######### 
my $usage = "Name:\n\ttranslate.pl\n
Synopsis:\n\ttranslate_tc5.pl [-infile=FILE] [options] [-outfile=FILE]\n
Description:\n\tThis program takes a batch fasta-file with DNA
\tsequences as an input and translates the individual DNA sequences in
\tall six reading frames.
\t-infile: provide the relative or absolute path of the infile\n
\t-outfile: provide the realtive or absolute path of the
\toutfile. Default is: translate_tc.out\n
\ttrunc: set -trunc=0 to prevent truncation of the sequence header (see below).
\t-h: prints this help-message\n
NOTE: if the seq-id (everything up to the first [[:space:]]) contains a '|' everything between the '>' and the '|' will be taken as seq-id. Otherwise, the entire seq-id will be used. You can change this behavior by setting -trunc=0\n
NOTE: the script as an automated routine to check for unique sequence names in the input file. This may lead to cases where the $trunc value is overruled and additionally part of the sequence description may be included.";
##########

GetOptions (
    "h" => \$help,
    "infile=s" => \$infile,
    "outfile=s" => \$outfile,
    "trunc=s" => \$trunc);
if ($help) {
	print "$usage";
	exit;
}
if (-e "$outfile") {
    print LOG "an outfile $outfile already exists. Renaming to $outfile.old\n\n";
    my $newname = "$outfile.old";
    rename($outfile, $newname);
}
##BUG -- BIOPERL GUESSES PROTEIN FILE FORMAT WHEN AMBIGUITY CODES ARE PRESENT 
##CAUSING AN ERROR IN THE TRANLATE_6 FRAMES, WHICH INTERRUPTS ALL TRANSLATION -- THO
#below replaces read_all_sequences function to declare that sequence is DNA
#original line next
#my @seq_object = read_all_sequences($infile, 'fasta');

   my $tempseqio;
   $tempseqio = Bio::SeqIO->new( '-file' => $infile, '-format' => 'fasta');
   my @seq_object;

   while( my $seq = $tempseqio->next_seq() ) {
	$seq->alphabet('dna');
	push(@seq_object,$seq);
   }
#End THO changes


## determine whether the seq-ids are unique given the chosen value for $trunc
my ($message, $cont, $check) = &checkIds();
if ($cont == 1) {
    ## the check for unique identifiers has failed and the programm is exiting
    print LOG "$message\n";
	exit;
}
else {
    print LOG "All sequence identifier are unique!\n";
    if ($check == 2) {
	my $newname = "$infile.original";
	rename($infile, $newname);
	print LOG "Sequence description was needed to make seq-id unique. The original version of the infile was stored in $infile.original\n";
    }
    for (my $j = 0; $j < @seq_object; $j++) { 
	my $finalid = $seq_object[$j]->{finalid};
	my $estseq = $seq_object[$j]->seq;
	my $inid = $seq_object[$j]->display_id;
	my @all_trans = Bio::SeqUtils->translate_6frames($seq_object[$j]);
	for (my $i = 0; $i < @all_trans; $i++) {
	    my $count = $i+1;
	    my $id = $all_trans[$i]->display_id;
	    my $seq = $all_trans[$i]->seq;
	    $id =~ s/$inid/$finalid/;
	    $id =~ s/-[0-9][RF]/_RF$count.0/;
	    push @out, ">$id\n$seq";
	}
	push @estout, ">$finalid\n$estseq";
	if ($j%100 == 0) {
	    print "$j Sequences processed\n";
	    open (OUT, ">>$outfile") or die "failed to open outfile\n";
	    print OUT join "\n", @out;
	    print OUT "\n";
	    @out = qw();
	    close OUT;
	    if ($check == 2) {
		## part of the description was added to the seq-id
		open (OUT, ">>$infile");
		print OUT join "\n", @estout;
		print OUT "\n";
		@estout = qw();
	    }
	}
    }
    open (OUT, ">>$outfile") or die "failed to open outfile\n";
    print OUT join "\n", @out;
    print OUT "\n";
    @out = qw();
    close OUT;
    if ($check == 2) {
	## part of the description was added to the seq-id
	open (OUT, ">>$infile");
	print OUT join "\n", @estout;
	print OUT "\n";
	close OUT;
	@estout = qw();
    }
}
close LOG;
exit;
########################## start sub ################
sub checkIds {
    my $message;
    my $check = 1;
    my $cont = 1;
    my $counter;
    ## Everything up to the first whitespace
    ## in the fasta header will be taken as sequence id by bioperl. If this
    ## id contains a '|' and $trunc is set to 1 (default), the ids may no longer
    ## be unique. This will be checked and if necessary the id will not be truncated
    ## for $check == 0, the truncated version of the id will be checked (only if $trunc == 1)
    ## for $check == 1, the complete id will be checked
    ## for $check == 2, the first 20 characters of the concatenated id and description
    ## will be checked
    if ($trunc == 1) {
	$check = 0;
    }
    
    while ($check < 3 and $cont == 1) {
	$cont = 0;
	for (my $i=0; $i < @seq_object; $i++) {
	    my $id = $seq_object[$i]->display_id;
	    $id =~ s/(.{0,$limit}).*/$1/;
	    if ($check == 0) {
		$id =~ s/|.*//;
	    }
	    elsif ($check == 2) {
print "Check 2: ".$seq_object[$i]->display_id." \n";
		$id = $id . '_' . $seq_object[$i]->desc;
		$id =~ s/(.{0,$limit}).*/$1/;
	    }
	    if (defined $counter->{$id}) {
		if ($check == 0) {
		    $message = "trying next without truncating the id";
		}
		elsif ($check == 1) {
		    $message = 'trying next to include sequence description';
		}
		else {
		    $message = "Sequence identifier are not unique, using the first 20 characters. Aborting...";
		}
		print LOG "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
		print "sequence ids are not unique in the file $infile, $message. The offending identfier is $id\n\n";
		$check ++;
		$cont = 1;
		$counter = undef;
		last;
	    }
	    else {
		$counter->{$id} = 1;
		$seq_object[$i]->{finalid} = $id;
	    }
	}
    }
    ## return the value of $cont. If this is 1, then the sequence id check has failed. 
    return($message, $cont, $check);
}