osiris_phylogenetics: orthologs/ucsb_hamster/hamstrsearch

comparison orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories

author	osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date	Tue, 11 Mar 2014 12:19:13 -0700
parents
children

comparison

equal deleted inserted replaced

--1:000000000000
+:5b9a38ec4a39
+#!/usr/bin/perl -w
+use strict;
+use FindBin;
+use lib "$FindBin::Bin/lib";
+use Getopt::Long;
+use Bio::SearchIO;
+use Bio::Search::Hit::BlastHit;
+use run_genewise;
+# PROGRAMNAME: hamstrsearch_local.pl
+# Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published
+# by the Free Software Foundation; either version 3 of the License
+# or any later version.
+# This program is distributed in the hope that it will be useful
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+# You should have received a copy of the GNU General Public License
+# along with this program; If not, see http://www.gnu.org/licenses
+# PROGRAM DESCRIPTION: This is the relevant version!
+# DATE: Wed Dec 19 10:41:09 CEST 2007
+######################## start main #############################
+my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. Oakley and Roger Ngo, UCSB\n";
+print "$version\n";
+### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT
+## note: all the variables can also be set via the command line
+my $pid = $$;
+my $prog = 'hmmsearch'; #program for the hmm search
+my $alignmentprog = 'clustalw';
+#PATH SETTINGS
+my $hmmpath = '.';
+my $blastpath = '.'; #Uses Galaxy working directory
+my $tmpdir = 'tmp_' . $pid;
+my $eval = 1; # eval cutoff for the hmm search
+my $logfile = "hamstrsearch_" . $pid . '.log';
+my $hmm_dir = 'hmm_dir';
+my $fa_dir  = '';
+##############################
+my $help;
+my $seq2store_file='';
+my $cds2store_file='';
+my $hmm;
+my @hmms;
+my $fa;
+my $fafile;
+my @seqs2store;
+my @cds2store;
+my $ep2eg;
+my $estfile;
+my $aln;
+my $idfile;
+my $taxon_check = 0;
+my $hmmset;
+my $hmmsearch_dir;
+my $dbfile = ''; # the file hmmsearch is run against
+my $dbfile_short;
+my $taxon_file;
+my $refspec_string;
+my @refspec = qw();
+my @primer_taxa;
+my $refspec_name = '';
+my $taxon_global;
+my $fileobj;
+my $fa_dir_neu = '';
+my $gwrefprot;
+my $seqtype;
+my $align;
+my $rep;
+my $estflag;
+my $proteinflag;
+my $refseq;
+my $refspec_final = '';
+my $concat;
+my $seqs2store_file;
+#####determine the hostname#######
+my $hostname = `hostname`;
+chomp $hostname;
+print "hostname is $hostname\n";
+#################################
+if (@ARGV==0) {
+	$help = 1;
+}
+## help message
+my $helpmessage = "
+This program is freely distributed under a GPL. See -version for more info
+Copyright (c) GRL limited: portions of the code are from separate copyrights
+\nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<>  -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h]
+OPTIONS:
+-sequence_file:
+path and name of the file containing the sequences hmmer is run against
+	Per default, this file should be in the data directory.
+-est:
+set this flag if you are searching in ESTs
+-protein:
+set this flag if you are searching in protein sequences
+-hmmset:
+specifies the name of the core-ortholog set.
+The program will look for the files in the default directory 'core-orthologs' unless you specify
+	a different one.
+-taxon:
+You need to specify a default taxon name from which your ESTs or protein sequences are derived.
+-refspec:
+sets the reference species. Note, it has to be a species that contributed sequences
+to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference
+	taxa you can have a look at the speclist.txt file in the default core-ortholog sets
+	that come with this distribution. Please use the 5 letter abreviations. If you choose
+	to use core-orthologs were not every taxon is represented in all core-orthologs, you
+	can provide a comma-separated list with the preferred refspec first. The lower-ranking
+reference species will only be used if a certain gene is not present in the preferred
+refspecies due to alternative paths in the transitive closure to define
+	the core-orthologs.
+CURRENTLY NO CHECK IS IMPLEMENTED!
+NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED!
+-eval_limit=<>
+This options allows to set the e-value cut-off for the HMM search.
+DEFAULT: 1
+-hmm:
+option to provide only a single hmm to be used for the search.
+Note, this file has to end with .hmm
+### the following options should only be used when you chose to alter the default structure of the
+### hamstrsearch_local directories. Currently, this has not been extensively tested.
+-fasta_file:
+path and name of the file containing the core-ortholog sequences
+	you don't have to use this option when you
+-hmmpath:
+sets the path to the hmm_dir. By default this is set to the current directory.
+-blastpath:
+sets the path where the blast-dbs are located. Per default this is ../blast_dir
+Note, the program expects the structure blastpath/refspec/refspec_prot.
+To overrule this structure you will have to edit the script.
+\n\n";
+GetOptions ("h"        => \$help,
+"hmm=s"    => \$hmm,
+"est"    => \$estflag,
+"protein"=> \$proteinflag,
+"sequence_file=s" => \$dbfile,
+"fasta_file=s" => \$fafile,
+"hmmset=s" => \$hmmset,
+"hmmpath=s" => \$hmmpath,
+"taxon_file=s" => \$taxon_file,
+"taxon=s"  => \$taxon_global,
+"eval_limit=s" => \$eval,
+"refspec=s" => \$refspec_string,
+"estfile=s" => \$estfile,
+"representative" => \$rep,
+"blastpath=s" => \$blastpath,
+	    "galaxyout=s" => \$seqs2store_file,
+	    "2galaxyout=s" => \$cds2store_file);
+if ($help) {
+print $helpmessage;
+exit;
+}
+## 1) check if all information is available to run HaMStR
+my ($check, @log) = &checkInput();
+if ($check == 0) {
+print join "\n", @log;
+print "$helpmessage";
+exit;
+}
+else {
+open (OUT, ">$logfile") or die "could not open logfile $logfile\n";
+print OUT join "\n", @log;
+close OUT;
+}
+### read in of the core-ortholog sequences
+my $co_seqs = parseSeqfile("$fafile");
+## 2) loop through the hmms
+## process each hmm file separately
+for (my $i = 0; $i < @hmms; $i++) {
+$fileobj = undef;
+my @seqs = qw();
+my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster
+my @newcds = qw();
+my $hmm = $hmms[$i];
+my $hmmout = $hmm;
+$hmmout =~ s/\.hmm/\.out/;
+## 3) run the hmm search
+if (!(-e "$hmmsearch_dir/$hmmout")) {
+print "now running $prog using $hmm\n";
+!`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n";
+}
+else {
+print "an hmmresult $hmmout already exists. Using this one!\n";
+print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n";
+}
+## 4) process the hmm search result
+my $hitcount = 0;
+## 4a) loop through the individual results
+## now the modified version for hmmer3 comes
+my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir);
+if (! @results) {
+print "no hit found for $query_name\n";
+next;
+}
+chomp $query_name;
+print "Results for $query_name\n";
+for (my $k = 0; $k < @results; $k++) {
+my $hitname = $results[$k];
+print "$hitname\n";
+my $keep = 0;
+my $hitseq = '';
+$refseq = '';
+## 4b) test for the reciprocity criterion fulfilled
+($keep, $hitname, $hitseq, $refspec_final, $refseq)  = &check4reciprocity($query_name, $hitname, @refspec);
+if ($keep == 1) {
+## blast search with the hmm hit identifies the core-ortholog sequence of the reference species
+## check for the taxon from the taxon_file.txt.
+my $taxon = '';
+if ($taxon_check){
+	if ($taxon_check == 1) {
+	  $taxon = &getTaxon($hitname);
+	}
+	elsif ($taxon_check == 2) {
+	  $taxon = $taxon_global;
+	}
+}
+## put the info about the hits into an object for later post-processing
+$fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq;
+$fileobj->{$taxon}->{ids}->[$hitcount] = $hitname;
+$fileobj->{$taxon}->{refseq} = $refseq;
+$hitcount++;
+}
+else {
+print "match to different protein from $refspec_final\n";
+}
+}
+## 5) do the rest only if at least one hit was obtained
+if (defined $fileobj) {
+## 5a) if the hits are derived from ESTs, get the best ORF
+if ($estflag) {
+$fileobj =  &predictORF();
+}
+## 5b) if the user has chosen to postprocess the results
+if ($rep) {
+&processHits($refseq, $concat);
+}
+## 6) prepare the output
+my @taxa = keys(%$fileobj);
+for (my $i = 0; $i< @taxa; $i++) {
+if ($rep) {
+#	push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}";
+#Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs
+	push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}";
+	push @newseqs, $fileobj->{$taxa[$i]}->{refprot};
+	if ($estflag) {
+#	  push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}";
+#Rearrange Order for Galaxy - want species first for phylotable format
+	  push @newcds, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}";
+	  push @newcds, $fileobj->{$taxa[$i]}->{refcds};
+	}
+}
+else {
+	my $idobj = $fileobj->{$taxa[$i]}->{ids};
+	my $protobj = $fileobj->{$taxa[$i]}->{prot};
+	my $cdsobj  = $fileobj->{$taxa[$i]}->{cds};
+	for (my $j = 0; $j < @$idobj; $j++) {
+#	  push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]";
+#Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe
+	  push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]";
+	  push @newseqs, $protobj->[$j];
+	  if ($estflag) {
+#	    push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]";
+#Rearrange Order for Galaxy - want species first for phylotable format
+	    push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]";
+	    push @newcds, $cdsobj->[$j];
+	  }
+	}
+}
+my $refs = $co_seqs->{$query_name};
+for (keys %$refs) {
+	my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq};
+	push @seqs, $line;
+}
+chomp @seqs;
+print "\n";
+@seqs = (@seqs, @newseqs);
+open (OUT, ">$fa_dir_neu/$query_name.fa");
+print OUT join "\n", @seqs;
+print OUT "\n";
+close OUT;
+for (my $i = 0; $i < @newseqs; $i+= 2) {
+#	my $line = $newseqs[$i] . "|" . $newseqs[$i+1];
+#Galaxy uses tabs not pipes
+	my $line = $newseqs[$i] . "\t" . $newseqs[$i+1];
+	$line =~ s/>//;
+	push @seqs2store, $line;
+	if ($estflag) {
+#Galaxy uses tabs not pipes
+#	  my $cdsline = $newcds[$i] . "|" . $newcds[$i+1];
+	  my $cdsline = $newcds[$i] . "\t" . $newcds[$i+1];
+	  $cdsline =~ s/>//;
+	  push @cds2store, $cdsline;
+	}
+}
+}
+}
+}
+if (@seqs2store > 0) {
+open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n";
+print OUT join "\n", @seqs2store;
+print OUT "\n";
+close OUT;
+if ($estflag) {
+open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n";
+print OUT join "\n", @cds2store;
+print OUT "\n";
+close OUT;
+}
+}
+else {
+open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n";
+print OUT "no hits found\n";
+}
+exit;
+##################### start sub ###############
+####### checkInput performs a number of checks whether sufficient information
+### and all data are available to run HaMStR
+sub checkInput {
+my @log;
+my $check = 1;
+$dbfile_short = $dbfile;
+$dbfile_short =~ s/\..*//;
+## 1) check for filetype
+print "Checking for filetype:\t";
+if (!defined $estflag and !defined $proteinflag) {
+push @log, "please determine the sequence type. Choose between the options -EST or -protein";
+print "failed\n";
+$check = 0;
+}
+else {
+if ($estflag) {
+$estfile = $dbfile;
+$dbfile = "$dbfile.tc";
+push @log, "HaMStR will run on the ESTs in $estfile";
+push @log, "Translating ESTs";
+if (!(-e "$dbfile")) {
+	print "translating $estfile, this may take a while\n";
+	`translate.pl -in=$estfile -out=$dbfile`;
+	open (LOG, "$logfile") or die "could not open logfile $logfile\n";
+	my @info = <LOG>;
+	@log = (@log, @info);
+	close LOG;
+}
+else {
+	push @log, "Translated file already exists, using this one\n";
+}
+if (! -e "$dbfile") {
+	push @log, "The translation of $estfile failed. Check the script translate.pl";
+	print "failed\n";
+	$check = 0;
+}
+}
+else {
+## file type is protein
+print "succeeded\n";
+}
+}
+## 2) Check for presence of blastall
+print "Checking for the blast program\t";
+if (!(`blastall`)) {
+push @log, "could not execute blastall. Please check if this program is installed and executable";
+print "failed\n";
+$check = 0;
+}
+else {
+push @log, "check for blastall succeeded";
+print "succeeded\n";
+}
+## 3) Check for presence of hmmsearch
+print "Checking for hmmsearch\t";
+if (! `$prog -h`) {
+push @log, "could not execute $prog. Please check if this program is installed and executable";
+print "failed\n";
+$check = 0;
+}
+else {
+push @log, "check for $prog succeeded\n";
+print "succeeded\n";
+}
+## 4) Check for reference taxon
+print "Checking for reference species and blast-dbs\t";
+if (!(defined $refspec_string)) {
+push @log, "Please provide a reference species for the reblast!";
+print "failed\n";
+$check = 0;
+}
+else {
+push @log, "Reference species for the re-blast: $refspec_string";
+@refspec = split /,/, $refspec_string;
+$refspec_name = $refspec[0];
+print "succeeded\n";
+}
+## 5) Check for presence of the required blast dbs
+print "checking for blast-dbs:\t";
+for (my $i = 0; $i < @refspec; $i++) {
+my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot";
+if (! (-e "$blastpathtmp.pin")) {
+push @log, "please edit the blastpath. Could not find $blastpathtmp";
+print "$blastpathtmp failed\n";
+$check = 0;
+}
+else {
+push @log, "check for $blastpathtmp succeeded";
+print "succeeded\n";
+}
+}
+## 6) Check for presence of the directory structure
+print "checking for presence of the hmm files:\t";
+if (!(defined $hmmset)) {
+$hmmpath = '.';
+$hmmset = 'manual';
+}
+else {
+$hmmpath = "$hmmpath/$hmmset";
+$fafile = "$hmmpath/$hmmset" . '.fa';
+}
+$hmm_dir = "$hmmpath/$hmm_dir";
+$hmmsearch_dir = $dbfile_short . '_' . $hmmset;
+#CHANGED FOR GALAXY DIRECTORY
+# $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name;
+$fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . $refspec_name;
+## 7) check for the presence of the hmm-files and the fasta-file
+if (!(-e "$hmm_dir")) {
+push @log, "Could not find $hmm_dir";
+print "failed\n";
+$check = 0;
+}
+else {
+if (defined $hmm) {
+if (! -e "$hmm_dir/$hmm") {
+	push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm";
+	$check = 0;
+}
+else {
+	push @log, "$hmm has been found";
+	if ($hmm =~ /\.hmm$/) {
+	  @hmms = ($hmm);
+	}
+}
+}
+else {
+push @log, "running HaMStR with all hmms in $hmm_dir";
+@hmms = `ls $hmm_dir`;
+}
+chomp @hmms;
+print "succeeded\n";
+}
+## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster
+print "checking for presence of the core-ortholog file:\t";
+if (defined $fafile) {
+if (! -e "$fafile") {
+push @log, "Could not find the file $fafile";
+print "failed\n";
+$check = 0;
+}
+else {
+push @log, "check for $fafile succeeded";
+print "succeeded\n";
+}
+}
+else {
+push @log, "Please provide path and name of fasta file containing the core-ortholog sequences";
+$check = 0;
+print "failed\n";
+}
+## 9) Checks for the taxon_file
+print "testing whether the taxon has been determined:\t";
+if (!(defined $taxon_file) or (!(-e "$taxon_file"))) {
+if (defined $taxon_global) {
+push @log, "using default taxon $taxon_global for all sequences";
+print "succeeded\n";
+$taxon_check = 2;
+}
+else {
+push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon";
+print "failed\n";
+$check = 0;
+}
+}
+else {
+push @log, "using the file $taxon_file as taxon_file";
+print "succeeded\n";
+$taxon_check = 1;
+}
+## 10) Set the file where the matched seqs are found
+#CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option
+#  $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out';
+#  $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds.out';
+## 11) apply the evalue-cut-off to the hmmsearch program
+$prog = $prog . " -E $eval";
+push @log, "hmmsearch: $prog";
+## 12) setting up the directories where the output files will be put into.
+if ($check == 1) {
+if (!(-e "$hmmsearch_dir")) {
+`mkdir $hmmsearch_dir`;
+}
+if (!(-e "$fa_dir_neu")) {
+`mkdir $fa_dir_neu`;
+}
+if (!(-e "$tmpdir")) {
+`mkdir $tmpdir`;
+}
+}
+return ($check, @log);
+}
+#################
+## check4reciprocity is the second major part of the program. It checks
+## whether the protein sequence that has been identified by the hmmsearch
+## identifies in turn the protein from the reference taxon that was used to
+## build the hmm.
+sub check4reciprocity {
+my ($query_name, $hitname, @refspec) = @_;
+my $searchdb;
+## get the sequence from the db_file
+my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`;
+if (!defined $hitseq) {
+print "could not retrieve a sequence for $hitname. Skipping...\n";
+return(0, '', '', '');
+}
+else {
+## now get the sequence used for building the hmm
+my @original;
+my $refspec_final;
+for (my $i = 0; $i < @refspec; $i++) {
+@original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`;
+chomp @original;
+if (@original > 0) {
+	$refspec_final = $refspec[$i];
+	$searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot";
+	last;
+}
+else {
+	print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n";
+}
+}
+if (@original == 0) {
+print "original sequence not be found\n";
+return (0, '', '', $refspec_final);
+}
+print "REFSPEC is $refspec_final\n";
+## continue with the blast
+chomp $hitseq;
+#    $hitname =~ s/\|.*//;
+## now run the blast
+open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n";
+print OUT ">$hitname\n$hitseq";
+close OUT;
+!`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n";
+## now parse the best blast hit
+my @hits = &getBestBlasthit("$tmpdir/$$.blast");
+if (@hits > 0) {
+print "hmm-seq: ", join "\t", @original , "\n";
+## now loop through the best hits with the same evalue and check whether
+## among these I find the same seq as in $original
+for (my $i = 0; $i <@hits; $i++) {
+	print "blast-hit: $hits[$i]";
+	## now loop through all the refspec-sequences in the hmm file
+	for (my $j = 0; $j < @original; $j++) {
+	  if ($original[$j] eq $hits[$i]) {
+	    print "\tHit\n";
+	    my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`;
+	    return (1, $hitname, $hitseq, $refspec_final, $refseq);
+	  }
+	  else {
+	    print "\nnot hitting $original[$j]\n";
+	  }
+	}
+}
+### if we end up here, we didn't find a hit that matches to the original sequence
+### in the top hits with the same eval
+return (0, '', '', $refspec_final);
+}
+else {
+print "no hit obtained\n";
+return(0, '', '', $refspec_final);
+}
+}
+}
+#############
+sub getBestBlasthit {
+my @hits;
+my ($file) = @_;
+my $searchio = Bio::SearchIO->new(-file        => $file,
+				      -format      => 'blast',
+				      -report_type => 'blastp') or die "parse failed";
+while( my $result = $searchio->next_result ){
+	my $count = 0;
+	my $sig;
+	my $sig_old;
+	while( my $hit = $result->next_hit){
+	    ## now I enter all top hits having the same evalue into the result
+	    $sig = $hit->score;
+	    if (!defined $sig_old) {
+		$sig_old = $sig;
+	    }
+	    if ($sig == $sig_old) {
+		push @hits, $hit->accession;
+	    }
+	    else {
+		last;
+	    }
+	}
+}
+return(@hits);
+}
+##################
+sub getTaxon {
+my ($hitname) = @_;
+#    my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id";
+if ($hitname =~ /\D/) {
+	$hitname =~ s/_.*//;
+}
+my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`;
+chomp $taxon;
+$taxon =~ s/^[0-9]+,//;
+$taxon =~ s/\s*$//;
+$taxon =~ s/\s/_/g;
+if ($taxon) {
+	return ($taxon);
+}
+else {
+	return();
+}
+}
+###############
+sub processHits {
+my ($concat) = @_;
+## 1) align all hit sequences for a taxon against the reference species
+my @taxa = keys(%$fileobj);
+for (my $i = 0; $i < @taxa; $i++) {
+&orfRanking($taxa[$i]);
+}
+}
+################
+sub predictORF {
+my $fileobj_new;
+#  my ($refseq) = @_;
+my @taxa = keys(%$fileobj);
+for (my $i = 0; $i < @taxa; $i++) {
+my $protobj = $fileobj->{$taxa[$i]}->{prot};
+my $idobj = $fileobj->{$taxa[$i]}->{ids};
+my $refseq = $fileobj->{$taxa[$i]}->{refseq};
+my @ids = @$idobj;
+for (my $j = 0; $j < @ids; $j++) {
+## determine the reading frame
+my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/;
+	print "rf is $rf\n";
+$ids[$j] =~ s/_RF.*//;
+#      my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`;
+################new grep command from version 8 to fix bug
+my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`;
+if (! $est) {
+	die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n";
+}
+## the EST is translated in rev complement
+if ($rf > 3) {
+	$est = revComp($est);
+}
+my $gw = run_genewise->new($est, $refseq, "$tmpdir");
+my $translation = $gw->translation;
+my $cds = $gw->codons;
+$translation =~ s/[-!]//g;
+$fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j];
+$fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation;
+$fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds;
+$fileobj_new->{$taxa[$i]}->{refseq} = $refseq;
+}
+}
+return($fileobj_new);
+}
+############################
+sub orfRanking {
+my ($spec) = @_;
+my $result;
+my $refprot;
+my $refcds;
+my @toalign;
+my $protobj = $fileobj->{$spec}->{prot};
+my $idobj = $fileobj->{$spec}->{ids};
+my $refcluster; ## variables to take the cluster and its id for later analysis
+my $refid;
+if (@$protobj == 1) {
+## nothing to chose from
+$refprot = $protobj->[0];
+$refcds = $fileobj->{$spec}->{cds}->[0];
+my $length = length($refprot);
+$refid = $idobj->[0] . "-" . $length;
+}
+else {
+## more than one cluster
+push @toalign, ">$refspec_final";
+push @toalign, $fileobj->{$spec}->{refseq};
+## now walk through all the contigs
+for (my $i = 0; $i < @$protobj; $i++) {
+my @testseq = (">$idobj->[$i]", $protobj->[$i]);
+@testseq = (@testseq, @toalign);
+open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n";
+print OUT join "\n", @testseq;
+close OUT;
+## run clustalw
+!(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n";
+## get the alignment score
+$result->[$i]->{score} =  `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`;
+if (!$result->[$i]->{score}) {
+	die "error in determining alignment score\n";
+}
+chomp $result->[$i]->{score};
+## get the aligned sequence
+open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n";
+my @aln = <ALN>;
+close ALN;
+my $aseq = extractSeq($idobj->[$i], @aln);
+## remove the terminal gaps
+$aseq =~ s/-*$//;
+$result->[$i]->{aend} = length $aseq;
+my ($head) = $aseq =~ /^(-*).*/;
+($result->[$i]->{astart}) = length($head)+1;
+}
+### the results for all seqs has been gathered, now order them
+$result = sortRef($result);
+($refprot, $refcds, $refid) = &determineRef($result,$spec);
+}
+$fileobj->{$spec}->{refprot} = $refprot;
+$fileobj->{$spec}->{refcds}  = $refcds;
+$fileobj->{$spec}->{refid}   = $refid;
+return();
+}
+###########################
+sub sortRef {
+my $result = shift;
+my @sort;
+for (my $i = 0; $i < @$result; $i++) {
+	push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}";
+}
+open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n";
+print OUT join "\n", @sort;
+close OUT;
+`sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`;
+@sort = `less $tmpdir/$pid.sort`;
+chomp @sort;
+$result = undef;
+for (my $i = 0; $i < @sort; $i++) {
+	($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i];
+}
+return($result);
+}
+########################
+sub determineRef {
+my ($result, $spec) = @_;
+my $lastend = 0;
+my $lastscore = 0;
+my $final;
+my $count = 0;
+my $id = '';
+for (my $i = 0; $i < @$result; $i++) {
+if ($result->[$i]->{start} < $lastend or $lastend == 0) {
+if ($result->[$i]->{score} > $lastscore) {
+	$lastend = $result->[$i]->{end};
+	$lastscore = $result->[$i]->{score};
+	$id = $result->[$i]->{id};
+}
+}
+elsif ($result->[$i]->{start} > $lastend) {
+## a new part of the alignment is covered. Fix the results obtained so far
+$final->[$count]->{id} = $id;
+$lastend = $result->[$i]->{end};
+$id = $result->[$i]->{id};
+$count++;
+}
+}
+$final->[$count]->{id} = $id;
+## now concatenate the results
+my $refprot = '';
+my $refid = '';
+my $refcds = '';
+for (my $i = 0; $i < @$final; $i++) {
+my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}];
+my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}];
+my $length = length($seq);
+$refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP";
+$refprot .= $seq;
+if ($estflag) {
+$refcds .= $cdsseq;
+}
+}
+$refid =~ s/PP$//;
+return($refprot, $refcds, $refid);
+}
+#############################
+sub extractSeq {
+my ($id, @aln) = @_;
+my $seq = '';
+my $start = 0;
+for (my $i = 0; $i < @aln; $i++) {
+if ($aln[$i] =~ $id) {
+$start = 1;
+}
+elsif ($aln[$i] =~ />/ and $start == 1) {
+last;
+}
+elsif ($start == 1) {
+$seq .= $aln[$i];
+}
+}
+$seq =~ s/\s//g;
+return ($seq);
+}
+##############################
+sub revComp {
+my ($seq) = @_;
+$seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/;
+$seq = reverse($seq);
+return($seq);
+}
+##############################
+sub parseHmmer3 {
+my ($file, $path) = @_;
+if (!defined $path) {
+$path = '.';
+}
+open (IN, "$path/$file") or die "failed to open $file\n";
+my @data = <IN>;
+close IN;
+### extract the hits
+my @hit;
+my $start = 0;
+my $stop = 0;
+my $i = 0;
+for (my $i = 0; $i < @data; $i++) {
+if (!($data[$i] =~ /\S/)) {
+next;
+}
+else {
+if ($data[$i] =~ /Scores for complete sequence/) {
+	$start = 1;
+	$i += 4;
+}
+elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) {
+	last;
+}
+if ($start == 1 and $stop == 0) {
+	$data[$i] =~ s/^\s+//;
+	my @list = split /\s+/, $data[$i];
+	push @hit, $list[8];
+	$start = 0; #Added by THO
+}
+}
+}
+### get the query_id
+my ($query) = grep /^Query:/, @data;
+$query =~ s/^Query:\s+//;
+$query =~ s/\s.*//;
+if (defined $hit[0]) {
+chomp @hit;
+return ($query, @hit);
+}
+else {
+return ($query);
+}
+}
+#####################
+sub parseSeqfile {
+my $seqref;
+my $id;
+my $spec;
+my $seqid;
+my $seq;
+my $file = shift;
+open (IN, "$file") or die "failed to open $file\n";
+my @seqs = <IN>;
+close IN;
+chomp @seqs;
+for (my $i = 0; $i < @seqs; $i++) {
+if ($seqs[$i] =~ />/) {
+	$seqs[$i] =~ s/>//;
+if (defined $id and defined $seq) {
+	$seqref->{$id}->{$spec}->{seqid} = $seqid;
+	$seqref->{$id}->{$spec}->{seq} = $seq;
+	$seq = undef;
+}
+($id, $spec, $seqid) = split (/\|/, $seqs[$i]);
+}
+else {
+$seq .= $seqs[$i];
+}
+}
+if (defined  $id and defined $seq) {
+	$seqref->{$id}->{$spec}->{seqid} = $seqid;
+	$seqref->{$id}->{$spec}->{seq} = $seq;
+	$seq = undef;
+}
+return ($seqref);
+}

Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics

comparison orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip