Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
view orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
line wrap: on
line source
#!/usr/bin/perl -w use strict; use FindBin; use lib "$FindBin::Bin/lib"; use Getopt::Long; use Bio::SearchIO; use Bio::Search::Hit::BlastHit; use run_genewise; # PROGRAMNAME: hamstrsearch_local.pl # Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at # This program is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published # by the Free Software Foundation; either version 3 of the License # or any later version. # This program is distributed in the hope that it will be useful # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; If not, see http://www.gnu.org/licenses # PROGRAM DESCRIPTION: This is the relevant version! # DATE: Wed Dec 19 10:41:09 CEST 2007 ######################## start main ############################# my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. Oakley and Roger Ngo, UCSB\n"; print "$version\n"; ### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT ## note: all the variables can also be set via the command line my $pid = $$; my $prog = 'hmmsearch'; #program for the hmm search my $alignmentprog = 'clustalw'; #PATH SETTINGS my $hmmpath = '.'; my $blastpath = '.'; #Uses Galaxy working directory my $tmpdir = 'tmp_' . $pid; my $eval = 1; # eval cutoff for the hmm search my $logfile = "hamstrsearch_" . $pid . '.log'; my $hmm_dir = 'hmm_dir'; my $fa_dir = ''; ############################## my $help; my $seq2store_file=''; my $cds2store_file=''; my $hmm; my @hmms; my $fa; my $fafile; my @seqs2store; my @cds2store; my $ep2eg; my $estfile; my $aln; my $idfile; my $taxon_check = 0; my $hmmset; my $hmmsearch_dir; my $dbfile = ''; # the file hmmsearch is run against my $dbfile_short; my $taxon_file; my $refspec_string; my @refspec = qw(); my @primer_taxa; my $refspec_name = ''; my $taxon_global; my $fileobj; my $fa_dir_neu = ''; my $gwrefprot; my $seqtype; my $align; my $rep; my $estflag; my $proteinflag; my $refseq; my $refspec_final = ''; my $concat; my $seqs2store_file; #####determine the hostname####### my $hostname = `hostname`; chomp $hostname; print "hostname is $hostname\n"; ################################# if (@ARGV==0) { $help = 1; } ## help message my $helpmessage = " This program is freely distributed under a GPL. See -version for more info Copyright (c) GRL limited: portions of the code are from separate copyrights \nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h] OPTIONS: -sequence_file: path and name of the file containing the sequences hmmer is run against Per default, this file should be in the data directory. -est: set this flag if you are searching in ESTs -protein: set this flag if you are searching in protein sequences -hmmset: specifies the name of the core-ortholog set. The program will look for the files in the default directory 'core-orthologs' unless you specify a different one. -taxon: You need to specify a default taxon name from which your ESTs or protein sequences are derived. -refspec: sets the reference species. Note, it has to be a species that contributed sequences to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference taxa you can have a look at the speclist.txt file in the default core-ortholog sets that come with this distribution. Please use the 5 letter abreviations. If you choose to use core-orthologs were not every taxon is represented in all core-orthologs, you can provide a comma-separated list with the preferred refspec first. The lower-ranking reference species will only be used if a certain gene is not present in the preferred refspecies due to alternative paths in the transitive closure to define the core-orthologs. CURRENTLY NO CHECK IS IMPLEMENTED! NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED! -eval_limit=<> This options allows to set the e-value cut-off for the HMM search. DEFAULT: 1 -hmm: option to provide only a single hmm to be used for the search. Note, this file has to end with .hmm ### the following options should only be used when you chose to alter the default structure of the ### hamstrsearch_local directories. Currently, this has not been extensively tested. -fasta_file: path and name of the file containing the core-ortholog sequences you don't have to use this option when you -hmmpath: sets the path to the hmm_dir. By default this is set to the current directory. -blastpath: sets the path where the blast-dbs are located. Per default this is ../blast_dir Note, the program expects the structure blastpath/refspec/refspec_prot. To overrule this structure you will have to edit the script. \n\n"; GetOptions ("h" => \$help, "hmm=s" => \$hmm, "est" => \$estflag, "protein"=> \$proteinflag, "sequence_file=s" => \$dbfile, "fasta_file=s" => \$fafile, "hmmset=s" => \$hmmset, "hmmpath=s" => \$hmmpath, "taxon_file=s" => \$taxon_file, "taxon=s" => \$taxon_global, "eval_limit=s" => \$eval, "refspec=s" => \$refspec_string, "estfile=s" => \$estfile, "representative" => \$rep, "blastpath=s" => \$blastpath, "galaxyout=s" => \$seqs2store_file, "2galaxyout=s" => \$cds2store_file); if ($help) { print $helpmessage; exit; } ## 1) check if all information is available to run HaMStR my ($check, @log) = &checkInput(); if ($check == 0) { print join "\n", @log; print "$helpmessage"; exit; } else { open (OUT, ">$logfile") or die "could not open logfile $logfile\n"; print OUT join "\n", @log; close OUT; } ### read in of the core-ortholog sequences my $co_seqs = parseSeqfile("$fafile"); ## 2) loop through the hmms ## process each hmm file separately for (my $i = 0; $i < @hmms; $i++) { $fileobj = undef; my @seqs = qw(); my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster my @newcds = qw(); my $hmm = $hmms[$i]; my $hmmout = $hmm; $hmmout =~ s/\.hmm/\.out/; ## 3) run the hmm search if (!(-e "$hmmsearch_dir/$hmmout")) { print "now running $prog using $hmm\n"; !`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n"; } else { print "an hmmresult $hmmout already exists. Using this one!\n"; print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n"; } ## 4) process the hmm search result my $hitcount = 0; ## 4a) loop through the individual results ## now the modified version for hmmer3 comes my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir); if (! @results) { print "no hit found for $query_name\n"; next; } chomp $query_name; print "Results for $query_name\n"; for (my $k = 0; $k < @results; $k++) { my $hitname = $results[$k]; print "$hitname\n"; my $keep = 0; my $hitseq = ''; $refseq = ''; ## 4b) test for the reciprocity criterion fulfilled ($keep, $hitname, $hitseq, $refspec_final, $refseq) = &check4reciprocity($query_name, $hitname, @refspec); if ($keep == 1) { ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species ## check for the taxon from the taxon_file.txt. my $taxon = ''; if ($taxon_check){ if ($taxon_check == 1) { $taxon = &getTaxon($hitname); } elsif ($taxon_check == 2) { $taxon = $taxon_global; } } ## put the info about the hits into an object for later post-processing $fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; $fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; $fileobj->{$taxon}->{refseq} = $refseq; $hitcount++; } else { print "match to different protein from $refspec_final\n"; } } ## 5) do the rest only if at least one hit was obtained if (defined $fileobj) { ## 5a) if the hits are derived from ESTs, get the best ORF if ($estflag) { $fileobj = &predictORF(); } ## 5b) if the user has chosen to postprocess the results if ($rep) { &processHits($refseq, $concat); } ## 6) prepare the output my @taxa = keys(%$fileobj); for (my $i = 0; $i< @taxa; $i++) { if ($rep) { # push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; #Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; if ($estflag) { # push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; #Rearrange Order for Galaxy - want species first for phylotable format push @newcds, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; push @newcds, $fileobj->{$taxa[$i]}->{refcds}; } } else { my $idobj = $fileobj->{$taxa[$i]}->{ids}; my $protobj = $fileobj->{$taxa[$i]}->{prot}; my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; for (my $j = 0; $j < @$idobj; $j++) { # push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]"; #Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; push @newseqs, $protobj->[$j]; if ($estflag) { # push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]"; #Rearrange Order for Galaxy - want species first for phylotable format push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; push @newcds, $cdsobj->[$j]; } } } my $refs = $co_seqs->{$query_name}; for (keys %$refs) { my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; push @seqs, $line; } chomp @seqs; print "\n"; @seqs = (@seqs, @newseqs); open (OUT, ">$fa_dir_neu/$query_name.fa"); print OUT join "\n", @seqs; print OUT "\n"; close OUT; for (my $i = 0; $i < @newseqs; $i+= 2) { # my $line = $newseqs[$i] . "|" . $newseqs[$i+1]; #Galaxy uses tabs not pipes my $line = $newseqs[$i] . "\t" . $newseqs[$i+1]; $line =~ s/>//; push @seqs2store, $line; if ($estflag) { #Galaxy uses tabs not pipes # my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; my $cdsline = $newcds[$i] . "\t" . $newcds[$i+1]; $cdsline =~ s/>//; push @cds2store, $cdsline; } } } } } if (@seqs2store > 0) { open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; print OUT join "\n", @seqs2store; print OUT "\n"; close OUT; if ($estflag) { open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n"; print OUT join "\n", @cds2store; print OUT "\n"; close OUT; } } else { open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; print OUT "no hits found\n"; } exit; ##################### start sub ############### ####### checkInput performs a number of checks whether sufficient information ### and all data are available to run HaMStR sub checkInput { my @log; my $check = 1; $dbfile_short = $dbfile; $dbfile_short =~ s/\..*//; ## 1) check for filetype print "Checking for filetype:\t"; if (!defined $estflag and !defined $proteinflag) { push @log, "please determine the sequence type. Choose between the options -EST or -protein"; print "failed\n"; $check = 0; } else { if ($estflag) { $estfile = $dbfile; $dbfile = "$dbfile.tc"; push @log, "HaMStR will run on the ESTs in $estfile"; push @log, "Translating ESTs"; if (!(-e "$dbfile")) { print "translating $estfile, this may take a while\n"; `translate.pl -in=$estfile -out=$dbfile`; open (LOG, "$logfile") or die "could not open logfile $logfile\n"; my @info = <LOG>; @log = (@log, @info); close LOG; } else { push @log, "Translated file already exists, using this one\n"; } if (! -e "$dbfile") { push @log, "The translation of $estfile failed. Check the script translate.pl"; print "failed\n"; $check = 0; } } else { ## file type is protein print "succeeded\n"; } } ## 2) Check for presence of blastall print "Checking for the blast program\t"; if (!(`blastall`)) { push @log, "could not execute blastall. Please check if this program is installed and executable"; print "failed\n"; $check = 0; } else { push @log, "check for blastall succeeded"; print "succeeded\n"; } ## 3) Check for presence of hmmsearch print "Checking for hmmsearch\t"; if (! `$prog -h`) { push @log, "could not execute $prog. Please check if this program is installed and executable"; print "failed\n"; $check = 0; } else { push @log, "check for $prog succeeded\n"; print "succeeded\n"; } ## 4) Check for reference taxon print "Checking for reference species and blast-dbs\t"; if (!(defined $refspec_string)) { push @log, "Please provide a reference species for the reblast!"; print "failed\n"; $check = 0; } else { push @log, "Reference species for the re-blast: $refspec_string"; @refspec = split /,/, $refspec_string; $refspec_name = $refspec[0]; print "succeeded\n"; } ## 5) Check for presence of the required blast dbs print "checking for blast-dbs:\t"; for (my $i = 0; $i < @refspec; $i++) { my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot"; if (! (-e "$blastpathtmp.pin")) { push @log, "please edit the blastpath. Could not find $blastpathtmp"; print "$blastpathtmp failed\n"; $check = 0; } else { push @log, "check for $blastpathtmp succeeded"; print "succeeded\n"; } } ## 6) Check for presence of the directory structure print "checking for presence of the hmm files:\t"; if (!(defined $hmmset)) { $hmmpath = '.'; $hmmset = 'manual'; } else { $hmmpath = "$hmmpath/$hmmset"; $fafile = "$hmmpath/$hmmset" . '.fa'; } $hmm_dir = "$hmmpath/$hmm_dir"; $hmmsearch_dir = $dbfile_short . '_' . $hmmset; #CHANGED FOR GALAXY DIRECTORY # $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name; $fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . $refspec_name; ## 7) check for the presence of the hmm-files and the fasta-file if (!(-e "$hmm_dir")) { push @log, "Could not find $hmm_dir"; print "failed\n"; $check = 0; } else { if (defined $hmm) { if (! -e "$hmm_dir/$hmm") { push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm"; $check = 0; } else { push @log, "$hmm has been found"; if ($hmm =~ /\.hmm$/) { @hmms = ($hmm); } } } else { push @log, "running HaMStR with all hmms in $hmm_dir"; @hmms = `ls $hmm_dir`; } chomp @hmms; print "succeeded\n"; } ## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster print "checking for presence of the core-ortholog file:\t"; if (defined $fafile) { if (! -e "$fafile") { push @log, "Could not find the file $fafile"; print "failed\n"; $check = 0; } else { push @log, "check for $fafile succeeded"; print "succeeded\n"; } } else { push @log, "Please provide path and name of fasta file containing the core-ortholog sequences"; $check = 0; print "failed\n"; } ## 9) Checks for the taxon_file print "testing whether the taxon has been determined:\t"; if (!(defined $taxon_file) or (!(-e "$taxon_file"))) { if (defined $taxon_global) { push @log, "using default taxon $taxon_global for all sequences"; print "succeeded\n"; $taxon_check = 2; } else { push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon"; print "failed\n"; $check = 0; } } else { push @log, "using the file $taxon_file as taxon_file"; print "succeeded\n"; $taxon_check = 1; } ## 10) Set the file where the matched seqs are found #CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option # $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out'; # $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds.out'; ## 11) apply the evalue-cut-off to the hmmsearch program $prog = $prog . " -E $eval"; push @log, "hmmsearch: $prog"; ## 12) setting up the directories where the output files will be put into. if ($check == 1) { if (!(-e "$hmmsearch_dir")) { `mkdir $hmmsearch_dir`; } if (!(-e "$fa_dir_neu")) { `mkdir $fa_dir_neu`; } if (!(-e "$tmpdir")) { `mkdir $tmpdir`; } } return ($check, @log); } ################# ## check4reciprocity is the second major part of the program. It checks ## whether the protein sequence that has been identified by the hmmsearch ## identifies in turn the protein from the reference taxon that was used to ## build the hmm. sub check4reciprocity { my ($query_name, $hitname, @refspec) = @_; my $searchdb; ## get the sequence from the db_file my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`; if (!defined $hitseq) { print "could not retrieve a sequence for $hitname. Skipping...\n"; return(0, '', '', ''); } else { ## now get the sequence used for building the hmm my @original; my $refspec_final; for (my $i = 0; $i < @refspec; $i++) { @original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`; chomp @original; if (@original > 0) { $refspec_final = $refspec[$i]; $searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot"; last; } else { print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n"; } } if (@original == 0) { print "original sequence not be found\n"; return (0, '', '', $refspec_final); } print "REFSPEC is $refspec_final\n"; ## continue with the blast chomp $hitseq; # $hitname =~ s/\|.*//; ## now run the blast open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; print OUT ">$hitname\n$hitseq"; close OUT; !`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n"; ## now parse the best blast hit my @hits = &getBestBlasthit("$tmpdir/$$.blast"); if (@hits > 0) { print "hmm-seq: ", join "\t", @original , "\n"; ## now loop through the best hits with the same evalue and check whether ## among these I find the same seq as in $original for (my $i = 0; $i <@hits; $i++) { print "blast-hit: $hits[$i]"; ## now loop through all the refspec-sequences in the hmm file for (my $j = 0; $j < @original; $j++) { if ($original[$j] eq $hits[$i]) { print "\tHit\n"; my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`; return (1, $hitname, $hitseq, $refspec_final, $refseq); } else { print "\nnot hitting $original[$j]\n"; } } } ### if we end up here, we didn't find a hit that matches to the original sequence ### in the top hits with the same eval return (0, '', '', $refspec_final); } else { print "no hit obtained\n"; return(0, '', '', $refspec_final); } } } ############# sub getBestBlasthit { my @hits; my ($file) = @_; my $searchio = Bio::SearchIO->new(-file => $file, -format => 'blast', -report_type => 'blastp') or die "parse failed"; while( my $result = $searchio->next_result ){ my $count = 0; my $sig; my $sig_old; while( my $hit = $result->next_hit){ ## now I enter all top hits having the same evalue into the result $sig = $hit->score; if (!defined $sig_old) { $sig_old = $sig; } if ($sig == $sig_old) { push @hits, $hit->accession; } else { last; } } } return(@hits); } ################## sub getTaxon { my ($hitname) = @_; # my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id"; if ($hitname =~ /\D/) { $hitname =~ s/_.*//; } my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`; chomp $taxon; $taxon =~ s/^[0-9]+,//; $taxon =~ s/\s*$//; $taxon =~ s/\s/_/g; if ($taxon) { return ($taxon); } else { return(); } } ############### sub processHits { my ($concat) = @_; ## 1) align all hit sequences for a taxon against the reference species my @taxa = keys(%$fileobj); for (my $i = 0; $i < @taxa; $i++) { &orfRanking($taxa[$i]); } } ################ sub predictORF { my $fileobj_new; # my ($refseq) = @_; my @taxa = keys(%$fileobj); for (my $i = 0; $i < @taxa; $i++) { my $protobj = $fileobj->{$taxa[$i]}->{prot}; my $idobj = $fileobj->{$taxa[$i]}->{ids}; my $refseq = $fileobj->{$taxa[$i]}->{refseq}; my @ids = @$idobj; for (my $j = 0; $j < @ids; $j++) { ## determine the reading frame my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/; print "rf is $rf\n"; $ids[$j] =~ s/_RF.*//; # my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`; ################new grep command from version 8 to fix bug my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`; if (! $est) { die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; } ## the EST is translated in rev complement if ($rf > 3) { $est = revComp($est); } my $gw = run_genewise->new($est, $refseq, "$tmpdir"); my $translation = $gw->translation; my $cds = $gw->codons; $translation =~ s/[-!]//g; $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; $fileobj_new->{$taxa[$i]}->{refseq} = $refseq; } } return($fileobj_new); } ############################ sub orfRanking { my ($spec) = @_; my $result; my $refprot; my $refcds; my @toalign; my $protobj = $fileobj->{$spec}->{prot}; my $idobj = $fileobj->{$spec}->{ids}; my $refcluster; ## variables to take the cluster and its id for later analysis my $refid; if (@$protobj == 1) { ## nothing to chose from $refprot = $protobj->[0]; $refcds = $fileobj->{$spec}->{cds}->[0]; my $length = length($refprot); $refid = $idobj->[0] . "-" . $length; } else { ## more than one cluster push @toalign, ">$refspec_final"; push @toalign, $fileobj->{$spec}->{refseq}; ## now walk through all the contigs for (my $i = 0; $i < @$protobj; $i++) { my @testseq = (">$idobj->[$i]", $protobj->[$i]); @testseq = (@testseq, @toalign); open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n"; print OUT join "\n", @testseq; close OUT; ## run clustalw !(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n"; ## get the alignment score $result->[$i]->{score} = `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`; if (!$result->[$i]->{score}) { die "error in determining alignment score\n"; } chomp $result->[$i]->{score}; ## get the aligned sequence open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n"; my @aln = <ALN>; close ALN; my $aseq = extractSeq($idobj->[$i], @aln); ## remove the terminal gaps $aseq =~ s/-*$//; $result->[$i]->{aend} = length $aseq; my ($head) = $aseq =~ /^(-*).*/; ($result->[$i]->{astart}) = length($head)+1; } ### the results for all seqs has been gathered, now order them $result = sortRef($result); ($refprot, $refcds, $refid) = &determineRef($result,$spec); } $fileobj->{$spec}->{refprot} = $refprot; $fileobj->{$spec}->{refcds} = $refcds; $fileobj->{$spec}->{refid} = $refid; return(); } ########################### sub sortRef { my $result = shift; my @sort; for (my $i = 0; $i < @$result; $i++) { push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}"; } open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n"; print OUT join "\n", @sort; close OUT; `sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`; @sort = `less $tmpdir/$pid.sort`; chomp @sort; $result = undef; for (my $i = 0; $i < @sort; $i++) { ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i]; } return($result); } ######################## sub determineRef { my ($result, $spec) = @_; my $lastend = 0; my $lastscore = 0; my $final; my $count = 0; my $id = ''; for (my $i = 0; $i < @$result; $i++) { if ($result->[$i]->{start} < $lastend or $lastend == 0) { if ($result->[$i]->{score} > $lastscore) { $lastend = $result->[$i]->{end}; $lastscore = $result->[$i]->{score}; $id = $result->[$i]->{id}; } } elsif ($result->[$i]->{start} > $lastend) { ## a new part of the alignment is covered. Fix the results obtained so far $final->[$count]->{id} = $id; $lastend = $result->[$i]->{end}; $id = $result->[$i]->{id}; $count++; } } $final->[$count]->{id} = $id; ## now concatenate the results my $refprot = ''; my $refid = ''; my $refcds = ''; for (my $i = 0; $i < @$final; $i++) { my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; my $length = length($seq); $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP"; $refprot .= $seq; if ($estflag) { $refcds .= $cdsseq; } } $refid =~ s/PP$//; return($refprot, $refcds, $refid); } ############################# sub extractSeq { my ($id, @aln) = @_; my $seq = ''; my $start = 0; for (my $i = 0; $i < @aln; $i++) { if ($aln[$i] =~ $id) { $start = 1; } elsif ($aln[$i] =~ />/ and $start == 1) { last; } elsif ($start == 1) { $seq .= $aln[$i]; } } $seq =~ s/\s//g; return ($seq); } ############################## sub revComp { my ($seq) = @_; $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/; $seq = reverse($seq); return($seq); } ############################## sub parseHmmer3 { my ($file, $path) = @_; if (!defined $path) { $path = '.'; } open (IN, "$path/$file") or die "failed to open $file\n"; my @data = <IN>; close IN; ### extract the hits my @hit; my $start = 0; my $stop = 0; my $i = 0; for (my $i = 0; $i < @data; $i++) { if (!($data[$i] =~ /\S/)) { next; } else { if ($data[$i] =~ /Scores for complete sequence/) { $start = 1; $i += 4; } elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) { last; } if ($start == 1 and $stop == 0) { $data[$i] =~ s/^\s+//; my @list = split /\s+/, $data[$i]; push @hit, $list[8]; $start = 0; #Added by THO } } } ### get the query_id my ($query) = grep /^Query:/, @data; $query =~ s/^Query:\s+//; $query =~ s/\s.*//; if (defined $hit[0]) { chomp @hit; return ($query, @hit); } else { return ($query); } } ##################### sub parseSeqfile { my $seqref; my $id; my $spec; my $seqid; my $seq; my $file = shift; open (IN, "$file") or die "failed to open $file\n"; my @seqs = <IN>; close IN; chomp @seqs; for (my $i = 0; $i < @seqs; $i++) { if ($seqs[$i] =~ />/) { $seqs[$i] =~ s/>//; if (defined $id and defined $seq) { $seqref->{$id}->{$spec}->{seqid} = $seqid; $seqref->{$id}->{$spec}->{seq} = $seq; $seq = undef; } ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); } else { $seq .= $seqs[$i]; } } if (defined $id and defined $seq) { $seqref->{$id}->{$spec}->{seqid} = $seqid; $seqref->{$id}->{$spec}->{seq} = $seq; $seq = undef; } return ($seqref); }