comparison orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip

First commit of old repositories
author osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu>
date Tue, 11 Mar 2014 12:19:13 -0700
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:5b9a38ec4a39
1 #!/usr/bin/perl -w
2 use strict;
3
4 use FindBin;
5 use lib "$FindBin::Bin/lib";
6 use Getopt::Long;
7 use Bio::SearchIO;
8 use Bio::Search::Hit::BlastHit;
9 use run_genewise;
10
11 # PROGRAMNAME: hamstrsearch_local.pl
12
13 # Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at
14 # This program is free software; you can redistribute it and/or modify it
15 # under the terms of the GNU General Public License as published
16 # by the Free Software Foundation; either version 3 of the License
17 # or any later version.
18
19 # This program is distributed in the hope that it will be useful
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22 # General Public License for more details.
23 # You should have received a copy of the GNU General Public License
24 # along with this program; If not, see http://www.gnu.org/licenses
25
26 # PROGRAM DESCRIPTION: This is the relevant version!
27
28 # DATE: Wed Dec 19 10:41:09 CEST 2007
29
30
31 ######################## start main #############################
32 my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. Oakley and Roger Ngo, UCSB\n";
33 print "$version\n";
34 ### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT
35 ## note: all the variables can also be set via the command line
36 my $pid = $$;
37 my $prog = 'hmmsearch'; #program for the hmm search
38 my $alignmentprog = 'clustalw';
39
40
41 #PATH SETTINGS
42 my $hmmpath = '.';
43 my $blastpath = '.'; #Uses Galaxy working directory
44 my $tmpdir = 'tmp_' . $pid;
45 my $eval = 1; # eval cutoff for the hmm search
46 my $logfile = "hamstrsearch_" . $pid . '.log';
47 my $hmm_dir = 'hmm_dir';
48 my $fa_dir = '';
49 ##############################
50 my $help;
51 my $seq2store_file='';
52 my $cds2store_file='';
53 my $hmm;
54 my @hmms;
55 my $fa;
56 my $fafile;
57 my @seqs2store;
58 my @cds2store;
59 my $ep2eg;
60 my $estfile;
61 my $aln;
62 my $idfile;
63 my $taxon_check = 0;
64 my $hmmset;
65 my $hmmsearch_dir;
66 my $dbfile = ''; # the file hmmsearch is run against
67 my $dbfile_short;
68 my $taxon_file;
69 my $refspec_string;
70 my @refspec = qw();
71 my @primer_taxa;
72 my $refspec_name = '';
73 my $taxon_global;
74 my $fileobj;
75 my $fa_dir_neu = '';
76 my $gwrefprot;
77 my $seqtype;
78 my $align;
79 my $rep;
80 my $estflag;
81 my $proteinflag;
82 my $refseq;
83 my $refspec_final = '';
84 my $concat;
85 my $seqs2store_file;
86
87 #####determine the hostname#######
88 my $hostname = `hostname`;
89 chomp $hostname;
90 print "hostname is $hostname\n";
91
92 #################################
93 if (@ARGV==0) {
94 $help = 1;
95 }
96 ## help message
97 my $helpmessage = "
98 This program is freely distributed under a GPL. See -version for more info
99 Copyright (c) GRL limited: portions of the code are from separate copyrights
100
101 \nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h]
102
103 OPTIONS:
104
105 -sequence_file:
106 path and name of the file containing the sequences hmmer is run against
107 Per default, this file should be in the data directory.
108 -est:
109 set this flag if you are searching in ESTs
110 -protein:
111 set this flag if you are searching in protein sequences
112 -hmmset:
113 specifies the name of the core-ortholog set.
114 The program will look for the files in the default directory 'core-orthologs' unless you specify
115 a different one.
116 -taxon:
117 You need to specify a default taxon name from which your ESTs or protein sequences are derived.
118 -refspec:
119 sets the reference species. Note, it has to be a species that contributed sequences
120 to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference
121 taxa you can have a look at the speclist.txt file in the default core-ortholog sets
122 that come with this distribution. Please use the 5 letter abreviations. If you choose
123 to use core-orthologs were not every taxon is represented in all core-orthologs, you
124 can provide a comma-separated list with the preferred refspec first. The lower-ranking
125 reference species will only be used if a certain gene is not present in the preferred
126 refspecies due to alternative paths in the transitive closure to define
127 the core-orthologs.
128 CURRENTLY NO CHECK IS IMPLEMENTED!
129 NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED!
130 -eval_limit=<>
131 This options allows to set the e-value cut-off for the HMM search.
132 DEFAULT: 1
133 -hmm:
134 option to provide only a single hmm to be used for the search.
135 Note, this file has to end with .hmm
136
137 ### the following options should only be used when you chose to alter the default structure of the
138 ### hamstrsearch_local directories. Currently, this has not been extensively tested.
139 -fasta_file:
140 path and name of the file containing the core-ortholog sequences
141 you don't have to use this option when you
142 -hmmpath:
143 sets the path to the hmm_dir. By default this is set to the current directory.
144 -blastpath:
145 sets the path where the blast-dbs are located. Per default this is ../blast_dir
146 Note, the program expects the structure blastpath/refspec/refspec_prot.
147 To overrule this structure you will have to edit the script.
148 \n\n";
149 GetOptions ("h" => \$help,
150 "hmm=s" => \$hmm,
151 "est" => \$estflag,
152 "protein"=> \$proteinflag,
153 "sequence_file=s" => \$dbfile,
154 "fasta_file=s" => \$fafile,
155 "hmmset=s" => \$hmmset,
156 "hmmpath=s" => \$hmmpath,
157 "taxon_file=s" => \$taxon_file,
158 "taxon=s" => \$taxon_global,
159 "eval_limit=s" => \$eval,
160 "refspec=s" => \$refspec_string,
161 "estfile=s" => \$estfile,
162 "representative" => \$rep,
163 "blastpath=s" => \$blastpath,
164 "galaxyout=s" => \$seqs2store_file,
165 "2galaxyout=s" => \$cds2store_file);
166
167 if ($help) {
168 print $helpmessage;
169 exit;
170 }
171
172 ## 1) check if all information is available to run HaMStR
173 my ($check, @log) = &checkInput();
174 if ($check == 0) {
175 print join "\n", @log;
176 print "$helpmessage";
177 exit;
178 }
179 else {
180 open (OUT, ">$logfile") or die "could not open logfile $logfile\n";
181 print OUT join "\n", @log;
182 close OUT;
183 }
184 ### read in of the core-ortholog sequences
185 my $co_seqs = parseSeqfile("$fafile");
186
187 ## 2) loop through the hmms
188 ## process each hmm file separately
189 for (my $i = 0; $i < @hmms; $i++) {
190 $fileobj = undef;
191 my @seqs = qw();
192 my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster
193 my @newcds = qw();
194 my $hmm = $hmms[$i];
195 my $hmmout = $hmm;
196 $hmmout =~ s/\.hmm/\.out/;
197 ## 3) run the hmm search
198 if (!(-e "$hmmsearch_dir/$hmmout")) {
199 print "now running $prog using $hmm\n";
200 !`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n";
201 }
202 else {
203 print "an hmmresult $hmmout already exists. Using this one!\n";
204 print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n";
205 }
206
207 ## 4) process the hmm search result
208 my $hitcount = 0;
209 ## 4a) loop through the individual results
210 ## now the modified version for hmmer3 comes
211 my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir);
212 if (! @results) {
213 print "no hit found for $query_name\n";
214 next;
215 }
216 chomp $query_name;
217 print "Results for $query_name\n";
218 for (my $k = 0; $k < @results; $k++) {
219 my $hitname = $results[$k];
220 print "$hitname\n";
221 my $keep = 0;
222 my $hitseq = '';
223 $refseq = '';
224 ## 4b) test for the reciprocity criterion fulfilled
225 ($keep, $hitname, $hitseq, $refspec_final, $refseq) = &check4reciprocity($query_name, $hitname, @refspec);
226 if ($keep == 1) {
227 ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species
228 ## check for the taxon from the taxon_file.txt.
229 my $taxon = '';
230 if ($taxon_check){
231 if ($taxon_check == 1) {
232 $taxon = &getTaxon($hitname);
233 }
234 elsif ($taxon_check == 2) {
235 $taxon = $taxon_global;
236 }
237 }
238 ## put the info about the hits into an object for later post-processing
239 $fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq;
240 $fileobj->{$taxon}->{ids}->[$hitcount] = $hitname;
241 $fileobj->{$taxon}->{refseq} = $refseq;
242 $hitcount++;
243 }
244 else {
245 print "match to different protein from $refspec_final\n";
246 }
247 }
248 ## 5) do the rest only if at least one hit was obtained
249 if (defined $fileobj) {
250 ## 5a) if the hits are derived from ESTs, get the best ORF
251 if ($estflag) {
252 $fileobj = &predictORF();
253 }
254 ## 5b) if the user has chosen to postprocess the results
255 if ($rep) {
256 &processHits($refseq, $concat);
257 }
258 ## 6) prepare the output
259 my @taxa = keys(%$fileobj);
260 for (my $i = 0; $i< @taxa; $i++) {
261 if ($rep) {
262 # push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}";
263 #Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs
264 push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}";
265 push @newseqs, $fileobj->{$taxa[$i]}->{refprot};
266 if ($estflag) {
267 # push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}";
268 #Rearrange Order for Galaxy - want species first for phylotable format
269 push @newcds, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}";
270 push @newcds, $fileobj->{$taxa[$i]}->{refcds};
271 }
272 }
273 else {
274 my $idobj = $fileobj->{$taxa[$i]}->{ids};
275 my $protobj = $fileobj->{$taxa[$i]}->{prot};
276 my $cdsobj = $fileobj->{$taxa[$i]}->{cds};
277 for (my $j = 0; $j < @$idobj; $j++) {
278 # push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]";
279 #Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe
280 push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]";
281 push @newseqs, $protobj->[$j];
282 if ($estflag) {
283 # push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]";
284 #Rearrange Order for Galaxy - want species first for phylotable format
285 push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]";
286 push @newcds, $cdsobj->[$j];
287 }
288 }
289 }
290 my $refs = $co_seqs->{$query_name};
291 for (keys %$refs) {
292 my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq};
293 push @seqs, $line;
294 }
295 chomp @seqs;
296 print "\n";
297 @seqs = (@seqs, @newseqs);
298 open (OUT, ">$fa_dir_neu/$query_name.fa");
299 print OUT join "\n", @seqs;
300 print OUT "\n";
301 close OUT;
302 for (my $i = 0; $i < @newseqs; $i+= 2) {
303 # my $line = $newseqs[$i] . "|" . $newseqs[$i+1];
304 #Galaxy uses tabs not pipes
305 my $line = $newseqs[$i] . "\t" . $newseqs[$i+1];
306 $line =~ s/>//;
307 push @seqs2store, $line;
308 if ($estflag) {
309 #Galaxy uses tabs not pipes
310 # my $cdsline = $newcds[$i] . "|" . $newcds[$i+1];
311 my $cdsline = $newcds[$i] . "\t" . $newcds[$i+1];
312 $cdsline =~ s/>//;
313 push @cds2store, $cdsline;
314 }
315 }
316 }
317 }
318 }
319 if (@seqs2store > 0) {
320 open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n";
321 print OUT join "\n", @seqs2store;
322 print OUT "\n";
323 close OUT;
324 if ($estflag) {
325 open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n";
326 print OUT join "\n", @cds2store;
327 print OUT "\n";
328 close OUT;
329 }
330 }
331 else {
332 open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n";
333 print OUT "no hits found\n";
334 }
335 exit;
336 ##################### start sub ###############
337 ####### checkInput performs a number of checks whether sufficient information
338 ### and all data are available to run HaMStR
339 sub checkInput {
340 my @log;
341 my $check = 1;
342 $dbfile_short = $dbfile;
343 $dbfile_short =~ s/\..*//;
344 ## 1) check for filetype
345 print "Checking for filetype:\t";
346 if (!defined $estflag and !defined $proteinflag) {
347 push @log, "please determine the sequence type. Choose between the options -EST or -protein";
348 print "failed\n";
349 $check = 0;
350 }
351 else {
352 if ($estflag) {
353 $estfile = $dbfile;
354 $dbfile = "$dbfile.tc";
355 push @log, "HaMStR will run on the ESTs in $estfile";
356 push @log, "Translating ESTs";
357 if (!(-e "$dbfile")) {
358 print "translating $estfile, this may take a while\n";
359 `translate.pl -in=$estfile -out=$dbfile`;
360 open (LOG, "$logfile") or die "could not open logfile $logfile\n";
361 my @info = <LOG>;
362 @log = (@log, @info);
363 close LOG;
364 }
365 else {
366 push @log, "Translated file already exists, using this one\n";
367 }
368 if (! -e "$dbfile") {
369 push @log, "The translation of $estfile failed. Check the script translate.pl";
370 print "failed\n";
371 $check = 0;
372 }
373 }
374 else {
375 ## file type is protein
376 print "succeeded\n";
377 }
378 }
379 ## 2) Check for presence of blastall
380 print "Checking for the blast program\t";
381 if (!(`blastall`)) {
382 push @log, "could not execute blastall. Please check if this program is installed and executable";
383 print "failed\n";
384 $check = 0;
385 }
386 else {
387 push @log, "check for blastall succeeded";
388 print "succeeded\n";
389 }
390 ## 3) Check for presence of hmmsearch
391 print "Checking for hmmsearch\t";
392 if (! `$prog -h`) {
393 push @log, "could not execute $prog. Please check if this program is installed and executable";
394 print "failed\n";
395 $check = 0;
396 }
397 else {
398 push @log, "check for $prog succeeded\n";
399 print "succeeded\n";
400 }
401 ## 4) Check for reference taxon
402 print "Checking for reference species and blast-dbs\t";
403 if (!(defined $refspec_string)) {
404 push @log, "Please provide a reference species for the reblast!";
405 print "failed\n";
406 $check = 0;
407 }
408 else {
409 push @log, "Reference species for the re-blast: $refspec_string";
410 @refspec = split /,/, $refspec_string;
411 $refspec_name = $refspec[0];
412 print "succeeded\n";
413 }
414 ## 5) Check for presence of the required blast dbs
415 print "checking for blast-dbs:\t";
416 for (my $i = 0; $i < @refspec; $i++) {
417 my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot";
418 if (! (-e "$blastpathtmp.pin")) {
419 push @log, "please edit the blastpath. Could not find $blastpathtmp";
420 print "$blastpathtmp failed\n";
421 $check = 0;
422 }
423 else {
424 push @log, "check for $blastpathtmp succeeded";
425 print "succeeded\n";
426 }
427 }
428 ## 6) Check for presence of the directory structure
429 print "checking for presence of the hmm files:\t";
430 if (!(defined $hmmset)) {
431 $hmmpath = '.';
432 $hmmset = 'manual';
433 }
434 else {
435 $hmmpath = "$hmmpath/$hmmset";
436 $fafile = "$hmmpath/$hmmset" . '.fa';
437 }
438 $hmm_dir = "$hmmpath/$hmm_dir";
439 $hmmsearch_dir = $dbfile_short . '_' . $hmmset;
440
441 #CHANGED FOR GALAXY DIRECTORY
442 # $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name;
443 $fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . $refspec_name;
444 ## 7) check for the presence of the hmm-files and the fasta-file
445 if (!(-e "$hmm_dir")) {
446 push @log, "Could not find $hmm_dir";
447 print "failed\n";
448 $check = 0;
449 }
450 else {
451 if (defined $hmm) {
452 if (! -e "$hmm_dir/$hmm") {
453 push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm";
454 $check = 0;
455 }
456 else {
457 push @log, "$hmm has been found";
458 if ($hmm =~ /\.hmm$/) {
459 @hmms = ($hmm);
460 }
461 }
462 }
463 else {
464 push @log, "running HaMStR with all hmms in $hmm_dir";
465 @hmms = `ls $hmm_dir`;
466 }
467 chomp @hmms;
468 print "succeeded\n";
469 }
470
471 ## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster
472 print "checking for presence of the core-ortholog file:\t";
473 if (defined $fafile) {
474 if (! -e "$fafile") {
475 push @log, "Could not find the file $fafile";
476 print "failed\n";
477 $check = 0;
478 }
479 else {
480 push @log, "check for $fafile succeeded";
481 print "succeeded\n";
482 }
483 }
484 else {
485 push @log, "Please provide path and name of fasta file containing the core-ortholog sequences";
486 $check = 0;
487 print "failed\n";
488 }
489 ## 9) Checks for the taxon_file
490 print "testing whether the taxon has been determined:\t";
491 if (!(defined $taxon_file) or (!(-e "$taxon_file"))) {
492 if (defined $taxon_global) {
493 push @log, "using default taxon $taxon_global for all sequences";
494 print "succeeded\n";
495 $taxon_check = 2;
496 }
497 else {
498 push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon";
499 print "failed\n";
500 $check = 0;
501 }
502 }
503 else {
504 push @log, "using the file $taxon_file as taxon_file";
505 print "succeeded\n";
506 $taxon_check = 1;
507 }
508 ## 10) Set the file where the matched seqs are found
509 #CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option
510 # $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out';
511 # $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds.out';
512
513 ## 11) apply the evalue-cut-off to the hmmsearch program
514 $prog = $prog . " -E $eval";
515 push @log, "hmmsearch: $prog";
516
517 ## 12) setting up the directories where the output files will be put into.
518 if ($check == 1) {
519 if (!(-e "$hmmsearch_dir")) {
520 `mkdir $hmmsearch_dir`;
521 }
522 if (!(-e "$fa_dir_neu")) {
523 `mkdir $fa_dir_neu`;
524 }
525 if (!(-e "$tmpdir")) {
526 `mkdir $tmpdir`;
527 }
528 }
529 return ($check, @log);
530 }
531 #################
532 ## check4reciprocity is the second major part of the program. It checks
533 ## whether the protein sequence that has been identified by the hmmsearch
534 ## identifies in turn the protein from the reference taxon that was used to
535 ## build the hmm.
536 sub check4reciprocity {
537 my ($query_name, $hitname, @refspec) = @_;
538 my $searchdb;
539 ## get the sequence from the db_file
540 my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`;
541 if (!defined $hitseq) {
542 print "could not retrieve a sequence for $hitname. Skipping...\n";
543 return(0, '', '', '');
544 }
545 else {
546 ## now get the sequence used for building the hmm
547 my @original;
548 my $refspec_final;
549 for (my $i = 0; $i < @refspec; $i++) {
550 @original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`;
551 chomp @original;
552
553 if (@original > 0) {
554 $refspec_final = $refspec[$i];
555 $searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot";
556 last;
557 }
558 else {
559 print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n";
560 }
561 }
562 if (@original == 0) {
563 print "original sequence not be found\n";
564 return (0, '', '', $refspec_final);
565 }
566 print "REFSPEC is $refspec_final\n";
567 ## continue with the blast
568 chomp $hitseq;
569 # $hitname =~ s/\|.*//;
570 ## now run the blast
571 open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n";
572 print OUT ">$hitname\n$hitseq";
573 close OUT;
574 !`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n";
575 ## now parse the best blast hit
576 my @hits = &getBestBlasthit("$tmpdir/$$.blast");
577 if (@hits > 0) {
578
579 print "hmm-seq: ", join "\t", @original , "\n";
580 ## now loop through the best hits with the same evalue and check whether
581 ## among these I find the same seq as in $original
582 for (my $i = 0; $i <@hits; $i++) {
583 print "blast-hit: $hits[$i]";
584 ## now loop through all the refspec-sequences in the hmm file
585 for (my $j = 0; $j < @original; $j++) {
586 if ($original[$j] eq $hits[$i]) {
587 print "\tHit\n";
588 my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`;
589 return (1, $hitname, $hitseq, $refspec_final, $refseq);
590 }
591 else {
592 print "\nnot hitting $original[$j]\n";
593 }
594 }
595 }
596 ### if we end up here, we didn't find a hit that matches to the original sequence
597 ### in the top hits with the same eval
598 return (0, '', '', $refspec_final);
599 }
600 else {
601 print "no hit obtained\n";
602 return(0, '', '', $refspec_final);
603 }
604 }
605 }
606 #############
607 sub getBestBlasthit {
608 my @hits;
609 my ($file) = @_;
610 my $searchio = Bio::SearchIO->new(-file => $file,
611 -format => 'blast',
612 -report_type => 'blastp') or die "parse failed";
613 while( my $result = $searchio->next_result ){
614 my $count = 0;
615 my $sig;
616 my $sig_old;
617 while( my $hit = $result->next_hit){
618 ## now I enter all top hits having the same evalue into the result
619 $sig = $hit->score;
620 if (!defined $sig_old) {
621 $sig_old = $sig;
622 }
623 if ($sig == $sig_old) {
624 push @hits, $hit->accession;
625 }
626 else {
627 last;
628 }
629 }
630 }
631 return(@hits);
632 }
633 ##################
634 sub getTaxon {
635 my ($hitname) = @_;
636 # my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id";
637 if ($hitname =~ /\D/) {
638 $hitname =~ s/_.*//;
639 }
640 my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`;
641 chomp $taxon;
642 $taxon =~ s/^[0-9]+,//;
643 $taxon =~ s/\s*$//;
644 $taxon =~ s/\s/_/g;
645 if ($taxon) {
646 return ($taxon);
647 }
648 else {
649 return();
650 }
651 }
652 ###############
653 sub processHits {
654 my ($concat) = @_;
655 ## 1) align all hit sequences for a taxon against the reference species
656 my @taxa = keys(%$fileobj);
657 for (my $i = 0; $i < @taxa; $i++) {
658 &orfRanking($taxa[$i]);
659 }
660 }
661
662 ################
663 sub predictORF {
664 my $fileobj_new;
665 # my ($refseq) = @_;
666 my @taxa = keys(%$fileobj);
667 for (my $i = 0; $i < @taxa; $i++) {
668 my $protobj = $fileobj->{$taxa[$i]}->{prot};
669 my $idobj = $fileobj->{$taxa[$i]}->{ids};
670 my $refseq = $fileobj->{$taxa[$i]}->{refseq};
671 my @ids = @$idobj;
672 for (my $j = 0; $j < @ids; $j++) {
673 ## determine the reading frame
674 my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/;
675 print "rf is $rf\n";
676 $ids[$j] =~ s/_RF.*//;
677 # my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`;
678 ################new grep command from version 8 to fix bug
679 my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`;
680 if (! $est) {
681 die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n";
682 }
683 ## the EST is translated in rev complement
684 if ($rf > 3) {
685 $est = revComp($est);
686 }
687
688 my $gw = run_genewise->new($est, $refseq, "$tmpdir");
689 my $translation = $gw->translation;
690 my $cds = $gw->codons;
691 $translation =~ s/[-!]//g;
692 $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j];
693 $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation;
694 $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds;
695 $fileobj_new->{$taxa[$i]}->{refseq} = $refseq;
696 }
697 }
698 return($fileobj_new);
699 }
700 ############################
701 sub orfRanking {
702 my ($spec) = @_;
703 my $result;
704 my $refprot;
705 my $refcds;
706 my @toalign;
707 my $protobj = $fileobj->{$spec}->{prot};
708 my $idobj = $fileobj->{$spec}->{ids};
709 my $refcluster; ## variables to take the cluster and its id for later analysis
710 my $refid;
711 if (@$protobj == 1) {
712 ## nothing to chose from
713 $refprot = $protobj->[0];
714 $refcds = $fileobj->{$spec}->{cds}->[0];
715 my $length = length($refprot);
716 $refid = $idobj->[0] . "-" . $length;
717 }
718 else {
719 ## more than one cluster
720 push @toalign, ">$refspec_final";
721 push @toalign, $fileobj->{$spec}->{refseq};
722 ## now walk through all the contigs
723 for (my $i = 0; $i < @$protobj; $i++) {
724 my @testseq = (">$idobj->[$i]", $protobj->[$i]);
725 @testseq = (@testseq, @toalign);
726 open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n";
727 print OUT join "\n", @testseq;
728 close OUT;
729 ## run clustalw
730 !(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n";
731 ## get the alignment score
732 $result->[$i]->{score} = `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`;
733 if (!$result->[$i]->{score}) {
734 die "error in determining alignment score\n";
735 }
736 chomp $result->[$i]->{score};
737 ## get the aligned sequence
738 open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n";
739 my @aln = <ALN>;
740 close ALN;
741 my $aseq = extractSeq($idobj->[$i], @aln);
742 ## remove the terminal gaps
743 $aseq =~ s/-*$//;
744 $result->[$i]->{aend} = length $aseq;
745 my ($head) = $aseq =~ /^(-*).*/;
746 ($result->[$i]->{astart}) = length($head)+1;
747 }
748 ### the results for all seqs has been gathered, now order them
749 $result = sortRef($result);
750 ($refprot, $refcds, $refid) = &determineRef($result,$spec);
751 }
752 $fileobj->{$spec}->{refprot} = $refprot;
753 $fileobj->{$spec}->{refcds} = $refcds;
754 $fileobj->{$spec}->{refid} = $refid;
755 return();
756 }
757 ###########################
758 sub sortRef {
759 my $result = shift;
760 my @sort;
761 for (my $i = 0; $i < @$result; $i++) {
762 push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}";
763 }
764 open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n";
765 print OUT join "\n", @sort;
766 close OUT;
767 `sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`;
768 @sort = `less $tmpdir/$pid.sort`;
769 chomp @sort;
770 $result = undef;
771 for (my $i = 0; $i < @sort; $i++) {
772 ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i];
773 }
774 return($result);
775 }
776 ########################
777 sub determineRef {
778 my ($result, $spec) = @_;
779 my $lastend = 0;
780 my $lastscore = 0;
781 my $final;
782 my $count = 0;
783 my $id = '';
784 for (my $i = 0; $i < @$result; $i++) {
785 if ($result->[$i]->{start} < $lastend or $lastend == 0) {
786 if ($result->[$i]->{score} > $lastscore) {
787 $lastend = $result->[$i]->{end};
788 $lastscore = $result->[$i]->{score};
789 $id = $result->[$i]->{id};
790 }
791 }
792 elsif ($result->[$i]->{start} > $lastend) {
793 ## a new part of the alignment is covered. Fix the results obtained so far
794 $final->[$count]->{id} = $id;
795 $lastend = $result->[$i]->{end};
796 $id = $result->[$i]->{id};
797 $count++;
798 }
799 }
800 $final->[$count]->{id} = $id;
801 ## now concatenate the results
802 my $refprot = '';
803 my $refid = '';
804 my $refcds = '';
805 for (my $i = 0; $i < @$final; $i++) {
806 my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}];
807 my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}];
808 my $length = length($seq);
809 $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP";
810 $refprot .= $seq;
811 if ($estflag) {
812 $refcds .= $cdsseq;
813 }
814 }
815 $refid =~ s/PP$//;
816 return($refprot, $refcds, $refid);
817 }
818 #############################
819 sub extractSeq {
820 my ($id, @aln) = @_;
821 my $seq = '';
822 my $start = 0;
823 for (my $i = 0; $i < @aln; $i++) {
824 if ($aln[$i] =~ $id) {
825 $start = 1;
826 }
827 elsif ($aln[$i] =~ />/ and $start == 1) {
828 last;
829 }
830 elsif ($start == 1) {
831 $seq .= $aln[$i];
832 }
833 }
834 $seq =~ s/\s//g;
835 return ($seq);
836 }
837 ##############################
838 sub revComp {
839 my ($seq) = @_;
840 $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/;
841 $seq = reverse($seq);
842 return($seq);
843 }
844 ##############################
845 sub parseHmmer3 {
846 my ($file, $path) = @_;
847 if (!defined $path) {
848 $path = '.';
849 }
850 open (IN, "$path/$file") or die "failed to open $file\n";
851 my @data = <IN>;
852 close IN;
853 ### extract the hits
854 my @hit;
855 my $start = 0;
856 my $stop = 0;
857 my $i = 0;
858 for (my $i = 0; $i < @data; $i++) {
859 if (!($data[$i] =~ /\S/)) {
860 next;
861 }
862 else {
863 if ($data[$i] =~ /Scores for complete sequence/) {
864 $start = 1;
865 $i += 4;
866 }
867 elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) {
868 last;
869 }
870 if ($start == 1 and $stop == 0) {
871 $data[$i] =~ s/^\s+//;
872 my @list = split /\s+/, $data[$i];
873 push @hit, $list[8];
874 $start = 0; #Added by THO
875 }
876 }
877 }
878 ### get the query_id
879 my ($query) = grep /^Query:/, @data;
880 $query =~ s/^Query:\s+//;
881 $query =~ s/\s.*//;
882 if (defined $hit[0]) {
883 chomp @hit;
884 return ($query, @hit);
885 }
886 else {
887 return ($query);
888 }
889 }
890 #####################
891 sub parseSeqfile {
892 my $seqref;
893 my $id;
894 my $spec;
895 my $seqid;
896 my $seq;
897 my $file = shift;
898 open (IN, "$file") or die "failed to open $file\n";
899 my @seqs = <IN>;
900 close IN;
901 chomp @seqs;
902 for (my $i = 0; $i < @seqs; $i++) {
903 if ($seqs[$i] =~ />/) {
904 $seqs[$i] =~ s/>//;
905 if (defined $id and defined $seq) {
906 $seqref->{$id}->{$spec}->{seqid} = $seqid;
907 $seqref->{$id}->{$spec}->{seq} = $seq;
908 $seq = undef;
909 }
910 ($id, $spec, $seqid) = split (/\|/, $seqs[$i]);
911 }
912 else {
913 $seq .= $seqs[$i];
914 }
915 }
916 if (defined $id and defined $seq) {
917 $seqref->{$id}->{$spec}->{seqid} = $seqid;
918 $seqref->{$id}->{$spec}->{seq} = $seq;
919 $seq = undef;
920 }
921 return ($seqref);
922 }