Mercurial > repos > ucsb-phylogenetics > osiris_phylogenetics
comparison orthologs/ucsb_hamster/hamstrsearch_local-hmmer3.pl @ 0:5b9a38ec4a39 draft default tip
First commit of old repositories
author | osiris_phylogenetics <ucsb_phylogenetics@lifesci.ucsb.edu> |
---|---|
date | Tue, 11 Mar 2014 12:19:13 -0700 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5b9a38ec4a39 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 | |
4 use FindBin; | |
5 use lib "$FindBin::Bin/lib"; | |
6 use Getopt::Long; | |
7 use Bio::SearchIO; | |
8 use Bio::Search::Hit::BlastHit; | |
9 use run_genewise; | |
10 | |
11 # PROGRAMNAME: hamstrsearch_local.pl | |
12 | |
13 # Copyright (C) 2009 INGO EBERSBERGER, ingo.ebersberger@univie.ac.at | |
14 # This program is free software; you can redistribute it and/or modify it | |
15 # under the terms of the GNU General Public License as published | |
16 # by the Free Software Foundation; either version 3 of the License | |
17 # or any later version. | |
18 | |
19 # This program is distributed in the hope that it will be useful | |
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
22 # General Public License for more details. | |
23 # You should have received a copy of the GNU General Public License | |
24 # along with this program; If not, see http://www.gnu.org/licenses | |
25 | |
26 # PROGRAM DESCRIPTION: This is the relevant version! | |
27 | |
28 # DATE: Wed Dec 19 10:41:09 CEST 2007 | |
29 | |
30 | |
31 ######################## start main ############################# | |
32 my $version = "\nhamstrsearch_local-v1.0.pl\nGalaxy implementation by Todd H. Oakley and Roger Ngo, UCSB\n"; | |
33 print "$version\n"; | |
34 ### EDIT THE FOLLOWING LINES TO CUSTOMIZE YOUR SCRIPT | |
35 ## note: all the variables can also be set via the command line | |
36 my $pid = $$; | |
37 my $prog = 'hmmsearch'; #program for the hmm search | |
38 my $alignmentprog = 'clustalw'; | |
39 | |
40 | |
41 #PATH SETTINGS | |
42 my $hmmpath = '.'; | |
43 my $blastpath = '.'; #Uses Galaxy working directory | |
44 my $tmpdir = 'tmp_' . $pid; | |
45 my $eval = 1; # eval cutoff for the hmm search | |
46 my $logfile = "hamstrsearch_" . $pid . '.log'; | |
47 my $hmm_dir = 'hmm_dir'; | |
48 my $fa_dir = ''; | |
49 ############################## | |
50 my $help; | |
51 my $seq2store_file=''; | |
52 my $cds2store_file=''; | |
53 my $hmm; | |
54 my @hmms; | |
55 my $fa; | |
56 my $fafile; | |
57 my @seqs2store; | |
58 my @cds2store; | |
59 my $ep2eg; | |
60 my $estfile; | |
61 my $aln; | |
62 my $idfile; | |
63 my $taxon_check = 0; | |
64 my $hmmset; | |
65 my $hmmsearch_dir; | |
66 my $dbfile = ''; # the file hmmsearch is run against | |
67 my $dbfile_short; | |
68 my $taxon_file; | |
69 my $refspec_string; | |
70 my @refspec = qw(); | |
71 my @primer_taxa; | |
72 my $refspec_name = ''; | |
73 my $taxon_global; | |
74 my $fileobj; | |
75 my $fa_dir_neu = ''; | |
76 my $gwrefprot; | |
77 my $seqtype; | |
78 my $align; | |
79 my $rep; | |
80 my $estflag; | |
81 my $proteinflag; | |
82 my $refseq; | |
83 my $refspec_final = ''; | |
84 my $concat; | |
85 my $seqs2store_file; | |
86 | |
87 #####determine the hostname####### | |
88 my $hostname = `hostname`; | |
89 chomp $hostname; | |
90 print "hostname is $hostname\n"; | |
91 | |
92 ################################# | |
93 if (@ARGV==0) { | |
94 $help = 1; | |
95 } | |
96 ## help message | |
97 my $helpmessage = " | |
98 This program is freely distributed under a GPL. See -version for more info | |
99 Copyright (c) GRL limited: portions of the code are from separate copyrights | |
100 | |
101 \nUSAGE: hamstrsearch_local.pl -sequence_file=<> -hmmset=<> -taxon=<> -refspec=<> [-est|-protein] [-hmm=<>] [-representative] [-h] | |
102 | |
103 OPTIONS: | |
104 | |
105 -sequence_file: | |
106 path and name of the file containing the sequences hmmer is run against | |
107 Per default, this file should be in the data directory. | |
108 -est: | |
109 set this flag if you are searching in ESTs | |
110 -protein: | |
111 set this flag if you are searching in protein sequences | |
112 -hmmset: | |
113 specifies the name of the core-ortholog set. | |
114 The program will look for the files in the default directory 'core-orthologs' unless you specify | |
115 a different one. | |
116 -taxon: | |
117 You need to specify a default taxon name from which your ESTs or protein sequences are derived. | |
118 -refspec: | |
119 sets the reference species. Note, it has to be a species that contributed sequences | |
120 to the hmms you are using. NO DEFAULT IS SET! For a list of possible reference | |
121 taxa you can have a look at the speclist.txt file in the default core-ortholog sets | |
122 that come with this distribution. Please use the 5 letter abreviations. If you choose | |
123 to use core-orthologs were not every taxon is represented in all core-orthologs, you | |
124 can provide a comma-separated list with the preferred refspec first. The lower-ranking | |
125 reference species will only be used if a certain gene is not present in the preferred | |
126 refspecies due to alternative paths in the transitive closure to define | |
127 the core-orthologs. | |
128 CURRENTLY NO CHECK IS IMPLEMENTED! | |
129 NOTE: A BLAST-DB FOR THE REFERENCE SPECIES IS REQUIRED! | |
130 -eval_limit=<> | |
131 This options allows to set the e-value cut-off for the HMM search. | |
132 DEFAULT: 1 | |
133 -hmm: | |
134 option to provide only a single hmm to be used for the search. | |
135 Note, this file has to end with .hmm | |
136 | |
137 ### the following options should only be used when you chose to alter the default structure of the | |
138 ### hamstrsearch_local directories. Currently, this has not been extensively tested. | |
139 -fasta_file: | |
140 path and name of the file containing the core-ortholog sequences | |
141 you don't have to use this option when you | |
142 -hmmpath: | |
143 sets the path to the hmm_dir. By default this is set to the current directory. | |
144 -blastpath: | |
145 sets the path where the blast-dbs are located. Per default this is ../blast_dir | |
146 Note, the program expects the structure blastpath/refspec/refspec_prot. | |
147 To overrule this structure you will have to edit the script. | |
148 \n\n"; | |
149 GetOptions ("h" => \$help, | |
150 "hmm=s" => \$hmm, | |
151 "est" => \$estflag, | |
152 "protein"=> \$proteinflag, | |
153 "sequence_file=s" => \$dbfile, | |
154 "fasta_file=s" => \$fafile, | |
155 "hmmset=s" => \$hmmset, | |
156 "hmmpath=s" => \$hmmpath, | |
157 "taxon_file=s" => \$taxon_file, | |
158 "taxon=s" => \$taxon_global, | |
159 "eval_limit=s" => \$eval, | |
160 "refspec=s" => \$refspec_string, | |
161 "estfile=s" => \$estfile, | |
162 "representative" => \$rep, | |
163 "blastpath=s" => \$blastpath, | |
164 "galaxyout=s" => \$seqs2store_file, | |
165 "2galaxyout=s" => \$cds2store_file); | |
166 | |
167 if ($help) { | |
168 print $helpmessage; | |
169 exit; | |
170 } | |
171 | |
172 ## 1) check if all information is available to run HaMStR | |
173 my ($check, @log) = &checkInput(); | |
174 if ($check == 0) { | |
175 print join "\n", @log; | |
176 print "$helpmessage"; | |
177 exit; | |
178 } | |
179 else { | |
180 open (OUT, ">$logfile") or die "could not open logfile $logfile\n"; | |
181 print OUT join "\n", @log; | |
182 close OUT; | |
183 } | |
184 ### read in of the core-ortholog sequences | |
185 my $co_seqs = parseSeqfile("$fafile"); | |
186 | |
187 ## 2) loop through the hmms | |
188 ## process each hmm file separately | |
189 for (my $i = 0; $i < @hmms; $i++) { | |
190 $fileobj = undef; | |
191 my @seqs = qw(); | |
192 my @newseqs = qw();## var to contain the sequences to be added to the orthologous cluster | |
193 my @newcds = qw(); | |
194 my $hmm = $hmms[$i]; | |
195 my $hmmout = $hmm; | |
196 $hmmout =~ s/\.hmm/\.out/; | |
197 ## 3) run the hmm search | |
198 if (!(-e "$hmmsearch_dir/$hmmout")) { | |
199 print "now running $prog using $hmm\n"; | |
200 !`$prog $hmm_dir/$hmm $dbfile >$hmmsearch_dir/$hmmout` or die "Problem running hmmsearch\n"; | |
201 } | |
202 else { | |
203 print "an hmmresult $hmmout already exists. Using this one!\n"; | |
204 print "NOTE: in Galaxy the hmm results are stored in the directory of the dataset\n"; | |
205 } | |
206 | |
207 ## 4) process the hmm search result | |
208 my $hitcount = 0; | |
209 ## 4a) loop through the individual results | |
210 ## now the modified version for hmmer3 comes | |
211 my ($query_name, @results) = parseHmmer3($hmmout, $hmmsearch_dir); | |
212 if (! @results) { | |
213 print "no hit found for $query_name\n"; | |
214 next; | |
215 } | |
216 chomp $query_name; | |
217 print "Results for $query_name\n"; | |
218 for (my $k = 0; $k < @results; $k++) { | |
219 my $hitname = $results[$k]; | |
220 print "$hitname\n"; | |
221 my $keep = 0; | |
222 my $hitseq = ''; | |
223 $refseq = ''; | |
224 ## 4b) test for the reciprocity criterion fulfilled | |
225 ($keep, $hitname, $hitseq, $refspec_final, $refseq) = &check4reciprocity($query_name, $hitname, @refspec); | |
226 if ($keep == 1) { | |
227 ## blast search with the hmm hit identifies the core-ortholog sequence of the reference species | |
228 ## check for the taxon from the taxon_file.txt. | |
229 my $taxon = ''; | |
230 if ($taxon_check){ | |
231 if ($taxon_check == 1) { | |
232 $taxon = &getTaxon($hitname); | |
233 } | |
234 elsif ($taxon_check == 2) { | |
235 $taxon = $taxon_global; | |
236 } | |
237 } | |
238 ## put the info about the hits into an object for later post-processing | |
239 $fileobj->{$taxon}->{prot}->[$hitcount] = $hitseq; | |
240 $fileobj->{$taxon}->{ids}->[$hitcount] = $hitname; | |
241 $fileobj->{$taxon}->{refseq} = $refseq; | |
242 $hitcount++; | |
243 } | |
244 else { | |
245 print "match to different protein from $refspec_final\n"; | |
246 } | |
247 } | |
248 ## 5) do the rest only if at least one hit was obtained | |
249 if (defined $fileobj) { | |
250 ## 5a) if the hits are derived from ESTs, get the best ORF | |
251 if ($estflag) { | |
252 $fileobj = &predictORF(); | |
253 } | |
254 ## 5b) if the user has chosen to postprocess the results | |
255 if ($rep) { | |
256 &processHits($refseq, $concat); | |
257 } | |
258 ## 6) prepare the output | |
259 my @taxa = keys(%$fileobj); | |
260 for (my $i = 0; $i< @taxa; $i++) { | |
261 if ($rep) { | |
262 # push @newseqs, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; | |
263 #Rearrange Order for Galaxy - want species first for phylotable format convert pipes to tabs | |
264 push @newseqs, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; | |
265 push @newseqs, $fileobj->{$taxa[$i]}->{refprot}; | |
266 if ($estflag) { | |
267 # push @newcds, ">$query_name|$taxa[$i]|$fileobj->{$taxa[$i]}->{refid}"; | |
268 #Rearrange Order for Galaxy - want species first for phylotable format | |
269 push @newcds, ">$taxa[$i]\t$query_name\t$fileobj->{$taxa[$i]}->{refid}"; | |
270 push @newcds, $fileobj->{$taxa[$i]}->{refcds}; | |
271 } | |
272 } | |
273 else { | |
274 my $idobj = $fileobj->{$taxa[$i]}->{ids}; | |
275 my $protobj = $fileobj->{$taxa[$i]}->{prot}; | |
276 my $cdsobj = $fileobj->{$taxa[$i]}->{cds}; | |
277 for (my $j = 0; $j < @$idobj; $j++) { | |
278 # push @newseqs, ">$query_name|$taxa[$i]|$idobj->[$j]"; | |
279 #Rearrange Order for Galaxy - want species first for phylotable format also tabs not pipe | |
280 push @newseqs, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; | |
281 push @newseqs, $protobj->[$j]; | |
282 if ($estflag) { | |
283 # push @newcds, ">$query_name|$taxa[$i]|$idobj->[$j]"; | |
284 #Rearrange Order for Galaxy - want species first for phylotable format | |
285 push @newcds, ">$taxa[$i]\t$query_name\t$idobj->[$j]"; | |
286 push @newcds, $cdsobj->[$j]; | |
287 } | |
288 } | |
289 } | |
290 my $refs = $co_seqs->{$query_name}; | |
291 for (keys %$refs) { | |
292 my $line = ">$query_name|$_|" . $refs->{$_}->{seqid} . "\n" . $refs->{$_}->{seq}; | |
293 push @seqs, $line; | |
294 } | |
295 chomp @seqs; | |
296 print "\n"; | |
297 @seqs = (@seqs, @newseqs); | |
298 open (OUT, ">$fa_dir_neu/$query_name.fa"); | |
299 print OUT join "\n", @seqs; | |
300 print OUT "\n"; | |
301 close OUT; | |
302 for (my $i = 0; $i < @newseqs; $i+= 2) { | |
303 # my $line = $newseqs[$i] . "|" . $newseqs[$i+1]; | |
304 #Galaxy uses tabs not pipes | |
305 my $line = $newseqs[$i] . "\t" . $newseqs[$i+1]; | |
306 $line =~ s/>//; | |
307 push @seqs2store, $line; | |
308 if ($estflag) { | |
309 #Galaxy uses tabs not pipes | |
310 # my $cdsline = $newcds[$i] . "|" . $newcds[$i+1]; | |
311 my $cdsline = $newcds[$i] . "\t" . $newcds[$i+1]; | |
312 $cdsline =~ s/>//; | |
313 push @cds2store, $cdsline; | |
314 } | |
315 } | |
316 } | |
317 } | |
318 } | |
319 if (@seqs2store > 0) { | |
320 open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; | |
321 print OUT join "\n", @seqs2store; | |
322 print OUT "\n"; | |
323 close OUT; | |
324 if ($estflag) { | |
325 open (OUT, ">$cds2store_file") or die "failed to open output CDS file\n"; | |
326 print OUT join "\n", @cds2store; | |
327 print OUT "\n"; | |
328 close OUT; | |
329 } | |
330 } | |
331 else { | |
332 open (OUT, ">$seqs2store_file") or die "failed to open output SEQS file\n"; | |
333 print OUT "no hits found\n"; | |
334 } | |
335 exit; | |
336 ##################### start sub ############### | |
337 ####### checkInput performs a number of checks whether sufficient information | |
338 ### and all data are available to run HaMStR | |
339 sub checkInput { | |
340 my @log; | |
341 my $check = 1; | |
342 $dbfile_short = $dbfile; | |
343 $dbfile_short =~ s/\..*//; | |
344 ## 1) check for filetype | |
345 print "Checking for filetype:\t"; | |
346 if (!defined $estflag and !defined $proteinflag) { | |
347 push @log, "please determine the sequence type. Choose between the options -EST or -protein"; | |
348 print "failed\n"; | |
349 $check = 0; | |
350 } | |
351 else { | |
352 if ($estflag) { | |
353 $estfile = $dbfile; | |
354 $dbfile = "$dbfile.tc"; | |
355 push @log, "HaMStR will run on the ESTs in $estfile"; | |
356 push @log, "Translating ESTs"; | |
357 if (!(-e "$dbfile")) { | |
358 print "translating $estfile, this may take a while\n"; | |
359 `translate.pl -in=$estfile -out=$dbfile`; | |
360 open (LOG, "$logfile") or die "could not open logfile $logfile\n"; | |
361 my @info = <LOG>; | |
362 @log = (@log, @info); | |
363 close LOG; | |
364 } | |
365 else { | |
366 push @log, "Translated file already exists, using this one\n"; | |
367 } | |
368 if (! -e "$dbfile") { | |
369 push @log, "The translation of $estfile failed. Check the script translate.pl"; | |
370 print "failed\n"; | |
371 $check = 0; | |
372 } | |
373 } | |
374 else { | |
375 ## file type is protein | |
376 print "succeeded\n"; | |
377 } | |
378 } | |
379 ## 2) Check for presence of blastall | |
380 print "Checking for the blast program\t"; | |
381 if (!(`blastall`)) { | |
382 push @log, "could not execute blastall. Please check if this program is installed and executable"; | |
383 print "failed\n"; | |
384 $check = 0; | |
385 } | |
386 else { | |
387 push @log, "check for blastall succeeded"; | |
388 print "succeeded\n"; | |
389 } | |
390 ## 3) Check for presence of hmmsearch | |
391 print "Checking for hmmsearch\t"; | |
392 if (! `$prog -h`) { | |
393 push @log, "could not execute $prog. Please check if this program is installed and executable"; | |
394 print "failed\n"; | |
395 $check = 0; | |
396 } | |
397 else { | |
398 push @log, "check for $prog succeeded\n"; | |
399 print "succeeded\n"; | |
400 } | |
401 ## 4) Check for reference taxon | |
402 print "Checking for reference species and blast-dbs\t"; | |
403 if (!(defined $refspec_string)) { | |
404 push @log, "Please provide a reference species for the reblast!"; | |
405 print "failed\n"; | |
406 $check = 0; | |
407 } | |
408 else { | |
409 push @log, "Reference species for the re-blast: $refspec_string"; | |
410 @refspec = split /,/, $refspec_string; | |
411 $refspec_name = $refspec[0]; | |
412 print "succeeded\n"; | |
413 } | |
414 ## 5) Check for presence of the required blast dbs | |
415 print "checking for blast-dbs:\t"; | |
416 for (my $i = 0; $i < @refspec; $i++) { | |
417 my $blastpathtmp = "$blastpath/$refspec[$i]/$refspec[$i]" . "_prot"; | |
418 if (! (-e "$blastpathtmp.pin")) { | |
419 push @log, "please edit the blastpath. Could not find $blastpathtmp"; | |
420 print "$blastpathtmp failed\n"; | |
421 $check = 0; | |
422 } | |
423 else { | |
424 push @log, "check for $blastpathtmp succeeded"; | |
425 print "succeeded\n"; | |
426 } | |
427 } | |
428 ## 6) Check for presence of the directory structure | |
429 print "checking for presence of the hmm files:\t"; | |
430 if (!(defined $hmmset)) { | |
431 $hmmpath = '.'; | |
432 $hmmset = 'manual'; | |
433 } | |
434 else { | |
435 $hmmpath = "$hmmpath/$hmmset"; | |
436 $fafile = "$hmmpath/$hmmset" . '.fa'; | |
437 } | |
438 $hmm_dir = "$hmmpath/$hmm_dir"; | |
439 $hmmsearch_dir = $dbfile_short . '_' . $hmmset; | |
440 | |
441 #CHANGED FOR GALAXY DIRECTORY | |
442 # $fa_dir_neu = 'fa_dir_' . $dbfile_short . '_' . $hmmset . '_' . $refspec_name; | |
443 $fa_dir_neu = $dbfile_short . '_' . $hmmset . '_' . $refspec_name; | |
444 ## 7) check for the presence of the hmm-files and the fasta-file | |
445 if (!(-e "$hmm_dir")) { | |
446 push @log, "Could not find $hmm_dir"; | |
447 print "failed\n"; | |
448 $check = 0; | |
449 } | |
450 else { | |
451 if (defined $hmm) { | |
452 if (! -e "$hmm_dir/$hmm") { | |
453 push @log, "$hmm has been defined but could not be found in $hmm_dir/$hmm"; | |
454 $check = 0; | |
455 } | |
456 else { | |
457 push @log, "$hmm has been found"; | |
458 if ($hmm =~ /\.hmm$/) { | |
459 @hmms = ($hmm); | |
460 } | |
461 } | |
462 } | |
463 else { | |
464 push @log, "running HaMStR with all hmms in $hmm_dir"; | |
465 @hmms = `ls $hmm_dir`; | |
466 } | |
467 chomp @hmms; | |
468 print "succeeded\n"; | |
469 } | |
470 | |
471 ## 8) Test for presence of the fasta file containing the sequences of the core-ortholog cluster | |
472 print "checking for presence of the core-ortholog file:\t"; | |
473 if (defined $fafile) { | |
474 if (! -e "$fafile") { | |
475 push @log, "Could not find the file $fafile"; | |
476 print "failed\n"; | |
477 $check = 0; | |
478 } | |
479 else { | |
480 push @log, "check for $fafile succeeded"; | |
481 print "succeeded\n"; | |
482 } | |
483 } | |
484 else { | |
485 push @log, "Please provide path and name of fasta file containing the core-ortholog sequences"; | |
486 $check = 0; | |
487 print "failed\n"; | |
488 } | |
489 ## 9) Checks for the taxon_file | |
490 print "testing whether the taxon has been determined:\t"; | |
491 if (!(defined $taxon_file) or (!(-e "$taxon_file"))) { | |
492 if (defined $taxon_global) { | |
493 push @log, "using default taxon $taxon_global for all sequences"; | |
494 print "succeeded\n"; | |
495 $taxon_check = 2; | |
496 } | |
497 else { | |
498 push @log, "No taxon_file found. Please provide a global taxon name using the option -taxon"; | |
499 print "failed\n"; | |
500 $check = 0; | |
501 } | |
502 } | |
503 else { | |
504 push @log, "using the file $taxon_file as taxon_file"; | |
505 print "succeeded\n"; | |
506 $taxon_check = 1; | |
507 } | |
508 ## 10) Set the file where the matched seqs are found | |
509 #CHANGED BY THO FOR GALAXY TO ALLOW DETERMINATION OF OUTPUT FILE Made INPUT Option | |
510 # $seqs2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '.out'; | |
511 # $cds2store_file = 'hamstrsearch_' . $dbfile_short . '_' . $hmmset . '_cds.out'; | |
512 | |
513 ## 11) apply the evalue-cut-off to the hmmsearch program | |
514 $prog = $prog . " -E $eval"; | |
515 push @log, "hmmsearch: $prog"; | |
516 | |
517 ## 12) setting up the directories where the output files will be put into. | |
518 if ($check == 1) { | |
519 if (!(-e "$hmmsearch_dir")) { | |
520 `mkdir $hmmsearch_dir`; | |
521 } | |
522 if (!(-e "$fa_dir_neu")) { | |
523 `mkdir $fa_dir_neu`; | |
524 } | |
525 if (!(-e "$tmpdir")) { | |
526 `mkdir $tmpdir`; | |
527 } | |
528 } | |
529 return ($check, @log); | |
530 } | |
531 ################# | |
532 ## check4reciprocity is the second major part of the program. It checks | |
533 ## whether the protein sequence that has been identified by the hmmsearch | |
534 ## identifies in turn the protein from the reference taxon that was used to | |
535 ## build the hmm. | |
536 sub check4reciprocity { | |
537 my ($query_name, $hitname, @refspec) = @_; | |
538 my $searchdb; | |
539 ## get the sequence from the db_file | |
540 my $hitseq = `grep -m 1 -A 1 ">$hitname" $dbfile | tail -n 1`; | |
541 if (!defined $hitseq) { | |
542 print "could not retrieve a sequence for $hitname. Skipping...\n"; | |
543 return(0, '', '', ''); | |
544 } | |
545 else { | |
546 ## now get the sequence used for building the hmm | |
547 my @original; | |
548 my $refspec_final; | |
549 for (my $i = 0; $i < @refspec; $i++) { | |
550 @original = `grep "^>$query_name|$refspec[$i]" $fafile |sed -e "s/.*$refspec[$i]\|//"`; | |
551 chomp @original; | |
552 | |
553 if (@original > 0) { | |
554 $refspec_final = $refspec[$i]; | |
555 $searchdb = "$blastpath/$refspec_final/$refspec_final" . "_prot"; | |
556 last; | |
557 } | |
558 else { | |
559 print "original sequence not be found with grepping for ^>$query_name|$refspec[$i]. Proceeding with next refspec\n"; | |
560 } | |
561 } | |
562 if (@original == 0) { | |
563 print "original sequence not be found\n"; | |
564 return (0, '', '', $refspec_final); | |
565 } | |
566 print "REFSPEC is $refspec_final\n"; | |
567 ## continue with the blast | |
568 chomp $hitseq; | |
569 # $hitname =~ s/\|.*//; | |
570 ## now run the blast | |
571 open (OUT, ">$tmpdir/$$.fa") or die "could not open out for writing\n"; | |
572 print OUT ">$hitname\n$hitseq"; | |
573 close OUT; | |
574 !`blastall -p blastp -d $searchdb -v 10 -b 10 -i $tmpdir/$$.fa -o $tmpdir/$$.blast` or die "Problem running blast\n"; | |
575 ## now parse the best blast hit | |
576 my @hits = &getBestBlasthit("$tmpdir/$$.blast"); | |
577 if (@hits > 0) { | |
578 | |
579 print "hmm-seq: ", join "\t", @original , "\n"; | |
580 ## now loop through the best hits with the same evalue and check whether | |
581 ## among these I find the same seq as in $original | |
582 for (my $i = 0; $i <@hits; $i++) { | |
583 print "blast-hit: $hits[$i]"; | |
584 ## now loop through all the refspec-sequences in the hmm file | |
585 for (my $j = 0; $j < @original; $j++) { | |
586 if ($original[$j] eq $hits[$i]) { | |
587 print "\tHit\n"; | |
588 my ($refseq) = `grep -A 1 "$query_name|$refspec_final|$original[$j]" $fafile |tail -n 1`; | |
589 return (1, $hitname, $hitseq, $refspec_final, $refseq); | |
590 } | |
591 else { | |
592 print "\nnot hitting $original[$j]\n"; | |
593 } | |
594 } | |
595 } | |
596 ### if we end up here, we didn't find a hit that matches to the original sequence | |
597 ### in the top hits with the same eval | |
598 return (0, '', '', $refspec_final); | |
599 } | |
600 else { | |
601 print "no hit obtained\n"; | |
602 return(0, '', '', $refspec_final); | |
603 } | |
604 } | |
605 } | |
606 ############# | |
607 sub getBestBlasthit { | |
608 my @hits; | |
609 my ($file) = @_; | |
610 my $searchio = Bio::SearchIO->new(-file => $file, | |
611 -format => 'blast', | |
612 -report_type => 'blastp') or die "parse failed"; | |
613 while( my $result = $searchio->next_result ){ | |
614 my $count = 0; | |
615 my $sig; | |
616 my $sig_old; | |
617 while( my $hit = $result->next_hit){ | |
618 ## now I enter all top hits having the same evalue into the result | |
619 $sig = $hit->score; | |
620 if (!defined $sig_old) { | |
621 $sig_old = $sig; | |
622 } | |
623 if ($sig == $sig_old) { | |
624 push @hits, $hit->accession; | |
625 } | |
626 else { | |
627 last; | |
628 } | |
629 } | |
630 } | |
631 return(@hits); | |
632 } | |
633 ################## | |
634 sub getTaxon { | |
635 my ($hitname) = @_; | |
636 # my $q = "select name from taxon t, est_project e, est_info i, annotation_neu a where a.id = $hitname and a.contig_id = i.contig_id and i.project_id = e.project_id and e.taxon_id = t.taxon_id"; | |
637 if ($hitname =~ /\D/) { | |
638 $hitname =~ s/_.*//; | |
639 } | |
640 my $taxon = `grep -m 1 "^$hitname," $taxon_file | sed -e 's/^.*,//'`; | |
641 chomp $taxon; | |
642 $taxon =~ s/^[0-9]+,//; | |
643 $taxon =~ s/\s*$//; | |
644 $taxon =~ s/\s/_/g; | |
645 if ($taxon) { | |
646 return ($taxon); | |
647 } | |
648 else { | |
649 return(); | |
650 } | |
651 } | |
652 ############### | |
653 sub processHits { | |
654 my ($concat) = @_; | |
655 ## 1) align all hit sequences for a taxon against the reference species | |
656 my @taxa = keys(%$fileobj); | |
657 for (my $i = 0; $i < @taxa; $i++) { | |
658 &orfRanking($taxa[$i]); | |
659 } | |
660 } | |
661 | |
662 ################ | |
663 sub predictORF { | |
664 my $fileobj_new; | |
665 # my ($refseq) = @_; | |
666 my @taxa = keys(%$fileobj); | |
667 for (my $i = 0; $i < @taxa; $i++) { | |
668 my $protobj = $fileobj->{$taxa[$i]}->{prot}; | |
669 my $idobj = $fileobj->{$taxa[$i]}->{ids}; | |
670 my $refseq = $fileobj->{$taxa[$i]}->{refseq}; | |
671 my @ids = @$idobj; | |
672 for (my $j = 0; $j < @ids; $j++) { | |
673 ## determine the reading frame | |
674 my ($rf) = $ids[$j] =~ /.*_RF([0-9]+)/; | |
675 print "rf is $rf\n"; | |
676 $ids[$j] =~ s/_RF.*//; | |
677 # my $est = `grep -A 1 "$ids[$j]" $estfile |tail -n 1`; | |
678 ################new grep command from version 8 to fix bug | |
679 my $est = `grep -A 1 ">$ids[$j]\\b" $estfile |tail -n 1`; | |
680 if (! $est) { | |
681 die "error in retrieval of est sequence for $ids[$j] in subroutine processHits\n"; | |
682 } | |
683 ## the EST is translated in rev complement | |
684 if ($rf > 3) { | |
685 $est = revComp($est); | |
686 } | |
687 | |
688 my $gw = run_genewise->new($est, $refseq, "$tmpdir"); | |
689 my $translation = $gw->translation; | |
690 my $cds = $gw->codons; | |
691 $translation =~ s/[-!]//g; | |
692 $fileobj_new->{$taxa[$i]}->{ids}->[$j] = $ids[$j]; | |
693 $fileobj_new->{$taxa[$i]}->{prot}->[$j] = $translation; | |
694 $fileobj_new->{$taxa[$i]}->{cds}->[$j] = $cds; | |
695 $fileobj_new->{$taxa[$i]}->{refseq} = $refseq; | |
696 } | |
697 } | |
698 return($fileobj_new); | |
699 } | |
700 ############################ | |
701 sub orfRanking { | |
702 my ($spec) = @_; | |
703 my $result; | |
704 my $refprot; | |
705 my $refcds; | |
706 my @toalign; | |
707 my $protobj = $fileobj->{$spec}->{prot}; | |
708 my $idobj = $fileobj->{$spec}->{ids}; | |
709 my $refcluster; ## variables to take the cluster and its id for later analysis | |
710 my $refid; | |
711 if (@$protobj == 1) { | |
712 ## nothing to chose from | |
713 $refprot = $protobj->[0]; | |
714 $refcds = $fileobj->{$spec}->{cds}->[0]; | |
715 my $length = length($refprot); | |
716 $refid = $idobj->[0] . "-" . $length; | |
717 } | |
718 else { | |
719 ## more than one cluster | |
720 push @toalign, ">$refspec_final"; | |
721 push @toalign, $fileobj->{$spec}->{refseq}; | |
722 ## now walk through all the contigs | |
723 for (my $i = 0; $i < @$protobj; $i++) { | |
724 my @testseq = (">$idobj->[$i]", $protobj->[$i]); | |
725 @testseq = (@testseq, @toalign); | |
726 open (OUT, ">$tmpdir/$pid.ref.fa") or die "could not open file for writing refseqs\n"; | |
727 print OUT join "\n", @testseq; | |
728 close OUT; | |
729 ## run clustalw | |
730 !(`$alignmentprog $tmpdir/$pid.ref.fa -output=fasta -outfile=$tmpdir/$pid.ref.aln 2>&1 >$tmpdir/$pid.ref.log`) or die "error running clustalw\n"; | |
731 ## get the alignment score | |
732 $result->[$i]->{score} = `grep "Alignment Score" $tmpdir/$pid.ref.log |sed -e 's/[^0-9]//g'`; | |
733 if (!$result->[$i]->{score}) { | |
734 die "error in determining alignment score\n"; | |
735 } | |
736 chomp $result->[$i]->{score}; | |
737 ## get the aligned sequence | |
738 open (ALN, "$tmpdir/$pid.ref.aln") or die "failed to open alignment file\n"; | |
739 my @aln = <ALN>; | |
740 close ALN; | |
741 my $aseq = extractSeq($idobj->[$i], @aln); | |
742 ## remove the terminal gaps | |
743 $aseq =~ s/-*$//; | |
744 $result->[$i]->{aend} = length $aseq; | |
745 my ($head) = $aseq =~ /^(-*).*/; | |
746 ($result->[$i]->{astart}) = length($head)+1; | |
747 } | |
748 ### the results for all seqs has been gathered, now order them | |
749 $result = sortRef($result); | |
750 ($refprot, $refcds, $refid) = &determineRef($result,$spec); | |
751 } | |
752 $fileobj->{$spec}->{refprot} = $refprot; | |
753 $fileobj->{$spec}->{refcds} = $refcds; | |
754 $fileobj->{$spec}->{refid} = $refid; | |
755 return(); | |
756 } | |
757 ########################### | |
758 sub sortRef { | |
759 my $result = shift; | |
760 my @sort; | |
761 for (my $i = 0; $i < @$result; $i++) { | |
762 push @sort, "$i,$result->[$i]->{astart},$result->[$i]->{aend},$result->[$i]->{score}"; | |
763 } | |
764 open (OUT, ">$tmpdir/$pid.sort") or die "failed to write for sorting\n"; | |
765 print OUT join "\n", @sort; | |
766 close OUT; | |
767 `sort -n -t ',' -k 2 $tmpdir/$pid.sort >$tmpdir/$pid.sort.out`; | |
768 @sort = `less $tmpdir/$pid.sort`; | |
769 chomp @sort; | |
770 $result = undef; | |
771 for (my $i = 0; $i < @sort; $i++) { | |
772 ($result->[$i]->{id}, $result->[$i]->{start}, $result->[$i]->{end}, $result->[$i]->{score}) = split ',', $sort[$i]; | |
773 } | |
774 return($result); | |
775 } | |
776 ######################## | |
777 sub determineRef { | |
778 my ($result, $spec) = @_; | |
779 my $lastend = 0; | |
780 my $lastscore = 0; | |
781 my $final; | |
782 my $count = 0; | |
783 my $id = ''; | |
784 for (my $i = 0; $i < @$result; $i++) { | |
785 if ($result->[$i]->{start} < $lastend or $lastend == 0) { | |
786 if ($result->[$i]->{score} > $lastscore) { | |
787 $lastend = $result->[$i]->{end}; | |
788 $lastscore = $result->[$i]->{score}; | |
789 $id = $result->[$i]->{id}; | |
790 } | |
791 } | |
792 elsif ($result->[$i]->{start} > $lastend) { | |
793 ## a new part of the alignment is covered. Fix the results obtained so far | |
794 $final->[$count]->{id} = $id; | |
795 $lastend = $result->[$i]->{end}; | |
796 $id = $result->[$i]->{id}; | |
797 $count++; | |
798 } | |
799 } | |
800 $final->[$count]->{id} = $id; | |
801 ## now concatenate the results | |
802 my $refprot = ''; | |
803 my $refid = ''; | |
804 my $refcds = ''; | |
805 for (my $i = 0; $i < @$final; $i++) { | |
806 my $seq = $fileobj->{$spec}->{prot}->[$final->[$i]->{id}]; | |
807 my $cdsseq = $fileobj->{$spec}->{cds}->[$final->[$i]->{id}]; | |
808 my $length = length($seq); | |
809 $refid .= "$fileobj->{$spec}->{ids}->[$final->[$i]->{id}]-$length" . "PP"; | |
810 $refprot .= $seq; | |
811 if ($estflag) { | |
812 $refcds .= $cdsseq; | |
813 } | |
814 } | |
815 $refid =~ s/PP$//; | |
816 return($refprot, $refcds, $refid); | |
817 } | |
818 ############################# | |
819 sub extractSeq { | |
820 my ($id, @aln) = @_; | |
821 my $seq = ''; | |
822 my $start = 0; | |
823 for (my $i = 0; $i < @aln; $i++) { | |
824 if ($aln[$i] =~ $id) { | |
825 $start = 1; | |
826 } | |
827 elsif ($aln[$i] =~ />/ and $start == 1) { | |
828 last; | |
829 } | |
830 elsif ($start == 1) { | |
831 $seq .= $aln[$i]; | |
832 } | |
833 } | |
834 $seq =~ s/\s//g; | |
835 return ($seq); | |
836 } | |
837 ############################## | |
838 sub revComp { | |
839 my ($seq) = @_; | |
840 $seq =~ tr/AGCTYRKMWSagct/TCGARYMKWSTCGA/; | |
841 $seq = reverse($seq); | |
842 return($seq); | |
843 } | |
844 ############################## | |
845 sub parseHmmer3 { | |
846 my ($file, $path) = @_; | |
847 if (!defined $path) { | |
848 $path = '.'; | |
849 } | |
850 open (IN, "$path/$file") or die "failed to open $file\n"; | |
851 my @data = <IN>; | |
852 close IN; | |
853 ### extract the hits | |
854 my @hit; | |
855 my $start = 0; | |
856 my $stop = 0; | |
857 my $i = 0; | |
858 for (my $i = 0; $i < @data; $i++) { | |
859 if (!($data[$i] =~ /\S/)) { | |
860 next; | |
861 } | |
862 else { | |
863 if ($data[$i] =~ /Scores for complete sequence/) { | |
864 $start = 1; | |
865 $i += 4; | |
866 } | |
867 elsif (($data[$i] =~ /inclusion threshold/) or ($data[$i] =~ /Domain and alignment/)) { | |
868 last; | |
869 } | |
870 if ($start == 1 and $stop == 0) { | |
871 $data[$i] =~ s/^\s+//; | |
872 my @list = split /\s+/, $data[$i]; | |
873 push @hit, $list[8]; | |
874 $start = 0; #Added by THO | |
875 } | |
876 } | |
877 } | |
878 ### get the query_id | |
879 my ($query) = grep /^Query:/, @data; | |
880 $query =~ s/^Query:\s+//; | |
881 $query =~ s/\s.*//; | |
882 if (defined $hit[0]) { | |
883 chomp @hit; | |
884 return ($query, @hit); | |
885 } | |
886 else { | |
887 return ($query); | |
888 } | |
889 } | |
890 ##################### | |
891 sub parseSeqfile { | |
892 my $seqref; | |
893 my $id; | |
894 my $spec; | |
895 my $seqid; | |
896 my $seq; | |
897 my $file = shift; | |
898 open (IN, "$file") or die "failed to open $file\n"; | |
899 my @seqs = <IN>; | |
900 close IN; | |
901 chomp @seqs; | |
902 for (my $i = 0; $i < @seqs; $i++) { | |
903 if ($seqs[$i] =~ />/) { | |
904 $seqs[$i] =~ s/>//; | |
905 if (defined $id and defined $seq) { | |
906 $seqref->{$id}->{$spec}->{seqid} = $seqid; | |
907 $seqref->{$id}->{$spec}->{seq} = $seq; | |
908 $seq = undef; | |
909 } | |
910 ($id, $spec, $seqid) = split (/\|/, $seqs[$i]); | |
911 } | |
912 else { | |
913 $seq .= $seqs[$i]; | |
914 } | |
915 } | |
916 if (defined $id and defined $seq) { | |
917 $seqref->{$id}->{$spec}->{seqid} = $seqid; | |
918 $seqref->{$id}->{$spec}->{seq} = $seq; | |
919 $seq = undef; | |
920 } | |
921 return ($seqref); | |
922 } |