Mercurial > repos > peterjc > tmhmm_and_signalp
diff tools/protein_analysis/rxlr_motifs.py @ 16:7de64c8b258d draft
Uploaded v0.2.5, MIT licence, RST for README, citation information, development moved to GitHub
author | peterjc |
---|---|
date | Wed, 18 Sep 2013 06:16:58 -0400 |
parents | a290c6d4e658 |
children | eb6ac44d4b8e |
line wrap: on
line diff
--- a/tools/protein_analysis/rxlr_motifs.py Thu Apr 25 12:25:52 2013 -0400 +++ b/tools/protein_analysis/rxlr_motifs.py Wed Sep 18 06:16:58 2013 -0400 @@ -111,25 +111,7 @@ stop_err("Missing HMM file for Whisson et al. (2007)") if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) - #I've left the code to handle HMMER 3 in situ, in case - #we revisit the choice to insist on HMMER 2. - hmmer3 = (3 == get_hmmer_version(hmmer_search)) - #Using zero (or 5.6?) for bitscore threshold - if hmmer3: - #The HMMER3 table output is easy to parse - #In HMMER3 can't use both -T and -E - cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ - % (hmmer_search, hmm_output_file, hmm_file, fasta_file) - else: - #For HMMER2 we are stuck with parsing stdout - #Put 1e6 to effectively have no expectation threshold (otherwise - #HMMER defaults to 10 and the calculated e-value depends on the - #input FASTA file, and we can loose hits of interest). - cmd = "%s -T 0 -E 1e6 %s %s > %s" \ - % (hmmer_search, hmm_file, fasta_file, hmm_output_file) - return_code = os.system(cmd) - if return_code: - stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd)) + hmm_hits = set() valid_ids = set() for title, seq in fasta_iterator(fasta_file): @@ -138,29 +120,53 @@ stop_err("Duplicated identifier %r" % name) else: valid_ids.add(name) - handle = open(hmm_output_file) - for line in handle: - if not line.strip(): - #We expect blank lines in the HMMER2 stdout - continue - elif line.startswith("#"): - #Header - continue + if not valid_ids: + #Special case, don't need to run HMMER if there are no sequences + pass + else: + #I've left the code to handle HMMER 3 in situ, in case + #we revisit the choice to insist on HMMER 2. + hmmer3 = (3 == get_hmmer_version(hmmer_search)) + #Using zero (or 5.6?) for bitscore threshold + if hmmer3: + #The HMMER3 table output is easy to parse + #In HMMER3 can't use both -T and -E + cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ + % (hmmer_search, hmm_output_file, hmm_file, fasta_file) else: - name = line.split(None,1)[0] - #Should be a sequence name in the HMMER3 table output. - #Could be anything in the HMMER2 stdout. - if name in valid_ids: - hmm_hits.add(name) - elif hmmer3: - stop_err("Unexpected identifer %r in hmmsearch output" % name) - handle.close() - #if hmmer3: - # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #else: - # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) - #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) - os.remove(hmm_output_file) + #For HMMER2 we are stuck with parsing stdout + #Put 1e6 to effectively have no expectation threshold (otherwise + #HMMER defaults to 10 and the calculated e-value depends on the + #input FASTA file, and we can loose hits of interest). + cmd = "%s -T 0 -E 1e6 %s %s > %s" \ + % (hmmer_search, hmm_file, fasta_file, hmm_output_file) + return_code = os.system(cmd) + if return_code: + stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) + + handle = open(hmm_output_file) + for line in handle: + if not line.strip(): + #We expect blank lines in the HMMER2 stdout + continue + elif line.startswith("#"): + #Header + continue + else: + name = line.split(None,1)[0] + #Should be a sequence name in the HMMER3 table output. + #Could be anything in the HMMER2 stdout. + if name in valid_ids: + hmm_hits.add(name) + elif hmmer3: + stop_err("Unexpected identifer %r in hmmsearch output" % name) + handle.close() + #if hmmer3: + # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + #else: + # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) + #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) + os.remove(hmm_output_file) del valid_ids