comparison tools/protein_analysis/rxlr_motifs.py @ 16:7de64c8b258d draft

Uploaded v0.2.5, MIT licence, RST for README, citation information, development moved to GitHub
author peterjc
date Wed, 18 Sep 2013 06:16:58 -0400
parents a290c6d4e658
children eb6ac44d4b8e
comparison
equal deleted inserted replaced
15:6abd809cefdd 16:7de64c8b258d
109 "whisson_et_al_rxlr_eer_cropped.hmm") 109 "whisson_et_al_rxlr_eer_cropped.hmm")
110 if not os.path.isfile(hmm_file): 110 if not os.path.isfile(hmm_file):
111 stop_err("Missing HMM file for Whisson et al. (2007)") 111 stop_err("Missing HMM file for Whisson et al. (2007)")
112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): 112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) 113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher)
114 #I've left the code to handle HMMER 3 in situ, in case 114
115 #we revisit the choice to insist on HMMER 2.
116 hmmer3 = (3 == get_hmmer_version(hmmer_search))
117 #Using zero (or 5.6?) for bitscore threshold
118 if hmmer3:
119 #The HMMER3 table output is easy to parse
120 #In HMMER3 can't use both -T and -E
121 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
122 % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
123 else:
124 #For HMMER2 we are stuck with parsing stdout
125 #Put 1e6 to effectively have no expectation threshold (otherwise
126 #HMMER defaults to 10 and the calculated e-value depends on the
127 #input FASTA file, and we can loose hits of interest).
128 cmd = "%s -T 0 -E 1e6 %s %s > %s" \
129 % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
130 return_code = os.system(cmd)
131 if return_code:
132 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd))
133 hmm_hits = set() 115 hmm_hits = set()
134 valid_ids = set() 116 valid_ids = set()
135 for title, seq in fasta_iterator(fasta_file): 117 for title, seq in fasta_iterator(fasta_file):
136 name = title.split(None,1)[0] 118 name = title.split(None,1)[0]
137 if name in valid_ids: 119 if name in valid_ids:
138 stop_err("Duplicated identifier %r" % name) 120 stop_err("Duplicated identifier %r" % name)
139 else: 121 else:
140 valid_ids.add(name) 122 valid_ids.add(name)
141 handle = open(hmm_output_file) 123 if not valid_ids:
142 for line in handle: 124 #Special case, don't need to run HMMER if there are no sequences
143 if not line.strip(): 125 pass
144 #We expect blank lines in the HMMER2 stdout 126 else:
145 continue 127 #I've left the code to handle HMMER 3 in situ, in case
146 elif line.startswith("#"): 128 #we revisit the choice to insist on HMMER 2.
147 #Header 129 hmmer3 = (3 == get_hmmer_version(hmmer_search))
148 continue 130 #Using zero (or 5.6?) for bitscore threshold
131 if hmmer3:
132 #The HMMER3 table output is easy to parse
133 #In HMMER3 can't use both -T and -E
134 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
135 % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
149 else: 136 else:
150 name = line.split(None,1)[0] 137 #For HMMER2 we are stuck with parsing stdout
151 #Should be a sequence name in the HMMER3 table output. 138 #Put 1e6 to effectively have no expectation threshold (otherwise
152 #Could be anything in the HMMER2 stdout. 139 #HMMER defaults to 10 and the calculated e-value depends on the
153 if name in valid_ids: 140 #input FASTA file, and we can loose hits of interest).
154 hmm_hits.add(name) 141 cmd = "%s -T 0 -E 1e6 %s %s > %s" \
155 elif hmmer3: 142 % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
156 stop_err("Unexpected identifer %r in hmmsearch output" % name) 143 return_code = os.system(cmd)
157 handle.close() 144 if return_code:
158 #if hmmer3: 145 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
159 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 146
160 #else: 147 handle = open(hmm_output_file)
161 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 148 for line in handle:
162 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) 149 if not line.strip():
163 os.remove(hmm_output_file) 150 #We expect blank lines in the HMMER2 stdout
151 continue
152 elif line.startswith("#"):
153 #Header
154 continue
155 else:
156 name = line.split(None,1)[0]
157 #Should be a sequence name in the HMMER3 table output.
158 #Could be anything in the HMMER2 stdout.
159 if name in valid_ids:
160 hmm_hits.add(name)
161 elif hmmer3:
162 stop_err("Unexpected identifer %r in hmmsearch output" % name)
163 handle.close()
164 #if hmmer3:
165 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
166 #else:
167 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
168 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
169 os.remove(hmm_output_file)
164 del valid_ids 170 del valid_ids
165 171
166 172
167 #Prepare short list of candidates containing RXLR to pass to SignalP 173 #Prepare short list of candidates containing RXLR to pass to SignalP
168 assert min_rxlr_start > 0, "Min value one, since zero based counting" 174 assert min_rxlr_start > 0, "Min value one, since zero based counting"