comparison tools/protein_analysis/rxlr_motifs.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children e1996f0f4e85
comparison
equal deleted inserted replaced
20:a19b3ded8f33 21:238eae32483c
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 """Implements assorted RXLR motif methods from the literature 2 """Implements assorted RXLR motif methods from the literature.
3 3
4 This script takes exactly four command line arguments: 4 This script takes exactly four command line arguments:
5 * Protein FASTA filename 5 * Protein FASTA filename
6 * Number of threads 6 * Number of threads
7 * Model name (Bhattacharjee2006, Win2007, Whisson2007) 7 * Model name (Bhattacharjee2006, Win2007, Whisson2007)
8 * Output tabular filename 8 * Output tabular filename
9 9
10 The model names are: 10 The model names are:
11 11 * Bhattacharjee2006: Simple regular expression search for RXLR
12 Bhattacharjee2006: Simple regular expression search for RXLR 12 with additional requirements for positioning and signal peptide.
13 with additional requirements for positioning and signal peptide. 13 * Win2007: Simple regular expression search for RXLR, but with
14 14 different positional requirements.
15 Win2007: Simple regular expression search for RXLR, but with 15 * Whisson2007: As Bhattacharjee2006 but with a more complex regular
16 different positional requirements. 16 expression to look for RXLR-EER domain, and additionally calls HMMER.
17
18 Whisson2007: As Bhattacharjee2006 but with a more complex regular
19 expression to look for RXLR-EER domain, and additionally calls HMMER.
20 17
21 See the help text in the accompanying Galaxy tool XML file for more 18 See the help text in the accompanying Galaxy tool XML file for more
22 details including the full references. 19 details including the full references.
23 20
24 Note: 21 Note
25 22 ----
26 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0, 23 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0,
27 which is no longer available. The current release is SignalP v3.0 24 which is no longer available. The current release is SignalP v3.0
28 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for 25 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for
29 the predicted cleavage site, as this is expected to be more accurate. 26 the predicted cleavage site, as this is expected to be more accurate.
30 Also note that the HMM score values have changed from v2.0 to v3.0. 27 Also note that the HMM score values have changed from v2.0 to v3.0.
53 import sys 50 import sys
54 51
55 from seq_analysis_utils import fasta_iterator 52 from seq_analysis_utils import fasta_iterator
56 53
57 if "-v" in sys.argv: 54 if "-v" in sys.argv:
58 print("RXLR Motifs v0.0.14") 55 print("RXLR Motifs v0.0.16")
59 sys.exit(0) 56 sys.exit(0)
60 57
61 if len(sys.argv) != 5: 58 if len(sys.argv) != 5:
62 sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") 59 sys.exit(
60 "Requires four arguments: protein FASTA filename, threads, "
61 "model, and output filename"
62 )
63 63
64 fasta_file, threads, model, tabular_file = sys.argv[1:] 64 fasta_file, threads, model, tabular_file = sys.argv[1:]
65 hmm_output_file = tabular_file + ".hmm.tmp" 65 hmm_output_file = tabular_file + ".hmm.tmp"
66 signalp_input_file = tabular_file + ".fasta.tmp" 66 signalp_input_file = tabular_file + ".fasta.tmp"
67 signalp_output_file = tabular_file + ".tabular.tmp" 67 signalp_output_file = tabular_file + ".tabular.tmp"
96 max_sp = 40 96 max_sp = 40
97 max_sp_rxlr = 100 97 max_sp_rxlr = 100
98 min_rxlr_start = 1 98 min_rxlr_start = 1
99 max_rxlr_start = max_sp + max_sp_rxlr 99 max_rxlr_start = max_sp + max_sp_rxlr
100 else: 100 else:
101 sys.exit("Did not recognise the model name %r\n" 101 sys.exit(
102 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) 102 "Did not recognise the model name %r\n"
103 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model
104 )
103 105
104 106
105 def get_hmmer_version(exe, required=None): 107 def get_hmmer_version(exe, required=None):
106 try: 108 try:
107 child = subprocess.Popen([exe, "-h"], 109 child = subprocess.Popen(
108 universal_newlines=True, 110 [exe, "-h"],
109 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 111 universal_newlines=True,
112 stdout=subprocess.PIPE,
113 stderr=subprocess.PIPE,
114 )
110 except OSError: 115 except OSError:
111 raise ValueError("Could not run %s" % exe) 116 raise ValueError("Could not run %s" % exe)
112 stdout, stderr = child.communicate() 117 stdout, stderr = child.communicate()
113 if required: 118 if required:
114 return required in stdout 119 return required in stdout
120 raise ValueError("Could not determine version of %s" % exe) 125 raise ValueError("Could not determine version of %s" % exe)
121 126
122 127
123 # Run hmmsearch for Whisson et al. (2007) 128 # Run hmmsearch for Whisson et al. (2007)
124 if model == "Whisson2007": 129 if model == "Whisson2007":
125 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], 130 hmm_file = os.path.join(
126 "whisson_et_al_rxlr_eer_cropped.hmm") 131 os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm"
132 )
127 if not os.path.isfile(hmm_file): 133 if not os.path.isfile(hmm_file):
128 sys.exit("Missing HMM file for Whisson et al. (2007)") 134 sys.exit("Missing HMM file for Whisson et al. (2007)")
129 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): 135 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
130 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) 136 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)
131 137
141 # Special case, don't need to run HMMER if there are no sequences 147 # Special case, don't need to run HMMER if there are no sequences
142 pass 148 pass
143 else: 149 else:
144 # I've left the code to handle HMMER 3 in situ, in case 150 # I've left the code to handle HMMER 3 in situ, in case
145 # we revisit the choice to insist on HMMER 2. 151 # we revisit the choice to insist on HMMER 2.
146 hmmer3 = (3 == get_hmmer_version(hmmer_search)) 152 hmmer3 = 3 == get_hmmer_version(hmmer_search)
147 # Using zero (or 5.6?) for bitscore threshold 153 # Using zero (or 5.6?) for bitscore threshold
148 if hmmer3: 154 if hmmer3:
149 # The HMMER3 table output is easy to parse 155 # The HMMER3 table output is easy to parse
150 # In HMMER3 can't use both -T and -E 156 # In HMMER3 can't use both -T and -E
151 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ 157 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" % (
152 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) 158 hmmer_search,
159 hmm_output_file,
160 hmm_file,
161 fasta_file,
162 )
153 else: 163 else:
154 # For HMMER2 we are stuck with parsing stdout 164 # For HMMER2 we are stuck with parsing stdout
155 # Put 1e6 to effectively have no expectation threshold (otherwise 165 # Put 1e6 to effectively have no expectation threshold (otherwise
156 # HMMER defaults to 10 and the calculated e-value depends on the 166 # HMMER defaults to 10 and the calculated e-value depends on the
157 # input FASTA file, and we can loose hits of interest). 167 # input FASTA file, and we can loose hits of interest).
158 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ 168 cmd = "%s -T 0 -E 1e6 %s %s > %s" % (
159 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) 169 hmmer_search,
170 hmm_file,
171 fasta_file,
172 hmm_output_file,
173 )
160 return_code = os.system(cmd) 174 return_code = os.system(cmd)
161 if return_code: 175 if return_code:
162 sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) 176 sys.stderr.write("Error %i from hmmsearch:\n%s\n" % (return_code, cmd))
177 sys.exit(return_code)
163 178
164 handle = open(hmm_output_file) 179 handle = open(hmm_output_file)
165 for line in handle: 180 for line in handle:
166 if not line.strip(): 181 if not line.strip():
167 # We expect blank lines in the HMMER2 stdout 182 # We expect blank lines in the HMMER2 stdout
193 total = 0 208 total = 0
194 handle = open(signalp_input_file, "w") 209 handle = open(signalp_input_file, "w")
195 for title, seq in fasta_iterator(fasta_file): 210 for title, seq in fasta_iterator(fasta_file):
196 total += 1 211 total += 1
197 name = title.split(None, 1)[0] 212 name = title.split(None, 1)[0]
198 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) 213 match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper())
199 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 214 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
200 # This is a potential RXLR, depending on the SignalP results. 215 # This is a potential RXLR, depending on the SignalP results.
201 # Might as well truncate the sequence now, makes the temp file smaller 216 # Might as well truncate the sequence now, makes the temp file smaller
202 if signalp_trunc: 217 if signalp_trunc:
203 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) 218 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
211 226
212 # Run SignalP (using our wrapper script to get multi-core support etc) 227 # Run SignalP (using our wrapper script to get multi-core support etc)
213 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") 228 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
214 if not os.path.isfile(signalp_script): 229 if not os.path.isfile(signalp_script):
215 sys.exit("Error - missing signalp3.py script") 230 sys.exit("Error - missing signalp3.py script")
216 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) 231 cmd = "python '%s' 'euk' '%i' '%s' '%s' '%s'" % (
232 signalp_script,
233 signalp_trunc,
234 threads,
235 signalp_input_file,
236 signalp_output_file,
237 )
217 return_code = os.system(cmd) 238 return_code = os.system(cmd)
218 if return_code: 239 if return_code:
219 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) 240 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd))
220 # print "SignalP done" 241 # print "SignalP done"
221 242
222 243
223 def parse_signalp(filename): 244 def parse_signalp(filename):
224 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. 245 """Parse SignalP output, yield tuples of values.
246
247 Returns tuples of ID, HMM_Sprob_score and NN predicted signal
248 peptide length.
225 249
226 For signal peptide length we use NN_Ymax_pos (minus one). 250 For signal peptide length we use NN_Ymax_pos (minus one).
227 """ 251 """
228 handle = open(filename) 252 handle = open(filename)
229 line = handle.readline() 253 line = handle.readline()
235 handle.close() 259 handle.close()
236 260
237 261
238 # Parse SignalP results and apply the strict RXLR criteria 262 # Parse SignalP results and apply the strict RXLR criteria
239 total = 0 263 total = 0
240 tally = dict() 264 tally = {}
241 handle = open(tabular_file, "w") 265 handle = open(tabular_file, "w")
242 handle.write("#ID\t%s\n" % model) 266 handle.write("#ID\t%s\n" % model)
243 signalp_results = parse_signalp(signalp_output_file) 267 signalp_results = parse_signalp(signalp_output_file)
244 for title, seq in fasta_iterator(fasta_file): 268 for title, seq in fasta_iterator(fasta_file):
245 total += 1 269 total += 1
246 rxlr = "N" 270 rxlr = "N"
247 name = title.split(None, 1)[0] 271 name = title.split(None, 1)[0]
248 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) 272 match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper())
249 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 273 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
250 del match 274 del match
251 # This was the criteria for calling SignalP, 275 # This was the criteria for calling SignalP,
252 # so it will be in the SignalP results. 276 # so it will be in the SignalP results.
253 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() 277 sp_id, sp_hmm_score, sp_nn_len = next(signalp_results)
254 assert name == sp_id, "%s vs %s" % (name, sp_id) 278 assert name == sp_id, "%s vs %s" % (name, sp_id)
255 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: 279 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
256 match = re_rxlr.search(seq[sp_nn_len:].upper()) 280 match = re_rxlr.search(seq[sp_nn_len:].upper())
257 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting 281 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting
258 rxlr_start = sp_nn_len + match.start() + 1 282 rxlr_start = sp_nn_len + match.start() + 1
276 handle.close() 300 handle.close()
277 assert sum(tally.values()) == total 301 assert sum(tally.values()) == total
278 302
279 # Check the iterator is finished 303 # Check the iterator is finished
280 try: 304 try:
281 signalp_results.next() 305 next(signalp_results)
282 assert False, "Unexpected data in SignalP output" 306 assert False, "Unexpected data in SignalP output"
283 except StopIteration: 307 except StopIteration:
284 pass 308 pass
285 309
286 # Cleanup 310 # Cleanup
287 os.remove(signalp_input_file) 311 os.remove(signalp_input_file)
288 os.remove(signalp_output_file) 312 os.remove(signalp_output_file)
289 313
290 # Short summary to stdout for Galaxy's info display 314 # Short summary to stdout for Galaxy's info display
291 print("%s for %i sequences:" % (model, total)) 315 print("%s for %i sequences:" % (model, total))
292 print(", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))) 316 print(", ".join("%s = %i" % kv for kv in sorted(tally.items())))