comparison tools/protein_analysis/rxlr_motifs.py @ 6:a290c6d4e658

Migrated tool version 0.0.9 from old tool shed archive to new tool shed repository
author peterjc
date Tue, 07 Jun 2011 18:07:09 -0400
parents
children 7de64c8b258d
comparison
equal deleted inserted replaced
5:0f1c61998b22 6:a290c6d4e658
1 #!/usr/bin/env python
2 """Implements assorted RXLR motif methods from the literature
3
4 This script takes exactly four command line arguments:
5 * Protein FASTA filename
6 * Number of threads
7 * Model name (Bhattacharjee2006, Win2007, Whisson2007)
8 * Output tabular filename
9
10 The model names are:
11
12 Bhattacharjee2006: Simple regular expression search for RXLR
13 with additional requirements for positioning and signal peptide.
14
15 Win2007: Simple regular expression search for RXLR, but with
16 different positional requirements.
17
18 Whisson2007: As Bhattacharjee2006 but with a more complex regular
19 expression to look for RXLR-EER domain, and additionally calls HMMER.
20
21 See the help text in the accompanying Galaxy tool XML file for more
22 details including the full references.
23
24 Note:
25
26 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0,
27 which is no longer available. The current release is SignalP v3.0
28 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for
29 the predicted cleavage site, as this is expected to be more accurate.
30 Also note that the HMM score values have changed from v2.0 to v3.0.
31 Whisson et al. (2007) used SignalP v3.0 anyway.
32
33 Whisson et al. (2007) used HMMER 2.3.2, and althought their HMM model
34 can still be used with hmmsearch from HMMER 3 this this does give
35 slightly different results. We expect the hmmsearch from HMMER 2.3.2
36 (the last stable release of HMMER 2) to be present on the path under
37 the name hmmsearch2 (allowing it to co-exist with HMMER 3).
38 """
39 import os
40 import sys
41 import re
42 import subprocess
43 from seq_analysis_utils import stop_err, fasta_iterator
44
45 if len(sys.argv) != 5:
46 stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename")
47
48 fasta_file, threads, model, tabular_file = sys.argv[1:]
49 hmm_output_file = tabular_file + ".hmm.tmp"
50 signalp_input_file = tabular_file + ".fasta.tmp"
51 signalp_output_file = tabular_file + ".tabular.tmp"
52 min_signalp_hmm = 0.9
53 hmmer_search = "hmmsearch2"
54
55 if model == "Bhattacharjee2006":
56 signalp_trunc = 70
57 re_rxlr = re.compile("R.LR")
58 min_sp = 10
59 max_sp = 40
60 max_sp_rxlr = 100
61 min_rxlr_start = 1
62 #Allow signal peptide to be at most 40aa, and want RXLR to be
63 #within 100aa, therefore for the prescreen the max start is 140:
64 max_rxlr_start = max_sp + max_sp_rxlr
65 elif model == "Win2007":
66 signalp_trunc = 70
67 re_rxlr = re.compile("R.LR")
68 min_sp = 10
69 max_sp = 40
70 min_rxlr_start = 30
71 max_rxlr_start = 60
72 #No explicit limit on separation of signal peptide clevage
73 #and RXLR, but shortest signal peptide is 10, and furthest
74 #away RXLR is 60, so effectively limit is 50.
75 max_sp_rxlr = max_rxlr_start - min_sp + 1
76 elif model == "Whisson2007":
77 signalp_trunc = 0 #zero for no truncation
78 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]")
79 min_sp = 10
80 max_sp = 40
81 max_sp_rxlr = 100
82 min_rxlr_start = 1
83 max_rxlr_start = max_sp + max_sp_rxlr
84 else:
85 stop_err("Did not recognise the model name %r\n"
86 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
87
88
89 def get_hmmer_version(exe, required=None):
90 cmd = "%s -h" % exe
91 try:
92 child = subprocess.Popen([exe, "-h"], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
93 except OSError:
94 raise ValueError("Could not run %s" % exe)
95 stdout, stderr = child.communicate()
96 if required:
97 return required in stdout
98 elif "HMMER 2" in stdout:
99 return 2
100 elif "HMMER 3" in stdout:
101 return 3
102 else:
103 raise ValueError("Could not determine version of %s" % exe)
104
105
106 #Run hmmsearch for Whisson et al. (2007)
107 if model == "Whisson2007":
108 hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
109 "whisson_et_al_rxlr_eer_cropped.hmm")
110 if not os.path.isfile(hmm_file):
111 stop_err("Missing HMM file for Whisson et al. (2007)")
112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher)
114 #I've left the code to handle HMMER 3 in situ, in case
115 #we revisit the choice to insist on HMMER 2.
116 hmmer3 = (3 == get_hmmer_version(hmmer_search))
117 #Using zero (or 5.6?) for bitscore threshold
118 if hmmer3:
119 #The HMMER3 table output is easy to parse
120 #In HMMER3 can't use both -T and -E
121 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
122 % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
123 else:
124 #For HMMER2 we are stuck with parsing stdout
125 #Put 1e6 to effectively have no expectation threshold (otherwise
126 #HMMER defaults to 10 and the calculated e-value depends on the
127 #input FASTA file, and we can loose hits of interest).
128 cmd = "%s -T 0 -E 1e6 %s %s > %s" \
129 % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
130 return_code = os.system(cmd)
131 if return_code:
132 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd))
133 hmm_hits = set()
134 valid_ids = set()
135 for title, seq in fasta_iterator(fasta_file):
136 name = title.split(None,1)[0]
137 if name in valid_ids:
138 stop_err("Duplicated identifier %r" % name)
139 else:
140 valid_ids.add(name)
141 handle = open(hmm_output_file)
142 for line in handle:
143 if not line.strip():
144 #We expect blank lines in the HMMER2 stdout
145 continue
146 elif line.startswith("#"):
147 #Header
148 continue
149 else:
150 name = line.split(None,1)[0]
151 #Should be a sequence name in the HMMER3 table output.
152 #Could be anything in the HMMER2 stdout.
153 if name in valid_ids:
154 hmm_hits.add(name)
155 elif hmmer3:
156 stop_err("Unexpected identifer %r in hmmsearch output" % name)
157 handle.close()
158 #if hmmer3:
159 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
160 #else:
161 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
162 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
163 os.remove(hmm_output_file)
164 del valid_ids
165
166
167 #Prepare short list of candidates containing RXLR to pass to SignalP
168 assert min_rxlr_start > 0, "Min value one, since zero based counting"
169 count = 0
170 total = 0
171 handle = open(signalp_input_file, "w")
172 for title, seq in fasta_iterator(fasta_file):
173 total += 1
174 name = title.split(None,1)[0]
175 match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
176 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
177 #This is a potential RXLR, depending on the SignalP results.
178 #Might as well truncate the sequence now, makes the temp file smaller
179 if signalp_trunc:
180 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
181 else:
182 #Does it matter we don't line wrap?
183 handle.write(">%s\n%s\n" % (name, seq))
184 count += 1
185 handle.close()
186 #print "Running SignalP on %i/%i potentials." % (count, total)
187
188
189 #Run SignalP (using our wrapper script to get multi-core support etc)
190 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
191 if not os.path.isfile(signalp_script):
192 stop_err("Error - missing signalp3.py script")
193 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
194 return_code = os.system(cmd)
195 if return_code:
196 stop_err("Error %i from SignalP:\n%s" % (return_code, cmd))
197 #print "SignalP done"
198
199 def parse_signalp(filename):
200 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
201
202 For signal peptide length we use NN_Ymax_pos (minus one).
203 """
204 handle = open(filename)
205 line = handle.readline()
206 assert line.startswith("#ID\t"), line
207 for line in handle:
208 parts = line.rstrip("\t").split("\t")
209 assert len(parts)==20, repr(line)
210 yield parts[0], float(parts[18]), int(parts[5])-1
211 handle.close()
212
213
214 #Parse SignalP results and apply the strict RXLR criteria
215 total = 0
216 tally = dict()
217 handle = open(tabular_file, "w")
218 handle.write("#ID\t%s\n" % model)
219 signalp_results = parse_signalp(signalp_output_file)
220 for title, seq in fasta_iterator(fasta_file):
221 total += 1
222 rxlr = "N"
223 name = title.split(None,1)[0]
224 match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
225 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
226 del match
227 #This was the criteria for calling SignalP,
228 #so it will be in the SignalP results.
229 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
230 assert name == sp_id, "%s vs %s" % (name, sp_id)
231 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
232 match = re_rxlr.search(seq[sp_nn_len:].upper())
233 if match and match.start() + 1 <= max_sp_rxlr: #1-based counting
234 rxlr_start = sp_nn_len + match.start() + 1
235 if min_rxlr_start <= rxlr_start <= max_rxlr_start:
236 rxlr = "Y"
237 if model == "Whisson2007":
238 #Combine the signalp with regular expression heuristic and the HMM
239 if name in hmm_hits and rxlr == "N":
240 rxlr = "hmm" #HMM only
241 elif rxlr == "N":
242 rxlr = "neither" #Don't use N (no)
243 elif name not in hmm_hits and rxlr == "Y":
244 rxlr = "re" #Heuristic only
245 #Now have a four way classifier: Y, hmm, re, neither
246 #and count is the number of Y results (both HMM and heuristic)
247 handle.write("%s\t%s\n" % (name, rxlr))
248 try:
249 tally[rxlr] += 1
250 except KeyError:
251 tally[rxlr] = 1
252 handle.close()
253 assert sum(tally.values()) == total
254
255 #Check the iterator is finished
256 try:
257 signalp_results.next()
258 assert False, "Unexpected data in SignalP output"
259 except StopIteration:
260 pass
261
262 #Cleanup
263 os.remove(signalp_input_file)
264 os.remove(signalp_output_file)
265
266 #Short summary to stdout for Galaxy's info display
267 print "%s for %i sequences:" % (model, total)
268 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))