comparison tools/protein_analysis/rxlr_motifs.py @ 18:eb6ac44d4b8e draft

Suite v0.2.8, record Promoter 2 verion + misc internal updates
author peterjc
date Tue, 01 Sep 2015 09:56:36 -0400
parents 7de64c8b258d
children f3ecd80850e2
comparison
equal deleted inserted replaced
17:e6cc27d182a8 18:eb6ac44d4b8e
38 """ 38 """
39 import os 39 import os
40 import sys 40 import sys
41 import re 41 import re
42 import subprocess 42 import subprocess
43 from seq_analysis_utils import stop_err, fasta_iterator 43 from seq_analysis_utils import sys_exit, fasta_iterator
44
45 if "-v" in sys.argv:
46 print("RXLR Motifs v0.0.10")
47 sys.exit(0)
44 48
45 if len(sys.argv) != 5: 49 if len(sys.argv) != 5:
46 stop_err("Requires four arguments: protein FASTA filename, threads, model, and output filename") 50 sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename")
47 51
48 fasta_file, threads, model, tabular_file = sys.argv[1:] 52 fasta_file, threads, model, tabular_file = sys.argv[1:]
49 hmm_output_file = tabular_file + ".hmm.tmp" 53 hmm_output_file = tabular_file + ".hmm.tmp"
50 signalp_input_file = tabular_file + ".fasta.tmp" 54 signalp_input_file = tabular_file + ".fasta.tmp"
51 signalp_output_file = tabular_file + ".tabular.tmp" 55 signalp_output_file = tabular_file + ".tabular.tmp"
52 min_signalp_hmm = 0.9 56 min_signalp_hmm = 0.9
53 hmmer_search = "hmmsearch2" 57 hmmer_search = "hmmsearch2"
54 58
55 if model == "Bhattacharjee2006": 59 if model == "Bhattacharjee2006":
56 signalp_trunc = 70 60 signalp_trunc = 70
57 re_rxlr = re.compile("R.LR") 61 re_rxlr = re.compile("R.LR")
58 min_sp = 10 62 min_sp = 10
59 max_sp = 40 63 max_sp = 40
60 max_sp_rxlr = 100 64 max_sp_rxlr = 100
61 min_rxlr_start = 1 65 min_rxlr_start = 1
62 #Allow signal peptide to be at most 40aa, and want RXLR to be 66 # Allow signal peptide to be at most 40aa, and want RXLR to be
63 #within 100aa, therefore for the prescreen the max start is 140: 67 # within 100aa, therefore for the prescreen the max start is 140:
64 max_rxlr_start = max_sp + max_sp_rxlr 68 max_rxlr_start = max_sp + max_sp_rxlr
65 elif model == "Win2007": 69 elif model == "Win2007":
66 signalp_trunc = 70 70 signalp_trunc = 70
67 re_rxlr = re.compile("R.LR") 71 re_rxlr = re.compile("R.LR")
68 min_sp = 10 72 min_sp = 10
69 max_sp = 40 73 max_sp = 40
70 min_rxlr_start = 30 74 min_rxlr_start = 30
71 max_rxlr_start = 60 75 max_rxlr_start = 60
72 #No explicit limit on separation of signal peptide clevage 76 # No explicit limit on separation of signal peptide clevage
73 #and RXLR, but shortest signal peptide is 10, and furthest 77 # and RXLR, but shortest signal peptide is 10, and furthest
74 #away RXLR is 60, so effectively limit is 50. 78 # away RXLR is 60, so effectively limit is 50.
75 max_sp_rxlr = max_rxlr_start - min_sp + 1 79 max_sp_rxlr = max_rxlr_start - min_sp + 1
76 elif model == "Whisson2007": 80 elif model == "Whisson2007":
77 signalp_trunc = 0 #zero for no truncation 81 signalp_trunc = 0 # zero for no truncation
78 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]") 82 re_rxlr = re.compile("R.LR.{,40}[ED][ED][KR]")
79 min_sp = 10 83 min_sp = 10
80 max_sp = 40 84 max_sp = 40
81 max_sp_rxlr = 100 85 max_sp_rxlr = 100
82 min_rxlr_start = 1 86 min_rxlr_start = 1
83 max_rxlr_start = max_sp + max_sp_rxlr 87 max_rxlr_start = max_sp + max_sp_rxlr
84 else: 88 else:
85 stop_err("Did not recognise the model name %r\n" 89 sys_exit("Did not recognise the model name %r\n"
86 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) 90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
87 91
88 92
89 def get_hmmer_version(exe, required=None): 93 def get_hmmer_version(exe, required=None):
90 cmd = "%s -h" % exe 94 cmd = "%s -h" % exe
106 #Run hmmsearch for Whisson et al. (2007) 110 #Run hmmsearch for Whisson et al. (2007)
107 if model == "Whisson2007": 111 if model == "Whisson2007":
108 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], 112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
109 "whisson_et_al_rxlr_eer_cropped.hmm") 113 "whisson_et_al_rxlr_eer_cropped.hmm")
110 if not os.path.isfile(hmm_file): 114 if not os.path.isfile(hmm_file):
111 stop_err("Missing HMM file for Whisson et al. (2007)") 115 sys_exit("Missing HMM file for Whisson et al. (2007)")
112 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): 116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
113 stop_err("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_searcher) 117 sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)
114 118
115 hmm_hits = set() 119 hmm_hits = set()
116 valid_ids = set() 120 valid_ids = set()
117 for title, seq in fasta_iterator(fasta_file): 121 for title, seq in fasta_iterator(fasta_file):
118 name = title.split(None,1)[0] 122 name = title.split(None,1)[0]
119 if name in valid_ids: 123 if name in valid_ids:
120 stop_err("Duplicated identifier %r" % name) 124 sys_exit("Duplicated identifier %r" % name)
121 else: 125 else:
122 valid_ids.add(name) 126 valid_ids.add(name)
123 if not valid_ids: 127 if not valid_ids:
124 #Special case, don't need to run HMMER if there are no sequences 128 # Special case, don't need to run HMMER if there are no sequences
125 pass 129 pass
126 else: 130 else:
127 #I've left the code to handle HMMER 3 in situ, in case 131 # I've left the code to handle HMMER 3 in situ, in case
128 #we revisit the choice to insist on HMMER 2. 132 # we revisit the choice to insist on HMMER 2.
129 hmmer3 = (3 == get_hmmer_version(hmmer_search)) 133 hmmer3 = (3 == get_hmmer_version(hmmer_search))
130 #Using zero (or 5.6?) for bitscore threshold 134 # Using zero (or 5.6?) for bitscore threshold
131 if hmmer3: 135 if hmmer3:
132 #The HMMER3 table output is easy to parse 136 # The HMMER3 table output is easy to parse
133 #In HMMER3 can't use both -T and -E 137 # In HMMER3 can't use both -T and -E
134 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ 138 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \
135 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) 139 % (hmmer_search, hmm_output_file, hmm_file, fasta_file)
136 else: 140 else:
137 #For HMMER2 we are stuck with parsing stdout 141 # For HMMER2 we are stuck with parsing stdout
138 #Put 1e6 to effectively have no expectation threshold (otherwise 142 # Put 1e6 to effectively have no expectation threshold (otherwise
139 #HMMER defaults to 10 and the calculated e-value depends on the 143 # HMMER defaults to 10 and the calculated e-value depends on the
140 #input FASTA file, and we can loose hits of interest). 144 # input FASTA file, and we can loose hits of interest).
141 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ 145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \
142 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) 146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
143 return_code = os.system(cmd) 147 return_code = os.system(cmd)
144 if return_code: 148 if return_code:
145 stop_err("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) 149 sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
146 150
147 handle = open(hmm_output_file) 151 handle = open(hmm_output_file)
148 for line in handle: 152 for line in handle:
149 if not line.strip(): 153 if not line.strip():
150 #We expect blank lines in the HMMER2 stdout 154 # We expect blank lines in the HMMER2 stdout
151 continue 155 continue
152 elif line.startswith("#"): 156 elif line.startswith("#"):
153 #Header 157 # Header
154 continue 158 continue
155 else: 159 else:
156 name = line.split(None,1)[0] 160 name = line.split(None,1)[0]
157 #Should be a sequence name in the HMMER3 table output. 161 #Should be a sequence name in the HMMER3 table output.
158 #Could be anything in the HMMER2 stdout. 162 #Could be anything in the HMMER2 stdout.
159 if name in valid_ids: 163 if name in valid_ids:
160 hmm_hits.add(name) 164 hmm_hits.add(name)
161 elif hmmer3: 165 elif hmmer3:
162 stop_err("Unexpected identifer %r in hmmsearch output" % name) 166 sys_exit("Unexpected identifer %r in hmmsearch output" % name)
163 handle.close() 167 handle.close()
164 #if hmmer3: 168 # if hmmer3:
165 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
166 #else: 170 # else:
167 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
168 #print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) 172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
169 os.remove(hmm_output_file) 173 os.remove(hmm_output_file)
170 del valid_ids 174 del valid_ids
171 175
172 176
173 #Prepare short list of candidates containing RXLR to pass to SignalP 177 # Prepare short list of candidates containing RXLR to pass to SignalP
174 assert min_rxlr_start > 0, "Min value one, since zero based counting" 178 assert min_rxlr_start > 0, "Min value one, since zero based counting"
175 count = 0 179 count = 0
176 total = 0 180 total = 0
177 handle = open(signalp_input_file, "w") 181 handle = open(signalp_input_file, "w")
178 for title, seq in fasta_iterator(fasta_file): 182 for title, seq in fasta_iterator(fasta_file):
179 total += 1 183 total += 1
180 name = title.split(None,1)[0] 184 name = title.split(None,1)[0]
181 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) 185 match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
182 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
183 #This is a potential RXLR, depending on the SignalP results. 187 # This is a potential RXLR, depending on the SignalP results.
184 #Might as well truncate the sequence now, makes the temp file smaller 188 # Might as well truncate the sequence now, makes the temp file smaller
185 if signalp_trunc: 189 if signalp_trunc:
186 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) 190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
187 else: 191 else:
188 #Does it matter we don't line wrap? 192 # Does it matter we don't line wrap?
189 handle.write(">%s\n%s\n" % (name, seq)) 193 handle.write(">%s\n%s\n" % (name, seq))
190 count += 1 194 count += 1
191 handle.close() 195 handle.close()
192 #print "Running SignalP on %i/%i potentials." % (count, total) 196 # print "Running SignalP on %i/%i potentials." % (count, total)
193 197
194 198
195 #Run SignalP (using our wrapper script to get multi-core support etc) 199 # Run SignalP (using our wrapper script to get multi-core support etc)
196 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") 200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
197 if not os.path.isfile(signalp_script): 201 if not os.path.isfile(signalp_script):
198 stop_err("Error - missing signalp3.py script") 202 sys_exit("Error - missing signalp3.py script")
199 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) 203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
200 return_code = os.system(cmd) 204 return_code = os.system(cmd)
201 if return_code: 205 if return_code:
202 stop_err("Error %i from SignalP:\n%s" % (return_code, cmd)) 206 sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd))
203 #print "SignalP done" 207 # print "SignalP done"
208
204 209
205 def parse_signalp(filename): 210 def parse_signalp(filename):
206 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. 211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
207 212
208 For signal peptide length we use NN_Ymax_pos (minus one). 213 For signal peptide length we use NN_Ymax_pos (minus one).
215 assert len(parts)==20, repr(line) 220 assert len(parts)==20, repr(line)
216 yield parts[0], float(parts[18]), int(parts[5])-1 221 yield parts[0], float(parts[18]), int(parts[5])-1
217 handle.close() 222 handle.close()
218 223
219 224
220 #Parse SignalP results and apply the strict RXLR criteria 225 # Parse SignalP results and apply the strict RXLR criteria
221 total = 0 226 total = 0
222 tally = dict() 227 tally = dict()
223 handle = open(tabular_file, "w") 228 handle = open(tabular_file, "w")
224 handle.write("#ID\t%s\n" % model) 229 handle.write("#ID\t%s\n" % model)
225 signalp_results = parse_signalp(signalp_output_file) 230 signalp_results = parse_signalp(signalp_output_file)
228 rxlr = "N" 233 rxlr = "N"
229 name = title.split(None,1)[0] 234 name = title.split(None,1)[0]
230 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) 235 match = re_rxlr.search(seq[min_rxlr_start-1:].upper())
231 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
232 del match 237 del match
233 #This was the criteria for calling SignalP, 238 # This was the criteria for calling SignalP,
234 #so it will be in the SignalP results. 239 #so it will be in the SignalP results.
235 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() 240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
236 assert name == sp_id, "%s vs %s" % (name, sp_id) 241 assert name == sp_id, "%s vs %s" % (name, sp_id)
237 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: 242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
238 match = re_rxlr.search(seq[sp_nn_len:].upper()) 243 match = re_rxlr.search(seq[sp_nn_len:].upper())
239 if match and match.start() + 1 <= max_sp_rxlr: #1-based counting 244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting
240 rxlr_start = sp_nn_len + match.start() + 1 245 rxlr_start = sp_nn_len + match.start() + 1
241 if min_rxlr_start <= rxlr_start <= max_rxlr_start: 246 if min_rxlr_start <= rxlr_start <= max_rxlr_start:
242 rxlr = "Y" 247 rxlr = "Y"
243 if model == "Whisson2007": 248 if model == "Whisson2007":
244 #Combine the signalp with regular expression heuristic and the HMM 249 # Combine the signalp with regular expression heuristic and the HMM
245 if name in hmm_hits and rxlr == "N": 250 if name in hmm_hits and rxlr == "N":
246 rxlr = "hmm" #HMM only 251 rxlr = "hmm" # HMM only
247 elif rxlr == "N": 252 elif rxlr == "N":
248 rxlr = "neither" #Don't use N (no) 253 rxlr = "neither" # Don't use N (no)
249 elif name not in hmm_hits and rxlr == "Y": 254 elif name not in hmm_hits and rxlr == "Y":
250 rxlr = "re" #Heuristic only 255 rxlr = "re" # Heuristic only
251 #Now have a four way classifier: Y, hmm, re, neither 256 # Now have a four way classifier: Y, hmm, re, neither
252 #and count is the number of Y results (both HMM and heuristic) 257 # and count is the number of Y results (both HMM and heuristic)
253 handle.write("%s\t%s\n" % (name, rxlr)) 258 handle.write("%s\t%s\n" % (name, rxlr))
254 try: 259 try:
255 tally[rxlr] += 1 260 tally[rxlr] += 1
256 except KeyError: 261 except KeyError:
257 tally[rxlr] = 1 262 tally[rxlr] = 1
258 handle.close() 263 handle.close()
259 assert sum(tally.values()) == total 264 assert sum(tally.values()) == total
260 265
261 #Check the iterator is finished 266 # Check the iterator is finished
262 try: 267 try:
263 signalp_results.next() 268 signalp_results.next()
264 assert False, "Unexpected data in SignalP output" 269 assert False, "Unexpected data in SignalP output"
265 except StopIteration: 270 except StopIteration:
266 pass 271 pass
267 272
268 #Cleanup 273 # Cleanup
269 os.remove(signalp_input_file) 274 os.remove(signalp_input_file)
270 os.remove(signalp_output_file) 275 os.remove(signalp_output_file)
271 276
272 #Short summary to stdout for Galaxy's info display 277 # Short summary to stdout for Galaxy's info display
273 print "%s for %i sequences:" % (model, total) 278 print "%s for %i sequences:" % (model, total)
274 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems())) 279 print ", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))