comparison tools/protein_analysis/rxlr_motifs.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
comparison
equal deleted inserted replaced
18:eb6ac44d4b8e 19:f3ecd80850e2
38 """ 38 """
39 import os 39 import os
40 import sys 40 import sys
41 import re 41 import re
42 import subprocess 42 import subprocess
43 from seq_analysis_utils import sys_exit, fasta_iterator 43 from seq_analysis_utils import fasta_iterator
44 44
45 if "-v" in sys.argv: 45 if "-v" in sys.argv:
46 print("RXLR Motifs v0.0.10") 46 print("RXLR Motifs v0.0.10")
47 sys.exit(0) 47 sys.exit(0)
48 48
49 if len(sys.argv) != 5: 49 if len(sys.argv) != 5:
50 sys_exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") 50 sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename")
51 51
52 fasta_file, threads, model, tabular_file = sys.argv[1:] 52 fasta_file, threads, model, tabular_file = sys.argv[1:]
53 hmm_output_file = tabular_file + ".hmm.tmp" 53 hmm_output_file = tabular_file + ".hmm.tmp"
54 signalp_input_file = tabular_file + ".fasta.tmp" 54 signalp_input_file = tabular_file + ".fasta.tmp"
55 signalp_output_file = tabular_file + ".tabular.tmp" 55 signalp_output_file = tabular_file + ".tabular.tmp"
84 max_sp = 40 84 max_sp = 40
85 max_sp_rxlr = 100 85 max_sp_rxlr = 100
86 min_rxlr_start = 1 86 min_rxlr_start = 1
87 max_rxlr_start = max_sp + max_sp_rxlr 87 max_rxlr_start = max_sp + max_sp_rxlr
88 else: 88 else:
89 sys_exit("Did not recognise the model name %r\n" 89 sys.exit("Did not recognise the model name %r\n"
90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) 90 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model)
91 91
92 92
93 def get_hmmer_version(exe, required=None): 93 def get_hmmer_version(exe, required=None):
94 cmd = "%s -h" % exe 94 cmd = "%s -h" % exe
95 try: 95 try:
103 return 2 103 return 2
104 elif "HMMER 3" in stdout: 104 elif "HMMER 3" in stdout:
105 return 3 105 return 3
106 else: 106 else:
107 raise ValueError("Could not determine version of %s" % exe) 107 raise ValueError("Could not determine version of %s" % exe)
108 108
109 109
110 #Run hmmsearch for Whisson et al. (2007) 110 # Run hmmsearch for Whisson et al. (2007)
111 if model == "Whisson2007": 111 if model == "Whisson2007":
112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], 112 hmm_file = os.path.join(os.path.split(sys.argv[0])[0],
113 "whisson_et_al_rxlr_eer_cropped.hmm") 113 "whisson_et_al_rxlr_eer_cropped.hmm")
114 if not os.path.isfile(hmm_file): 114 if not os.path.isfile(hmm_file):
115 sys_exit("Missing HMM file for Whisson et al. (2007)") 115 sys.exit("Missing HMM file for Whisson et al. (2007)")
116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): 116 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"):
117 sys_exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) 117 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search)
118 118
119 hmm_hits = set() 119 hmm_hits = set()
120 valid_ids = set() 120 valid_ids = set()
121 for title, seq in fasta_iterator(fasta_file): 121 for title, seq in fasta_iterator(fasta_file):
122 name = title.split(None,1)[0] 122 name = title.split(None, 1)[0]
123 if name in valid_ids: 123 if name in valid_ids:
124 sys_exit("Duplicated identifier %r" % name) 124 sys.exit("Duplicated identifier %r" % name)
125 else: 125 else:
126 valid_ids.add(name) 126 valid_ids.add(name)
127 if not valid_ids: 127 if not valid_ids:
128 # Special case, don't need to run HMMER if there are no sequences 128 # Special case, don't need to run HMMER if there are no sequences
129 pass 129 pass
144 # input FASTA file, and we can loose hits of interest). 144 # input FASTA file, and we can loose hits of interest).
145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ 145 cmd = "%s -T 0 -E 1e6 %s %s > %s" \
146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) 146 % (hmmer_search, hmm_file, fasta_file, hmm_output_file)
147 return_code = os.system(cmd) 147 return_code = os.system(cmd)
148 if return_code: 148 if return_code:
149 sys_exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) 149 sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code)
150 150
151 handle = open(hmm_output_file) 151 handle = open(hmm_output_file)
152 for line in handle: 152 for line in handle:
153 if not line.strip(): 153 if not line.strip():
154 # We expect blank lines in the HMMER2 stdout 154 # We expect blank lines in the HMMER2 stdout
155 continue 155 continue
156 elif line.startswith("#"): 156 elif line.startswith("#"):
157 # Header 157 # Header
158 continue 158 continue
159 else: 159 else:
160 name = line.split(None,1)[0] 160 name = line.split(None, 1)[0]
161 #Should be a sequence name in the HMMER3 table output. 161 # Should be a sequence name in the HMMER3 table output.
162 #Could be anything in the HMMER2 stdout. 162 # Could be anything in the HMMER2 stdout.
163 if name in valid_ids: 163 if name in valid_ids:
164 hmm_hits.add(name) 164 hmm_hits.add(name)
165 elif hmmer3: 165 elif hmmer3:
166 sys_exit("Unexpected identifer %r in hmmsearch output" % name) 166 sys.exit("Unexpected identifer %r in hmmsearch output" % name)
167 handle.close() 167 handle.close()
168 # if hmmer3: 168 # if hmmer3:
169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 169 # print "HMMER3 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
170 # else: 170 # else:
171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids)) 171 # print "HMMER2 hits for %i/%i" % (len(hmm_hits), len(valid_ids))
172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids)) 172 # print "%i/%i matched HMM" % (len(hmm_hits), len(valid_ids))
173 os.remove(hmm_output_file) 173 os.remove(hmm_output_file)
174 del valid_ids 174 del valid_ids
175 175
176 176
179 count = 0 179 count = 0
180 total = 0 180 total = 0
181 handle = open(signalp_input_file, "w") 181 handle = open(signalp_input_file, "w")
182 for title, seq in fasta_iterator(fasta_file): 182 for title, seq in fasta_iterator(fasta_file):
183 total += 1 183 total += 1
184 name = title.split(None,1)[0] 184 name = title.split(None, 1)[0]
185 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) 185 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper())
186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 186 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
187 # This is a potential RXLR, depending on the SignalP results. 187 # This is a potential RXLR, depending on the SignalP results.
188 # Might as well truncate the sequence now, makes the temp file smaller 188 # Might as well truncate the sequence now, makes the temp file smaller
189 if signalp_trunc: 189 if signalp_trunc:
190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) 190 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc]))
197 197
198 198
199 # Run SignalP (using our wrapper script to get multi-core support etc) 199 # Run SignalP (using our wrapper script to get multi-core support etc)
200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") 200 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py")
201 if not os.path.isfile(signalp_script): 201 if not os.path.isfile(signalp_script):
202 sys_exit("Error - missing signalp3.py script") 202 sys.exit("Error - missing signalp3.py script")
203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) 203 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file)
204 return_code = os.system(cmd) 204 return_code = os.system(cmd)
205 if return_code: 205 if return_code:
206 sys_exit("Error %i from SignalP:\n%s" % (return_code, cmd)) 206 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd))
207 # print "SignalP done" 207 # print "SignalP done"
208 208
209 209
210 def parse_signalp(filename): 210 def parse_signalp(filename):
211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. 211 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length.
215 handle = open(filename) 215 handle = open(filename)
216 line = handle.readline() 216 line = handle.readline()
217 assert line.startswith("#ID\t"), line 217 assert line.startswith("#ID\t"), line
218 for line in handle: 218 for line in handle:
219 parts = line.rstrip("\t").split("\t") 219 parts = line.rstrip("\t").split("\t")
220 assert len(parts)==20, repr(line) 220 assert len(parts) == 20, repr(line)
221 yield parts[0], float(parts[18]), int(parts[5])-1 221 yield parts[0], float(parts[18]), int(parts[5]) - 1
222 handle.close() 222 handle.close()
223 223
224 224
225 # Parse SignalP results and apply the strict RXLR criteria 225 # Parse SignalP results and apply the strict RXLR criteria
226 total = 0 226 total = 0
229 handle.write("#ID\t%s\n" % model) 229 handle.write("#ID\t%s\n" % model)
230 signalp_results = parse_signalp(signalp_output_file) 230 signalp_results = parse_signalp(signalp_output_file)
231 for title, seq in fasta_iterator(fasta_file): 231 for title, seq in fasta_iterator(fasta_file):
232 total += 1 232 total += 1
233 rxlr = "N" 233 rxlr = "N"
234 name = title.split(None,1)[0] 234 name = title.split(None, 1)[0]
235 match = re_rxlr.search(seq[min_rxlr_start-1:].upper()) 235 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper())
236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: 236 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start:
237 del match 237 del match
238 # This was the criteria for calling SignalP, 238 # This was the criteria for calling SignalP,
239 #so it will be in the SignalP results. 239 # so it will be in the SignalP results.
240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() 240 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next()
241 assert name == sp_id, "%s vs %s" % (name, sp_id) 241 assert name == sp_id, "%s vs %s" % (name, sp_id)
242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: 242 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp:
243 match = re_rxlr.search(seq[sp_nn_len:].upper()) 243 match = re_rxlr.search(seq[sp_nn_len:].upper())
244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting 244 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting