Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/rxlr_motifs.py @ 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
| author | peterjc |
|---|---|
| date | Thu, 17 Jun 2021 08:21:06 +0000 |
| parents | a19b3ded8f33 |
| children | e1996f0f4e85 |
comparison
equal
deleted
inserted
replaced
| 20:a19b3ded8f33 | 21:238eae32483c |
|---|---|
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 """Implements assorted RXLR motif methods from the literature | 2 """Implements assorted RXLR motif methods from the literature. |
| 3 | 3 |
| 4 This script takes exactly four command line arguments: | 4 This script takes exactly four command line arguments: |
| 5 * Protein FASTA filename | 5 * Protein FASTA filename |
| 6 * Number of threads | 6 * Number of threads |
| 7 * Model name (Bhattacharjee2006, Win2007, Whisson2007) | 7 * Model name (Bhattacharjee2006, Win2007, Whisson2007) |
| 8 * Output tabular filename | 8 * Output tabular filename |
| 9 | 9 |
| 10 The model names are: | 10 The model names are: |
| 11 | 11 * Bhattacharjee2006: Simple regular expression search for RXLR |
| 12 Bhattacharjee2006: Simple regular expression search for RXLR | 12 with additional requirements for positioning and signal peptide. |
| 13 with additional requirements for positioning and signal peptide. | 13 * Win2007: Simple regular expression search for RXLR, but with |
| 14 | 14 different positional requirements. |
| 15 Win2007: Simple regular expression search for RXLR, but with | 15 * Whisson2007: As Bhattacharjee2006 but with a more complex regular |
| 16 different positional requirements. | 16 expression to look for RXLR-EER domain, and additionally calls HMMER. |
| 17 | |
| 18 Whisson2007: As Bhattacharjee2006 but with a more complex regular | |
| 19 expression to look for RXLR-EER domain, and additionally calls HMMER. | |
| 20 | 17 |
| 21 See the help text in the accompanying Galaxy tool XML file for more | 18 See the help text in the accompanying Galaxy tool XML file for more |
| 22 details including the full references. | 19 details including the full references. |
| 23 | 20 |
| 24 Note: | 21 Note |
| 25 | 22 ---- |
| 26 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0, | 23 Bhattacharjee et al. (2006) and Win et al. (2007) used SignalP v2.0, |
| 27 which is no longer available. The current release is SignalP v3.0 | 24 which is no longer available. The current release is SignalP v3.0 |
| 28 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for | 25 (Mar 5, 2007). We have therefore opted to use the NN Ymax position for |
| 29 the predicted cleavage site, as this is expected to be more accurate. | 26 the predicted cleavage site, as this is expected to be more accurate. |
| 30 Also note that the HMM score values have changed from v2.0 to v3.0. | 27 Also note that the HMM score values have changed from v2.0 to v3.0. |
| 53 import sys | 50 import sys |
| 54 | 51 |
| 55 from seq_analysis_utils import fasta_iterator | 52 from seq_analysis_utils import fasta_iterator |
| 56 | 53 |
| 57 if "-v" in sys.argv: | 54 if "-v" in sys.argv: |
| 58 print("RXLR Motifs v0.0.14") | 55 print("RXLR Motifs v0.0.16") |
| 59 sys.exit(0) | 56 sys.exit(0) |
| 60 | 57 |
| 61 if len(sys.argv) != 5: | 58 if len(sys.argv) != 5: |
| 62 sys.exit("Requires four arguments: protein FASTA filename, threads, model, and output filename") | 59 sys.exit( |
| 60 "Requires four arguments: protein FASTA filename, threads, " | |
| 61 "model, and output filename" | |
| 62 ) | |
| 63 | 63 |
| 64 fasta_file, threads, model, tabular_file = sys.argv[1:] | 64 fasta_file, threads, model, tabular_file = sys.argv[1:] |
| 65 hmm_output_file = tabular_file + ".hmm.tmp" | 65 hmm_output_file = tabular_file + ".hmm.tmp" |
| 66 signalp_input_file = tabular_file + ".fasta.tmp" | 66 signalp_input_file = tabular_file + ".fasta.tmp" |
| 67 signalp_output_file = tabular_file + ".tabular.tmp" | 67 signalp_output_file = tabular_file + ".tabular.tmp" |
| 96 max_sp = 40 | 96 max_sp = 40 |
| 97 max_sp_rxlr = 100 | 97 max_sp_rxlr = 100 |
| 98 min_rxlr_start = 1 | 98 min_rxlr_start = 1 |
| 99 max_rxlr_start = max_sp + max_sp_rxlr | 99 max_rxlr_start = max_sp + max_sp_rxlr |
| 100 else: | 100 else: |
| 101 sys.exit("Did not recognise the model name %r\n" | 101 sys.exit( |
| 102 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model) | 102 "Did not recognise the model name %r\n" |
| 103 "Use Bhattacharjee2006, Win2007, or Whisson2007" % model | |
| 104 ) | |
| 103 | 105 |
| 104 | 106 |
| 105 def get_hmmer_version(exe, required=None): | 107 def get_hmmer_version(exe, required=None): |
| 106 try: | 108 try: |
| 107 child = subprocess.Popen([exe, "-h"], | 109 child = subprocess.Popen( |
| 108 universal_newlines=True, | 110 [exe, "-h"], |
| 109 stdout=subprocess.PIPE, stderr=subprocess.PIPE) | 111 universal_newlines=True, |
| 112 stdout=subprocess.PIPE, | |
| 113 stderr=subprocess.PIPE, | |
| 114 ) | |
| 110 except OSError: | 115 except OSError: |
| 111 raise ValueError("Could not run %s" % exe) | 116 raise ValueError("Could not run %s" % exe) |
| 112 stdout, stderr = child.communicate() | 117 stdout, stderr = child.communicate() |
| 113 if required: | 118 if required: |
| 114 return required in stdout | 119 return required in stdout |
| 120 raise ValueError("Could not determine version of %s" % exe) | 125 raise ValueError("Could not determine version of %s" % exe) |
| 121 | 126 |
| 122 | 127 |
| 123 # Run hmmsearch for Whisson et al. (2007) | 128 # Run hmmsearch for Whisson et al. (2007) |
| 124 if model == "Whisson2007": | 129 if model == "Whisson2007": |
| 125 hmm_file = os.path.join(os.path.split(sys.argv[0])[0], | 130 hmm_file = os.path.join( |
| 126 "whisson_et_al_rxlr_eer_cropped.hmm") | 131 os.path.split(sys.argv[0])[0], "whisson_et_al_rxlr_eer_cropped.hmm" |
| 132 ) | |
| 127 if not os.path.isfile(hmm_file): | 133 if not os.path.isfile(hmm_file): |
| 128 sys.exit("Missing HMM file for Whisson et al. (2007)") | 134 sys.exit("Missing HMM file for Whisson et al. (2007)") |
| 129 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): | 135 if not get_hmmer_version(hmmer_search, "HMMER 2.3.2 (Oct 2003)"): |
| 130 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) | 136 sys.exit("Missing HMMER 2.3.2 (Oct 2003) binary, %s" % hmmer_search) |
| 131 | 137 |
| 141 # Special case, don't need to run HMMER if there are no sequences | 147 # Special case, don't need to run HMMER if there are no sequences |
| 142 pass | 148 pass |
| 143 else: | 149 else: |
| 144 # I've left the code to handle HMMER 3 in situ, in case | 150 # I've left the code to handle HMMER 3 in situ, in case |
| 145 # we revisit the choice to insist on HMMER 2. | 151 # we revisit the choice to insist on HMMER 2. |
| 146 hmmer3 = (3 == get_hmmer_version(hmmer_search)) | 152 hmmer3 = 3 == get_hmmer_version(hmmer_search) |
| 147 # Using zero (or 5.6?) for bitscore threshold | 153 # Using zero (or 5.6?) for bitscore threshold |
| 148 if hmmer3: | 154 if hmmer3: |
| 149 # The HMMER3 table output is easy to parse | 155 # The HMMER3 table output is easy to parse |
| 150 # In HMMER3 can't use both -T and -E | 156 # In HMMER3 can't use both -T and -E |
| 151 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" \ | 157 cmd = "%s -T 0 --tblout %s --noali %s %s > /dev/null" % ( |
| 152 % (hmmer_search, hmm_output_file, hmm_file, fasta_file) | 158 hmmer_search, |
| 159 hmm_output_file, | |
| 160 hmm_file, | |
| 161 fasta_file, | |
| 162 ) | |
| 153 else: | 163 else: |
| 154 # For HMMER2 we are stuck with parsing stdout | 164 # For HMMER2 we are stuck with parsing stdout |
| 155 # Put 1e6 to effectively have no expectation threshold (otherwise | 165 # Put 1e6 to effectively have no expectation threshold (otherwise |
| 156 # HMMER defaults to 10 and the calculated e-value depends on the | 166 # HMMER defaults to 10 and the calculated e-value depends on the |
| 157 # input FASTA file, and we can loose hits of interest). | 167 # input FASTA file, and we can loose hits of interest). |
| 158 cmd = "%s -T 0 -E 1e6 %s %s > %s" \ | 168 cmd = "%s -T 0 -E 1e6 %s %s > %s" % ( |
| 159 % (hmmer_search, hmm_file, fasta_file, hmm_output_file) | 169 hmmer_search, |
| 170 hmm_file, | |
| 171 fasta_file, | |
| 172 hmm_output_file, | |
| 173 ) | |
| 160 return_code = os.system(cmd) | 174 return_code = os.system(cmd) |
| 161 if return_code: | 175 if return_code: |
| 162 sys.exit("Error %i from hmmsearch:\n%s" % (return_code, cmd), return_code) | 176 sys.stderr.write("Error %i from hmmsearch:\n%s\n" % (return_code, cmd)) |
| 177 sys.exit(return_code) | |
| 163 | 178 |
| 164 handle = open(hmm_output_file) | 179 handle = open(hmm_output_file) |
| 165 for line in handle: | 180 for line in handle: |
| 166 if not line.strip(): | 181 if not line.strip(): |
| 167 # We expect blank lines in the HMMER2 stdout | 182 # We expect blank lines in the HMMER2 stdout |
| 193 total = 0 | 208 total = 0 |
| 194 handle = open(signalp_input_file, "w") | 209 handle = open(signalp_input_file, "w") |
| 195 for title, seq in fasta_iterator(fasta_file): | 210 for title, seq in fasta_iterator(fasta_file): |
| 196 total += 1 | 211 total += 1 |
| 197 name = title.split(None, 1)[0] | 212 name = title.split(None, 1)[0] |
| 198 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) | 213 match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper()) |
| 199 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 214 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: |
| 200 # This is a potential RXLR, depending on the SignalP results. | 215 # This is a potential RXLR, depending on the SignalP results. |
| 201 # Might as well truncate the sequence now, makes the temp file smaller | 216 # Might as well truncate the sequence now, makes the temp file smaller |
| 202 if signalp_trunc: | 217 if signalp_trunc: |
| 203 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) | 218 handle.write(">%s (truncated)\n%s\n" % (name, seq[:signalp_trunc])) |
| 211 | 226 |
| 212 # Run SignalP (using our wrapper script to get multi-core support etc) | 227 # Run SignalP (using our wrapper script to get multi-core support etc) |
| 213 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") | 228 signalp_script = os.path.join(os.path.split(sys.argv[0])[0], "signalp3.py") |
| 214 if not os.path.isfile(signalp_script): | 229 if not os.path.isfile(signalp_script): |
| 215 sys.exit("Error - missing signalp3.py script") | 230 sys.exit("Error - missing signalp3.py script") |
| 216 cmd = "python %s euk %i %s %s %s" % (signalp_script, signalp_trunc, threads, signalp_input_file, signalp_output_file) | 231 cmd = "python '%s' 'euk' '%i' '%s' '%s' '%s'" % ( |
| 232 signalp_script, | |
| 233 signalp_trunc, | |
| 234 threads, | |
| 235 signalp_input_file, | |
| 236 signalp_output_file, | |
| 237 ) | |
| 217 return_code = os.system(cmd) | 238 return_code = os.system(cmd) |
| 218 if return_code: | 239 if return_code: |
| 219 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) | 240 sys.exit("Error %i from SignalP:\n%s" % (return_code, cmd)) |
| 220 # print "SignalP done" | 241 # print "SignalP done" |
| 221 | 242 |
| 222 | 243 |
| 223 def parse_signalp(filename): | 244 def parse_signalp(filename): |
| 224 """Parse SignalP output, yield tuples of ID, HMM_Sprob_score and NN predicted signal peptide length. | 245 """Parse SignalP output, yield tuples of values. |
| 246 | |
| 247 Returns tuples of ID, HMM_Sprob_score and NN predicted signal | |
| 248 peptide length. | |
| 225 | 249 |
| 226 For signal peptide length we use NN_Ymax_pos (minus one). | 250 For signal peptide length we use NN_Ymax_pos (minus one). |
| 227 """ | 251 """ |
| 228 handle = open(filename) | 252 handle = open(filename) |
| 229 line = handle.readline() | 253 line = handle.readline() |
| 235 handle.close() | 259 handle.close() |
| 236 | 260 |
| 237 | 261 |
| 238 # Parse SignalP results and apply the strict RXLR criteria | 262 # Parse SignalP results and apply the strict RXLR criteria |
| 239 total = 0 | 263 total = 0 |
| 240 tally = dict() | 264 tally = {} |
| 241 handle = open(tabular_file, "w") | 265 handle = open(tabular_file, "w") |
| 242 handle.write("#ID\t%s\n" % model) | 266 handle.write("#ID\t%s\n" % model) |
| 243 signalp_results = parse_signalp(signalp_output_file) | 267 signalp_results = parse_signalp(signalp_output_file) |
| 244 for title, seq in fasta_iterator(fasta_file): | 268 for title, seq in fasta_iterator(fasta_file): |
| 245 total += 1 | 269 total += 1 |
| 246 rxlr = "N" | 270 rxlr = "N" |
| 247 name = title.split(None, 1)[0] | 271 name = title.split(None, 1)[0] |
| 248 match = re_rxlr.search(seq[min_rxlr_start - 1:].upper()) | 272 match = re_rxlr.search(seq[min_rxlr_start - 1 :].upper()) |
| 249 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: | 273 if match and min_rxlr_start - 1 + match.start() + 1 <= max_rxlr_start: |
| 250 del match | 274 del match |
| 251 # This was the criteria for calling SignalP, | 275 # This was the criteria for calling SignalP, |
| 252 # so it will be in the SignalP results. | 276 # so it will be in the SignalP results. |
| 253 sp_id, sp_hmm_score, sp_nn_len = signalp_results.next() | 277 sp_id, sp_hmm_score, sp_nn_len = next(signalp_results) |
| 254 assert name == sp_id, "%s vs %s" % (name, sp_id) | 278 assert name == sp_id, "%s vs %s" % (name, sp_id) |
| 255 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: | 279 if sp_hmm_score >= min_signalp_hmm and min_sp <= sp_nn_len <= max_sp: |
| 256 match = re_rxlr.search(seq[sp_nn_len:].upper()) | 280 match = re_rxlr.search(seq[sp_nn_len:].upper()) |
| 257 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting | 281 if match and match.start() + 1 <= max_sp_rxlr: # 1-based counting |
| 258 rxlr_start = sp_nn_len + match.start() + 1 | 282 rxlr_start = sp_nn_len + match.start() + 1 |
| 276 handle.close() | 300 handle.close() |
| 277 assert sum(tally.values()) == total | 301 assert sum(tally.values()) == total |
| 278 | 302 |
| 279 # Check the iterator is finished | 303 # Check the iterator is finished |
| 280 try: | 304 try: |
| 281 signalp_results.next() | 305 next(signalp_results) |
| 282 assert False, "Unexpected data in SignalP output" | 306 assert False, "Unexpected data in SignalP output" |
| 283 except StopIteration: | 307 except StopIteration: |
| 284 pass | 308 pass |
| 285 | 309 |
| 286 # Cleanup | 310 # Cleanup |
| 287 os.remove(signalp_input_file) | 311 os.remove(signalp_input_file) |
| 288 os.remove(signalp_output_file) | 312 os.remove(signalp_output_file) |
| 289 | 313 |
| 290 # Short summary to stdout for Galaxy's info display | 314 # Short summary to stdout for Galaxy's info display |
| 291 print("%s for %i sequences:" % (model, total)) | 315 print("%s for %i sequences:" % (model, total)) |
| 292 print(", ".join("%s = %i" % kv for kv in sorted(tally.iteritems()))) | 316 print(", ".join("%s = %i" % kv for kv in sorted(tally.items()))) |
