comparison tools/protein_analysis/signalp3.py @ 20:a19b3ded8f33 draft

v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
author peterjc
date Thu, 21 Sep 2017 11:35:20 -0400
parents f3ecd80850e2
children 238eae32483c
comparison
equal deleted inserted replaced
19:f3ecd80850e2 20:a19b3ded8f33
50 itself (see the SignalP XML file for settings). 50 itself (see the SignalP XML file for settings).
51 51
52 Finally, you can opt to have a GFF3 file produced which will describe the 52 Finally, you can opt to have a GFF3 file produced which will describe the
53 predicted signal peptide and mature peptide for each protein (using one of 53 predicted signal peptide and mature peptide for each protein (using one of
54 the predictors which gives a cleavage site). *WORK IN PROGRESS* 54 the predictors which gives a cleavage site). *WORK IN PROGRESS*
55 """ 55 """ # noqa: E501
56
57 from __future__ import print_function
58
59 import os
56 import sys 60 import sys
57 import os
58 import tempfile 61 import tempfile
59 from seq_analysis_utils import split_fasta, fasta_iterator 62
63 from seq_analysis_utils import fasta_iterator, split_fasta
60 from seq_analysis_utils import run_jobs, thread_count 64 from seq_analysis_utils import run_jobs, thread_count
61 65
62 FASTA_CHUNK = 500 66 FASTA_CHUNK = 500
63 MAX_LEN = 6000 # Found by trial and error 67 MAX_LEN = 6000 # Found by trial and error
68
69 if "-v" in sys.argv or "--version" in sys.argv:
70 print("SignalP Galaxy wrapper version 0.0.19")
71 sys.exit(os.system("signalp -version"))
64 72
65 if len(sys.argv) not in [6, 8]: 73 if len(sys.argv) not in [6, 8]:
66 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " 74 sys.exit("Require five (or 7) arguments, organism, truncate, threads, "
67 "input protein FASTA file & output tabular file (plus " 75 "input protein FASTA file & output tabular file (plus "
68 "optionally cut method and GFF3 output file). " 76 "optionally cut method and GFF3 output file). "
94 102
95 103
96 tmp_dir = tempfile.mkdtemp() 104 tmp_dir = tempfile.mkdtemp()
97 105
98 106
99 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): 107 def clean_tabular(raw_handle, out_handle, gff_handle=None):
100 """Clean up SignalP output to make it tabular.""" 108 """Clean up SignalP output to make it tabular."""
101 if cut_method:
102 cut_col = {"NN_Cmax": 2,
103 "NN_Ymax": 5,
104 "NN_Smax": 8,
105 "HMM_Cmax": 16}[cut_method]
106 else:
107 cut_col = None
108 for line in raw_handle: 109 for line in raw_handle:
109 if not line or line.startswith("#"): 110 if not line or line.startswith("#"):
110 continue 111 continue
111 parts = line.rstrip("\r\n").split() 112 parts = line.rstrip("\r\n").split()
112 assert len(parts) == 21, repr(line) 113 assert len(parts) == 21, repr(line)
117 parts = parts[14:15] + parts[1:14] + parts[15:] 118 parts = parts[14:15] + parts[1:14] + parts[15:]
118 out_handle.write("\t".join(parts) + "\n") 119 out_handle.write("\t".join(parts) + "\n")
119 120
120 121
121 def make_gff(fasta_file, tabular_file, gff_file, cut_method): 122 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
123 """Make a GFF file."""
122 cut_col, score_col = {"NN_Cmax": (2, 1), 124 cut_col, score_col = {"NN_Cmax": (2, 1),
123 "NN_Ymax": (5, 4), 125 "NN_Ymax": (5, 4),
124 "NN_Smax": (8, 7), 126 "NN_Smax": (8, 7),
125 "HMM_Cmax": (16, 15), 127 "HMM_Cmax": (16, 15),
126 }[cut_method] 128 }[cut_method]
150 # TODO - Why does it do this? 152 # TODO - Why does it do this?
151 cut = 1 153 cut = 1
152 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) 154 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
153 score = parts[score_col] 155 score = parts[score_col]
154 gff_handle.write("##sequence-region %s %i %i\n" 156 gff_handle.write("##sequence-region %s %i %i\n"
155 % (seqid, 1, len(seq))) 157 % (seqid, 1, len(seq)))
156 # If the cut is at the very begining, there is no signal peptide! 158 # If the cut is at the very begining, there is no signal peptide!
157 if cut > 1: 159 if cut > 1:
158 # signal_peptide = SO:0000418 160 # signal_peptide = SO:0000418
159 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" 161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
160 % (seqid, source, 162 % (seqid, source,
186 try: 188 try:
187 os.rmdir(tmp_dir) 189 os.rmdir(tmp_dir)
188 except Exception: 190 except Exception:
189 pass 191 pass
190 192
193
191 if len(jobs) > 1 and num_threads > 1: 194 if len(jobs) > 1 and num_threads > 1:
192 # A small "info" message for Galaxy to show the user. 195 # A small "info" message for Galaxy to show the user.
193 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 196 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)))
194 results = run_jobs(jobs, num_threads) 197 results = run_jobs(jobs, num_threads)
195 assert len(fasta_files) == len(temp_files) == len(jobs) 198 assert len(fasta_files) == len(temp_files) == len(jobs)
196 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 199 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
197 error_level = results[cmd] 200 error_level = results[cmd]
198 try: 201 try:
199 output = open(temp).readline() 202 output = open(temp).readline()
200 except IOError: 203 except IOError:
201 output = "(no output)" 204 output = "(no output)"
202 if error_level or output.lower().startswith("error running"): 205 if error_level or output.lower().startswith("error running"):
203 clean_up(fasta_files + temp_files) 206 clean_up(fasta_files + temp_files)
204 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 207 if output:
205 error_level) 208 sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output))
209 else:
210 sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd))
211 sys.exit(error_level)
206 del results 212 del results
207 213
208 out_handle = open(tabular_file, "w") 214 out_handle = open(tabular_file, "w")
209 fields = ["ID"] 215 fields = ["ID"]
210 # NN results: 216 # NN results: