comparison tools/protein_analysis/signalp3.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children e1996f0f4e85
comparison
equal deleted inserted replaced
20:a19b3ded8f33 21:238eae32483c
69 if "-v" in sys.argv or "--version" in sys.argv: 69 if "-v" in sys.argv or "--version" in sys.argv:
70 print("SignalP Galaxy wrapper version 0.0.19") 70 print("SignalP Galaxy wrapper version 0.0.19")
71 sys.exit(os.system("signalp -version")) 71 sys.exit(os.system("signalp -version"))
72 72
73 if len(sys.argv) not in [6, 8]: 73 if len(sys.argv) not in [6, 8]:
74 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " 74 sys.exit(
75 "input protein FASTA file & output tabular file (plus " 75 "Require five (or 7) arguments, organism, truncate, threads, "
76 "optionally cut method and GFF3 output file). " 76 "input protein FASTA file & output tabular file (plus "
77 "Got %i arguments." % (len(sys.argv) - 1)) 77 "optionally cut method and GFF3 output file). "
78 "Got %i arguments." % (len(sys.argv) - 1)
79 )
78 80
79 organism = sys.argv[1] 81 organism = sys.argv[1]
80 if organism not in ["euk", "gram+", "gram-"]: 82 if organism not in ["euk", "gram+", "gram-"]:
81 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) 83 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism)
82 84
109 for line in raw_handle: 111 for line in raw_handle:
110 if not line or line.startswith("#"): 112 if not line or line.startswith("#"):
111 continue 113 continue
112 parts = line.rstrip("\r\n").split() 114 parts = line.rstrip("\r\n").split()
113 assert len(parts) == 21, repr(line) 115 assert len(parts) == 21, repr(line)
114 assert parts[14].startswith(parts[0]), \ 116 assert parts[14].startswith(parts[0]), (
115 "Bad entry in SignalP output, ID miss-match:\n%r" % line 117 "Bad entry in SignalP output, ID miss-match:\n%r" % line
118 )
116 # Remove redundant truncated name column (col 0) 119 # Remove redundant truncated name column (col 0)
117 # and put full name at start (col 14) 120 # and put full name at start (col 14)
118 parts = parts[14:15] + parts[1:14] + parts[15:] 121 parts = parts[14:15] + parts[1:14] + parts[15:]
119 out_handle.write("\t".join(parts) + "\n") 122 out_handle.write("\t".join(parts) + "\n")
120 123
121 124
122 def make_gff(fasta_file, tabular_file, gff_file, cut_method): 125 def make_gff(fasta_file, tabular_file, gff_file, cut_method):
123 """Make a GFF file.""" 126 """Make a GFF file."""
124 cut_col, score_col = {"NN_Cmax": (2, 1), 127 cut_col, score_col = {
125 "NN_Ymax": (5, 4), 128 "NN_Cmax": (2, 1),
126 "NN_Smax": (8, 7), 129 "NN_Ymax": (5, 4),
127 "HMM_Cmax": (16, 15), 130 "NN_Smax": (8, 7),
128 }[cut_method] 131 "HMM_Cmax": (16, 15),
132 }[cut_method]
129 133
130 source = "SignalP" 134 source = "SignalP"
131 strand = "." # not stranded 135 strand = "." # not stranded
132 phase = "." # not phased 136 phase = "." # not phased
133 tags = "Note=%s" % cut_method 137 tags = "Note=%s" % cut_method
151 assert cut_method == "HMM_Cmax", cut_method 155 assert cut_method == "HMM_Cmax", cut_method
152 # TODO - Why does it do this? 156 # TODO - Why does it do this?
153 cut = 1 157 cut = 1
154 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) 158 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
155 score = parts[score_col] 159 score = parts[score_col]
156 gff_handle.write("##sequence-region %s %i %i\n" 160 gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq)))
157 % (seqid, 1, len(seq)))
158 # If the cut is at the very begining, there is no signal peptide! 161 # If the cut is at the very begining, there is no signal peptide!
159 if cut > 1: 162 if cut > 1:
160 # signal_peptide = SO:0000418 163 # signal_peptide = SO:0000418
161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" 164 gff_handle.write(
162 % (seqid, source, 165 "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
163 "signal_peptide", 1, cut - 1, 166 % (
164 score, strand, phase, tags)) 167 seqid,
168 source,
169 "signal_peptide",
170 1,
171 cut - 1,
172 score,
173 strand,
174 phase,
175 tags,
176 )
177 )
165 # mature_protein_region = SO:0000419 178 # mature_protein_region = SO:0000419
166 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" 179 gff_handle.write(
167 % (seqid, source, 180 "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n"
168 "mature_protein_region", cut, len(seq), 181 % (
169 score, strand, phase, tags)) 182 seqid,
183 source,
184 "mature_protein_region",
185 cut,
186 len(seq),
187 score,
188 strand,
189 phase,
190 tags,
191 )
192 )
170 tab_handle.close() 193 tab_handle.close()
171 gff_handle.close() 194 gff_handle.close()
172 195
173 196
174 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), 197 if num_threads == 1:
175 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) 198 # Still want to call split_fasta to apply truncation, but
199 # no reason to make multiple files - and more chance of
200 # hitting file system glitches if we do. So,
201 FASTA_CHUNK = sys.maxsize
202
203 fasta_files = split_fasta(
204 fasta_file,
205 os.path.join(tmp_dir, "signalp"),
206 n=FASTA_CHUNK,
207 truncate=truncate,
208 max_len=MAX_LEN,
209 )
176 temp_files = [f + ".out" for f in fasta_files] 210 temp_files = [f + ".out" for f in fasta_files]
177 assert len(fasta_files) == len(temp_files) 211 assert len(fasta_files) == len(temp_files)
178 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) 212 jobs = [
179 for (fasta, temp) in zip(fasta_files, temp_files)] 213 "signalp -short -t %s %s > %s" % (organism, fasta, temp)
214 for (fasta, temp) in zip(fasta_files, temp_files)
215 ]
180 assert len(fasta_files) == len(temp_files) == len(jobs) 216 assert len(fasta_files) == len(temp_files) == len(jobs)
181 217
182 218
183 def clean_up(file_list): 219 def clean_up(file_list):
184 """Remove temp files, and if possible the temp directory.""" 220 """Remove temp files, and if possible the temp directory."""
203 except IOError: 239 except IOError:
204 output = "(no output)" 240 output = "(no output)"
205 if error_level or output.lower().startswith("error running"): 241 if error_level or output.lower().startswith("error running"):
206 clean_up(fasta_files + temp_files) 242 clean_up(fasta_files + temp_files)
207 if output: 243 if output:
208 sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) 244 sys.stderr.write(
245 "One or more tasks failed, e.g. %i from %r gave:\n%s"
246 % (error_level, cmd, output)
247 )
209 else: 248 else:
210 sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) 249 sys.stderr.write(
250 "One or more tasks failed, e.g. %i from %r with no output\n"
251 % (error_level, cmd)
252 )
211 sys.exit(error_level) 253 sys.exit(error_level)
212 del results 254 del results
213 255
214 out_handle = open(tabular_file, "w") 256 out_handle = open(tabular_file, "w")
215 fields = ["ID"] 257 fields = ["ID"]
216 # NN results: 258 # NN results:
217 for name in ["Cmax", "Ymax", "Smax"]: 259 for name in ["Cmax", "Ymax", "Smax"]:
218 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) 260 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name])
219 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) 261 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"])
220 # HMM results: 262 # HMM results:
221 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", 263 fields.extend(
222 "HMM_Sprob_score", "HMM_Sprob_pred"]) 264 [
265 "HMM_type",
266 "HMM_Cmax_score",
267 "HMM_Cmax_pos",
268 "HMM_Cmax_pred",
269 "HMM_Sprob_score",
270 "HMM_Sprob_pred",
271 ]
272 )
223 out_handle.write("#" + "\t".join(fields) + "\n") 273 out_handle.write("#" + "\t".join(fields) + "\n")
224 for temp in temp_files: 274 for temp in temp_files:
225 data_handle = open(temp) 275 data_handle = open(temp)
226 clean_tabular(data_handle, out_handle) 276 clean_tabular(data_handle, out_handle)
227 data_handle.close() 277 data_handle.close()