Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 08:21:06 +0000 |
parents | a19b3ded8f33 |
children | e1996f0f4e85 |
comparison
equal
deleted
inserted
replaced
20:a19b3ded8f33 | 21:238eae32483c |
---|---|
69 if "-v" in sys.argv or "--version" in sys.argv: | 69 if "-v" in sys.argv or "--version" in sys.argv: |
70 print("SignalP Galaxy wrapper version 0.0.19") | 70 print("SignalP Galaxy wrapper version 0.0.19") |
71 sys.exit(os.system("signalp -version")) | 71 sys.exit(os.system("signalp -version")) |
72 | 72 |
73 if len(sys.argv) not in [6, 8]: | 73 if len(sys.argv) not in [6, 8]: |
74 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " | 74 sys.exit( |
75 "input protein FASTA file & output tabular file (plus " | 75 "Require five (or 7) arguments, organism, truncate, threads, " |
76 "optionally cut method and GFF3 output file). " | 76 "input protein FASTA file & output tabular file (plus " |
77 "Got %i arguments." % (len(sys.argv) - 1)) | 77 "optionally cut method and GFF3 output file). " |
78 "Got %i arguments." % (len(sys.argv) - 1) | |
79 ) | |
78 | 80 |
79 organism = sys.argv[1] | 81 organism = sys.argv[1] |
80 if organism not in ["euk", "gram+", "gram-"]: | 82 if organism not in ["euk", "gram+", "gram-"]: |
81 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 83 sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) |
82 | 84 |
109 for line in raw_handle: | 111 for line in raw_handle: |
110 if not line or line.startswith("#"): | 112 if not line or line.startswith("#"): |
111 continue | 113 continue |
112 parts = line.rstrip("\r\n").split() | 114 parts = line.rstrip("\r\n").split() |
113 assert len(parts) == 21, repr(line) | 115 assert len(parts) == 21, repr(line) |
114 assert parts[14].startswith(parts[0]), \ | 116 assert parts[14].startswith(parts[0]), ( |
115 "Bad entry in SignalP output, ID miss-match:\n%r" % line | 117 "Bad entry in SignalP output, ID miss-match:\n%r" % line |
118 ) | |
116 # Remove redundant truncated name column (col 0) | 119 # Remove redundant truncated name column (col 0) |
117 # and put full name at start (col 14) | 120 # and put full name at start (col 14) |
118 parts = parts[14:15] + parts[1:14] + parts[15:] | 121 parts = parts[14:15] + parts[1:14] + parts[15:] |
119 out_handle.write("\t".join(parts) + "\n") | 122 out_handle.write("\t".join(parts) + "\n") |
120 | 123 |
121 | 124 |
122 def make_gff(fasta_file, tabular_file, gff_file, cut_method): | 125 def make_gff(fasta_file, tabular_file, gff_file, cut_method): |
123 """Make a GFF file.""" | 126 """Make a GFF file.""" |
124 cut_col, score_col = {"NN_Cmax": (2, 1), | 127 cut_col, score_col = { |
125 "NN_Ymax": (5, 4), | 128 "NN_Cmax": (2, 1), |
126 "NN_Smax": (8, 7), | 129 "NN_Ymax": (5, 4), |
127 "HMM_Cmax": (16, 15), | 130 "NN_Smax": (8, 7), |
128 }[cut_method] | 131 "HMM_Cmax": (16, 15), |
132 }[cut_method] | |
129 | 133 |
130 source = "SignalP" | 134 source = "SignalP" |
131 strand = "." # not stranded | 135 strand = "." # not stranded |
132 phase = "." # not phased | 136 phase = "." # not phased |
133 tags = "Note=%s" % cut_method | 137 tags = "Note=%s" % cut_method |
151 assert cut_method == "HMM_Cmax", cut_method | 155 assert cut_method == "HMM_Cmax", cut_method |
152 # TODO - Why does it do this? | 156 # TODO - Why does it do this? |
153 cut = 1 | 157 cut = 1 |
154 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | 158 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) |
155 score = parts[score_col] | 159 score = parts[score_col] |
156 gff_handle.write("##sequence-region %s %i %i\n" | 160 gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq))) |
157 % (seqid, 1, len(seq))) | |
158 # If the cut is at the very begining, there is no signal peptide! | 161 # If the cut is at the very begining, there is no signal peptide! |
159 if cut > 1: | 162 if cut > 1: |
160 # signal_peptide = SO:0000418 | 163 # signal_peptide = SO:0000418 |
161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" | 164 gff_handle.write( |
162 % (seqid, source, | 165 "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
163 "signal_peptide", 1, cut - 1, | 166 % ( |
164 score, strand, phase, tags)) | 167 seqid, |
168 source, | |
169 "signal_peptide", | |
170 1, | |
171 cut - 1, | |
172 score, | |
173 strand, | |
174 phase, | |
175 tags, | |
176 ) | |
177 ) | |
165 # mature_protein_region = SO:0000419 | 178 # mature_protein_region = SO:0000419 |
166 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" | 179 gff_handle.write( |
167 % (seqid, source, | 180 "%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
168 "mature_protein_region", cut, len(seq), | 181 % ( |
169 score, strand, phase, tags)) | 182 seqid, |
183 source, | |
184 "mature_protein_region", | |
185 cut, | |
186 len(seq), | |
187 score, | |
188 strand, | |
189 phase, | |
190 tags, | |
191 ) | |
192 ) | |
170 tab_handle.close() | 193 tab_handle.close() |
171 gff_handle.close() | 194 gff_handle.close() |
172 | 195 |
173 | 196 |
174 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), | 197 if num_threads == 1: |
175 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | 198 # Still want to call split_fasta to apply truncation, but |
199 # no reason to make multiple files - and more chance of | |
200 # hitting file system glitches if we do. So, | |
201 FASTA_CHUNK = sys.maxsize | |
202 | |
203 fasta_files = split_fasta( | |
204 fasta_file, | |
205 os.path.join(tmp_dir, "signalp"), | |
206 n=FASTA_CHUNK, | |
207 truncate=truncate, | |
208 max_len=MAX_LEN, | |
209 ) | |
176 temp_files = [f + ".out" for f in fasta_files] | 210 temp_files = [f + ".out" for f in fasta_files] |
177 assert len(fasta_files) == len(temp_files) | 211 assert len(fasta_files) == len(temp_files) |
178 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | 212 jobs = [ |
179 for (fasta, temp) in zip(fasta_files, temp_files)] | 213 "signalp -short -t %s %s > %s" % (organism, fasta, temp) |
214 for (fasta, temp) in zip(fasta_files, temp_files) | |
215 ] | |
180 assert len(fasta_files) == len(temp_files) == len(jobs) | 216 assert len(fasta_files) == len(temp_files) == len(jobs) |
181 | 217 |
182 | 218 |
183 def clean_up(file_list): | 219 def clean_up(file_list): |
184 """Remove temp files, and if possible the temp directory.""" | 220 """Remove temp files, and if possible the temp directory.""" |
203 except IOError: | 239 except IOError: |
204 output = "(no output)" | 240 output = "(no output)" |
205 if error_level or output.lower().startswith("error running"): | 241 if error_level or output.lower().startswith("error running"): |
206 clean_up(fasta_files + temp_files) | 242 clean_up(fasta_files + temp_files) |
207 if output: | 243 if output: |
208 sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) | 244 sys.stderr.write( |
245 "One or more tasks failed, e.g. %i from %r gave:\n%s" | |
246 % (error_level, cmd, output) | |
247 ) | |
209 else: | 248 else: |
210 sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) | 249 sys.stderr.write( |
250 "One or more tasks failed, e.g. %i from %r with no output\n" | |
251 % (error_level, cmd) | |
252 ) | |
211 sys.exit(error_level) | 253 sys.exit(error_level) |
212 del results | 254 del results |
213 | 255 |
214 out_handle = open(tabular_file, "w") | 256 out_handle = open(tabular_file, "w") |
215 fields = ["ID"] | 257 fields = ["ID"] |
216 # NN results: | 258 # NN results: |
217 for name in ["Cmax", "Ymax", "Smax"]: | 259 for name in ["Cmax", "Ymax", "Smax"]: |
218 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) | 260 fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) |
219 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) | 261 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) |
220 # HMM results: | 262 # HMM results: |
221 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", | 263 fields.extend( |
222 "HMM_Sprob_score", "HMM_Sprob_pred"]) | 264 [ |
265 "HMM_type", | |
266 "HMM_Cmax_score", | |
267 "HMM_Cmax_pos", | |
268 "HMM_Cmax_pred", | |
269 "HMM_Sprob_score", | |
270 "HMM_Sprob_pred", | |
271 ] | |
272 ) | |
223 out_handle.write("#" + "\t".join(fields) + "\n") | 273 out_handle.write("#" + "\t".join(fields) + "\n") |
224 for temp in temp_files: | 274 for temp in temp_files: |
225 data_handle = open(temp) | 275 data_handle = open(temp) |
226 clean_tabular(data_handle, out_handle) | 276 clean_tabular(data_handle, out_handle) |
227 data_handle.close() | 277 data_handle.close() |