Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 20:a19b3ded8f33 draft
v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
author | peterjc |
---|---|
date | Thu, 21 Sep 2017 11:35:20 -0400 |
parents | f3ecd80850e2 |
children | 238eae32483c |
comparison
equal
deleted
inserted
replaced
19:f3ecd80850e2 | 20:a19b3ded8f33 |
---|---|
50 itself (see the SignalP XML file for settings). | 50 itself (see the SignalP XML file for settings). |
51 | 51 |
52 Finally, you can opt to have a GFF3 file produced which will describe the | 52 Finally, you can opt to have a GFF3 file produced which will describe the |
53 predicted signal peptide and mature peptide for each protein (using one of | 53 predicted signal peptide and mature peptide for each protein (using one of |
54 the predictors which gives a cleavage site). *WORK IN PROGRESS* | 54 the predictors which gives a cleavage site). *WORK IN PROGRESS* |
55 """ | 55 """ # noqa: E501 |
56 | |
57 from __future__ import print_function | |
58 | |
59 import os | |
56 import sys | 60 import sys |
57 import os | |
58 import tempfile | 61 import tempfile |
59 from seq_analysis_utils import split_fasta, fasta_iterator | 62 |
63 from seq_analysis_utils import fasta_iterator, split_fasta | |
60 from seq_analysis_utils import run_jobs, thread_count | 64 from seq_analysis_utils import run_jobs, thread_count |
61 | 65 |
62 FASTA_CHUNK = 500 | 66 FASTA_CHUNK = 500 |
63 MAX_LEN = 6000 # Found by trial and error | 67 MAX_LEN = 6000 # Found by trial and error |
68 | |
69 if "-v" in sys.argv or "--version" in sys.argv: | |
70 print("SignalP Galaxy wrapper version 0.0.19") | |
71 sys.exit(os.system("signalp -version")) | |
64 | 72 |
65 if len(sys.argv) not in [6, 8]: | 73 if len(sys.argv) not in [6, 8]: |
66 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " | 74 sys.exit("Require five (or 7) arguments, organism, truncate, threads, " |
67 "input protein FASTA file & output tabular file (plus " | 75 "input protein FASTA file & output tabular file (plus " |
68 "optionally cut method and GFF3 output file). " | 76 "optionally cut method and GFF3 output file). " |
94 | 102 |
95 | 103 |
96 tmp_dir = tempfile.mkdtemp() | 104 tmp_dir = tempfile.mkdtemp() |
97 | 105 |
98 | 106 |
99 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | 107 def clean_tabular(raw_handle, out_handle, gff_handle=None): |
100 """Clean up SignalP output to make it tabular.""" | 108 """Clean up SignalP output to make it tabular.""" |
101 if cut_method: | |
102 cut_col = {"NN_Cmax": 2, | |
103 "NN_Ymax": 5, | |
104 "NN_Smax": 8, | |
105 "HMM_Cmax": 16}[cut_method] | |
106 else: | |
107 cut_col = None | |
108 for line in raw_handle: | 109 for line in raw_handle: |
109 if not line or line.startswith("#"): | 110 if not line or line.startswith("#"): |
110 continue | 111 continue |
111 parts = line.rstrip("\r\n").split() | 112 parts = line.rstrip("\r\n").split() |
112 assert len(parts) == 21, repr(line) | 113 assert len(parts) == 21, repr(line) |
117 parts = parts[14:15] + parts[1:14] + parts[15:] | 118 parts = parts[14:15] + parts[1:14] + parts[15:] |
118 out_handle.write("\t".join(parts) + "\n") | 119 out_handle.write("\t".join(parts) + "\n") |
119 | 120 |
120 | 121 |
121 def make_gff(fasta_file, tabular_file, gff_file, cut_method): | 122 def make_gff(fasta_file, tabular_file, gff_file, cut_method): |
123 """Make a GFF file.""" | |
122 cut_col, score_col = {"NN_Cmax": (2, 1), | 124 cut_col, score_col = {"NN_Cmax": (2, 1), |
123 "NN_Ymax": (5, 4), | 125 "NN_Ymax": (5, 4), |
124 "NN_Smax": (8, 7), | 126 "NN_Smax": (8, 7), |
125 "HMM_Cmax": (16, 15), | 127 "HMM_Cmax": (16, 15), |
126 }[cut_method] | 128 }[cut_method] |
150 # TODO - Why does it do this? | 152 # TODO - Why does it do this? |
151 cut = 1 | 153 cut = 1 |
152 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | 154 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) |
153 score = parts[score_col] | 155 score = parts[score_col] |
154 gff_handle.write("##sequence-region %s %i %i\n" | 156 gff_handle.write("##sequence-region %s %i %i\n" |
155 % (seqid, 1, len(seq))) | 157 % (seqid, 1, len(seq))) |
156 # If the cut is at the very begining, there is no signal peptide! | 158 # If the cut is at the very begining, there is no signal peptide! |
157 if cut > 1: | 159 if cut > 1: |
158 # signal_peptide = SO:0000418 | 160 # signal_peptide = SO:0000418 |
159 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" | 161 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" |
160 % (seqid, source, | 162 % (seqid, source, |
186 try: | 188 try: |
187 os.rmdir(tmp_dir) | 189 os.rmdir(tmp_dir) |
188 except Exception: | 190 except Exception: |
189 pass | 191 pass |
190 | 192 |
193 | |
191 if len(jobs) > 1 and num_threads > 1: | 194 if len(jobs) > 1 and num_threads > 1: |
192 # A small "info" message for Galaxy to show the user. | 195 # A small "info" message for Galaxy to show the user. |
193 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 196 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))) |
194 results = run_jobs(jobs, num_threads) | 197 results = run_jobs(jobs, num_threads) |
195 assert len(fasta_files) == len(temp_files) == len(jobs) | 198 assert len(fasta_files) == len(temp_files) == len(jobs) |
196 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 199 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): |
197 error_level = results[cmd] | 200 error_level = results[cmd] |
198 try: | 201 try: |
199 output = open(temp).readline() | 202 output = open(temp).readline() |
200 except IOError: | 203 except IOError: |
201 output = "(no output)" | 204 output = "(no output)" |
202 if error_level or output.lower().startswith("error running"): | 205 if error_level or output.lower().startswith("error running"): |
203 clean_up(fasta_files + temp_files) | 206 clean_up(fasta_files + temp_files) |
204 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 207 if output: |
205 error_level) | 208 sys.stderr.write("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output)) |
209 else: | |
210 sys.stderr.write("One or more tasks failed, e.g. %i from %r with no output\n" % (error_level, cmd)) | |
211 sys.exit(error_level) | |
206 del results | 212 del results |
207 | 213 |
208 out_handle = open(tabular_file, "w") | 214 out_handle = open(tabular_file, "w") |
209 fields = ["ID"] | 215 fields = ["ID"] |
210 # NN results: | 216 # NN results: |