Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft
Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author | peterjc |
---|---|
date | Mon, 30 Jul 2012 10:25:07 -0400 |
parents | 0f1c61998b22 |
children | e52220a9ddad |
comparison
equal
deleted
inserted
replaced
6:a290c6d4e658 | 7:9b45a8743100 |
---|---|
2 """Wrapper for SignalP v3.0 for use in Galaxy. | 2 """Wrapper for SignalP v3.0 for use in Galaxy. |
3 | 3 |
4 This script takes exactly five command line arguments: | 4 This script takes exactly five command line arguments: |
5 * the organism type (euk, gram+ or gram-) | 5 * the organism type (euk, gram+ or gram-) |
6 * length to truncate sequences to (integer) | 6 * length to truncate sequences to (integer) |
7 * number of threads to use (integer) | 7 * number of threads to use (integer, defaults to one) |
8 * an input protein FASTA filename | 8 * an input protein FASTA filename |
9 * output tabular filename. | 9 * output tabular filename. |
10 | |
11 There are two further optional arguments | |
12 * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax) | |
13 * output GFF3 filename | |
10 | 14 |
11 It then calls the standalone SignalP v3.0 program (not the webservice) | 15 It then calls the standalone SignalP v3.0 program (not the webservice) |
12 requesting the short output (one line per protein) using both NN and HMM | 16 requesting the short output (one line per protein) using both NN and HMM |
13 for predictions. | 17 for predictions. |
14 | 18 |
39 The third major feature is taking advantage of multiple cores (since SignalP | 43 The third major feature is taking advantage of multiple cores (since SignalP |
40 v3.0 itself is single threaded) by using the individual FASTA input files to | 44 v3.0 itself is single threaded) by using the individual FASTA input files to |
41 run multiple copies of TMHMM in parallel. I would normally use Python's | 45 run multiple copies of TMHMM in parallel. I would normally use Python's |
42 multiprocessing library in this situation but it requires at least Python 2.6 | 46 multiprocessing library in this situation but it requires at least Python 2.6 |
43 and at the time of writing Galaxy still supports Python 2.4. | 47 and at the time of writing Galaxy still supports Python 2.4. |
48 | |
49 Note that this is somewhat redundant with job-splitting available in Galaxy | |
50 itself (see the SignalP XML file for settings). | |
51 | |
52 Finally, you can opt to have a GFF3 file produced which will describe the | |
53 predicted signal peptide and mature peptide for each protein (using one of | |
54 the predictors which gives a cleavage site). *WORK IN PROGRESS* | |
44 """ | 55 """ |
45 import sys | 56 import sys |
46 import os | 57 import os |
47 from seq_analysis_utils import stop_err, split_fasta, run_jobs | 58 import tempfile |
59 from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator | |
48 | 60 |
49 FASTA_CHUNK = 500 | 61 FASTA_CHUNK = 500 |
50 MAX_LEN = 6000 #Found by trial and error | 62 MAX_LEN = 6000 #Found by trial and error |
51 | 63 |
52 if len(sys.argv) != 6: | 64 if len(sys.argv) not in [6,8]: |
53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") | 65 stop_err("Require five (or 7) arguments, organism, truncate, threads, " |
66 "input protein FASTA file & output tabular file (plus " | |
67 "optionally cut method and GFF3 output file). " | |
68 "Got %i arguments." % (len(sys.argv)-1)) | |
54 | 69 |
55 organism = sys.argv[1] | 70 organism = sys.argv[1] |
56 if organism not in ["euk", "gram+", "gram-"]: | 71 if organism not in ["euk", "gram+", "gram-"]: |
57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 72 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) |
58 | 73 |
64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | 79 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) |
65 | 80 |
66 try: | 81 try: |
67 num_threads = int(sys.argv[3]) | 82 num_threads = int(sys.argv[3]) |
68 except: | 83 except: |
69 num_threads = 0 | 84 num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined |
70 if num_threads < 1: | 85 if num_threads < 1: |
71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) | 86 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) |
72 | 87 |
73 fasta_file = sys.argv[4] | 88 fasta_file = sys.argv[4] |
74 | 89 |
75 tabular_file = sys.argv[5] | 90 tabular_file = sys.argv[5] |
76 | 91 |
77 def clean_tabular(raw_handle, out_handle): | 92 if len(sys.argv) == 8: |
93 cut_method = sys.argv[6] | |
94 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: | |
95 stop_err("Invalid cut method %r" % cut_method) | |
96 gff3_file = sys.argv[7] | |
97 else: | |
98 cut_method = None | |
99 gff3_file = None | |
100 | |
101 | |
102 tmp_dir = tempfile.mkdtemp() | |
103 | |
104 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | |
78 """Clean up SignalP output to make it tabular.""" | 105 """Clean up SignalP output to make it tabular.""" |
106 if cut_method: | |
107 cut_col = {"NN_Cmax" : 2, | |
108 "NN_Ymax" : 5, | |
109 "NN_Smax" : 8, | |
110 "HMM_Cmax" : 16}[cut_method] | |
111 else: | |
112 cut_col = None | |
79 for line in raw_handle: | 113 for line in raw_handle: |
80 if not line or line.startswith("#"): | 114 if not line or line.startswith("#"): |
81 continue | 115 continue |
82 parts = line.rstrip("\r\n").split() | 116 parts = line.rstrip("\r\n").split() |
83 assert len(parts)==21, repr(line) | 117 assert len(parts)==21, repr(line) |
85 #Remove redundant truncated name column (col 0) | 119 #Remove redundant truncated name column (col 0) |
86 #and put full name at start (col 14) | 120 #and put full name at start (col 14) |
87 parts = parts[14:15] + parts[1:14] + parts[15:] | 121 parts = parts[14:15] + parts[1:14] + parts[15:] |
88 out_handle.write("\t".join(parts) + "\n") | 122 out_handle.write("\t".join(parts) + "\n") |
89 | 123 |
90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | 124 def make_gff(fasta_file, tabular_file, gff_file, cut_method): |
125 cut_col, score_col = {"NN_Cmax" : (2,1), | |
126 "NN_Ymax" : (5,4), | |
127 "NN_Smax" : (8,7), | |
128 "HMM_Cmax" : (16,15), | |
129 }[cut_method] | |
130 | |
131 source = "SignalP" | |
132 strand = "." #not stranded | |
133 phase = "." #not phased | |
134 tags = "Note=%s" % cut_method | |
135 | |
136 tab_handle = open(tabular_file) | |
137 line = tab_handle.readline() | |
138 assert line.startswith("#ID\t"), line | |
139 | |
140 gff_handle = open(gff_file, "w") | |
141 gff_handle.write("##gff-version 3\n") | |
142 | |
143 for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): | |
144 parts = line.rstrip("\n").split("\t") | |
145 seqid = parts[0] | |
146 assert title.startswith(seqid), "%s vs %s" % (seqid, title) | |
147 if len(seq)==0: | |
148 #Is it possible to have a zero length reference in GFF3? | |
149 continue | |
150 cut = int(parts[cut_col]) | |
151 if cut == 0: | |
152 assert cut_method == "HMM_Cmax", cut_method | |
153 #TODO - Why does it do this? | |
154 cut = 1 | |
155 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) | |
156 score = parts[score_col] | |
157 gff_handle.write("##sequence-region %s %i %i\n" \ | |
158 % (seqid, 1, len(seq))) | |
159 #If the cut is at the very begining, there is no signal peptide! | |
160 if cut > 1: | |
161 #signal_peptide = SO:0000418 | |
162 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | |
163 % (seqid, source, | |
164 "signal_peptide", 1, cut-1, | |
165 score, strand, phase, tags)) | |
166 #mature_protein_region = SO:0000419 | |
167 gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ | |
168 % (seqid, source, | |
169 "mature_protein_region", cut, len(seq), | |
170 score, strand, phase, tags)) | |
171 tab_handle.close() | |
172 gff_handle.close() | |
173 | |
174 | |
175 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), | |
176 n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | |
91 temp_files = [f+".out" for f in fasta_files] | 177 temp_files = [f+".out" for f in fasta_files] |
92 assert len(fasta_files) == len(temp_files) | 178 assert len(fasta_files) == len(temp_files) |
93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | 179 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) |
94 for (fasta, temp) in zip(fasta_files, temp_files)] | 180 for (fasta, temp) in zip(fasta_files, temp_files)] |
95 assert len(fasta_files) == len(temp_files) == len(jobs) | 181 assert len(fasta_files) == len(temp_files) == len(jobs) |
96 | 182 |
97 def clean_up(file_list): | 183 def clean_up(file_list): |
98 for f in file_list: | 184 for f in file_list: |
99 if os.path.isfile(f): | 185 if os.path.isfile(f): |
100 os.remove(f) | 186 os.remove(f) |
187 try: | |
188 os.rmdir(tmp_dir) | |
189 except: | |
190 pass | |
101 | 191 |
102 if len(jobs) > 1 and num_threads > 1: | 192 if len(jobs) > 1 and num_threads > 1: |
103 #A small "info" message for Galaxy to show the user. | 193 #A small "info" message for Galaxy to show the user. |
104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 194 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
105 results = run_jobs(jobs, num_threads) | 195 results = run_jobs(jobs, num_threads) |
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 197 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): |
108 error_level = results[cmd] | 198 error_level = results[cmd] |
109 try: | 199 try: |
110 output = open(temp).readline() | 200 output = open(temp).readline() |
111 except IOError: | 201 except IOError: |
112 output = "" | 202 output = "(no output)" |
113 if error_level or output.lower().startswith("error running"): | 203 if error_level or output.lower().startswith("error running"): |
114 clean_up(fasta_files) | 204 clean_up(fasta_files + temp_files) |
115 clean_up(temp_files) | |
116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 205 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
117 error_level) | 206 error_level) |
118 del results | 207 del results |
119 | 208 |
120 out_handle = open(tabular_file, "w") | 209 out_handle = open(tabular_file, "w") |
131 data_handle = open(temp) | 220 data_handle = open(temp) |
132 clean_tabular(data_handle, out_handle) | 221 clean_tabular(data_handle, out_handle) |
133 data_handle.close() | 222 data_handle.close() |
134 out_handle.close() | 223 out_handle.close() |
135 | 224 |
136 clean_up(fasta_files) | 225 #GFF3: |
137 clean_up(temp_files) | 226 if cut_method: |
227 make_gff(fasta_file, tabular_file, gff3_file, cut_method) | |
228 | |
229 clean_up(fasta_files + temp_files) |