Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 0:bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 18:03:34 -0400 |
parents | |
children | 0f1c61998b22 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:bca9bc7fdaef |
---|---|
1 #!/usr/bin/env python | |
2 """Wrapper for SignalP v3.0 for use in Galaxy. | |
3 | |
4 This script takes exactly fives command line arguments: | |
5 * the organism type (euk, gram+ or gram-) | |
6 * length to truncate sequences to (integer) | |
7 * number of threads to use (integer) | |
8 * an input protein FASTA filename | |
9 * output tabular filename. | |
10 | |
11 It then calls the standalone SignalP v3.0 program (not the webservice) | |
12 requesting the short output (one line per protein) using both NN and HMM | |
13 for predictions. | |
14 | |
15 First major feature is cleaning up the output. The raw output from SignalP | |
16 v3.0 looks like this (21 columns space separated): | |
17 | |
18 # SignalP-NN euk predictions # SignalP-HMM euk predictions | |
19 # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? | |
20 gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N | |
21 gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N | |
22 gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N | |
23 gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N | |
24 | |
25 In order to make it easier to use in Galaxy, this wrapper script reformats | |
26 this to use tab separators. Also it removes the redundant truncated name | |
27 column, and assigns unique column names in the header: | |
28 | |
29 #ID NN_Cmax_score NN_Cmax_pos NN_Cmax_pred NN_Ymax_score NN_Ymax_pos NN_Ymax_pred NN_Smax_score NN_Smax_pos NN_Smax_pred NN_Smean_score NN_Smean_pred NN_D_score NN_D_pred HMM_bang HMM_Cmax_score HMM_Cmax_pos HMM_Cmax_pred HMM_Sprob_score HMM_Sprob_pred | |
30 gi|2781234|pdb|1JLY|B 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N Q 0.000 17 N 0.000 N | |
31 gi|4959044|gb|AAD34209.1|AF069992_1 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N Q 0.000 0 N 0.000 N | |
32 gi|671626|emb|CAA85685.1| 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N Q 0.000 0 N 0.000 N | |
33 gi|3298468|dbj|BAA31520.1| 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N Q 0.066 24 N 0.139 N | |
34 | |
35 The second major feature is overcoming SignalP's built in limit of 4000 | |
36 sequences by breaking up the input FASTA file into chunks. This also allows | |
37 us to pre-trim the sequences since SignalP only needs their starts. | |
38 | |
39 The third major feature is taking advantage of multiple cores (since SignalP | |
40 v3.0 itself is single threaded) by using the individual FASTA input files to | |
41 run multiple copies of TMHMM in parallel. I would normally use Python's | |
42 multiprocessing library in this situation but it requires at least Python 2.6 | |
43 and at the time of writing Galaxy still supports Python 2.4. | |
44 """ | |
45 import sys | |
46 import os | |
47 from seq_analysis_utils import stop_err, split_fasta, run_jobs | |
48 | |
49 FASTA_CHUNK = 500 | |
50 MAX_LEN = 6000 #Found by trial and error | |
51 | |
52 if len(sys.argv) != 6: | |
53 stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") | |
54 | |
55 organism = sys.argv[1] | |
56 if organism not in ["euk", "gram+", "gram-"]: | |
57 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | |
58 | |
59 try: | |
60 truncate = int(sys.argv[2]) | |
61 except: | |
62 truncate = 0 | |
63 if truncate < 0: | |
64 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | |
65 | |
66 try: | |
67 num_threads = int(sys.argv[3]) | |
68 except: | |
69 num_threads = 0 | |
70 if num_threads < 1: | |
71 stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) | |
72 | |
73 fasta_file = sys.argv[4] | |
74 | |
75 tabular_file = sys.argv[5] | |
76 | |
77 def clean_tabular(raw_handle, out_handle): | |
78 """Clean up SignalP output to make it tabular.""" | |
79 for line in raw_handle: | |
80 if not line or line.startswith("#"): | |
81 continue | |
82 parts = line.rstrip("\r\n").split() | |
83 assert len(parts)==21, repr(line) | |
84 assert parts[14].startswith(parts[0]) | |
85 #Remove redundant truncated name column (col 0) | |
86 #and put full name at start (col 14) | |
87 parts = parts[14:15] + parts[1:14] + parts[15:] | |
88 out_handle.write("\t".join(parts) + "\n") | |
89 | |
90 fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) | |
91 temp_files = [f+".out" for f in fasta_files] | |
92 assert len(fasta_files) == len(temp_files) | |
93 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) | |
94 for (fasta, temp) in zip(fasta_files, temp_files)] | |
95 assert len(fasta_files) == len(temp_files) == len(jobs) | |
96 | |
97 def clean_up(file_list): | |
98 for f in file_list: | |
99 if os.path.isfile(f): | |
100 os.remove(f) | |
101 | |
102 if len(jobs) > 1 and num_threads > 1: | |
103 #A small "info" message for Galaxy to show the user. | |
104 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | |
105 results = run_jobs(jobs, num_threads) | |
106 assert len(fasta_files) == len(temp_files) == len(jobs) | |
107 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | |
108 error_level = results[cmd] | |
109 try: | |
110 output = open(temp).readline() | |
111 except IOError: | |
112 output = "" | |
113 if error_level or output.lower().startswith("error running"): | |
114 clean_up(fasta_files) | |
115 clean_up(temp_files) | |
116 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | |
117 error_level) | |
118 del results | |
119 | |
120 out_handle = open(tabular_file, "w") | |
121 fields = ["ID"] | |
122 #NN results: | |
123 for name in ["Cmax", "Ymax", "Smax"]: | |
124 fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) | |
125 fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) | |
126 #HMM results: | |
127 fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", | |
128 "HMM_Sprob_score", "HMM_Sprob_pred"]) | |
129 out_handle.write("#" + "\t".join(fields) + "\n") | |
130 for temp in temp_files: | |
131 data_handle = open(temp) | |
132 clean_tabular(data_handle, out_handle) | |
133 data_handle.close() | |
134 out_handle.close() | |
135 | |
136 clean_up(fasta_files) | |
137 clean_up(temp_files) |