Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/wolf_psort.py @ 5:0f1c61998b22
Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 18:06:27 -0400 |
parents | |
children | a290c6d4e658 |
comparison
equal
deleted
inserted
replaced
4:81caef04ce8b | 5:0f1c61998b22 |
---|---|
1 #!/usr/bin/env python | |
2 """Wrapper for WoLF PSORT v0.2 for use in Galaxy. | |
3 | |
4 This script takes exactly four command line arguments: | |
5 * the organism type (animal, plant or fungi) | |
6 * number of threads to use (integer) | |
7 * an input protein FASTA filename | |
8 * output tabular filename. | |
9 | |
10 It then calls the standalone WoLF PSORT v0.2 program runWolfPsortSummary | |
11 (not the webservice), and converts the output from something like this: | |
12 | |
13 # k used for kNN is: 27 | |
14 gi|301087619|ref|XP_002894699.1| extr 12, mito 4, E.R. 3, golg 3, mito_nucl 3 | |
15 gi|301087623|ref|XP_002894700.1| extr 21, mito 2, cyto 2, cyto_mito 2 | |
16 | |
17 In order to make it easier to use in Galaxy, this wrapper script reformats | |
18 this to use tab separators, with one line per compartment prediction: | |
19 | |
20 #ID Compartment Score Rank | |
21 gi|301087619|ref|XP_002894699.1| extr 12 1 | |
22 gi|301087619|ref|XP_002894699.1| mito 4 2 | |
23 gi|301087619|ref|XP_002894699.1| E.R. 3 3 | |
24 gi|301087619|ref|XP_002894699.1| golg 3 4 | |
25 gi|301087619|ref|XP_002894699.1| mito_nucl 3 5 | |
26 gi|301087623|ref|XP_002894700.1| extr 21 1 | |
27 gi|301087623|ref|XP_002894700.1| mito 2 2 | |
28 gi|301087623|ref|XP_002894700.1| cyto 2 3 | |
29 gi|301087623|ref|XP_002894700.1| cyto_mito 2 4 | |
30 | |
31 Additionally, in order to take full advantage of multiple cores, the input FASTA | |
32 file is subdivided and multiple copies of WoLF PSORT are run in parallel. I would | |
33 normally use Python's multiprocessing library in this situation but it requires | |
34 at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. | |
35 """ | |
36 import sys | |
37 import os | |
38 from seq_analysis_utils import stop_err, split_fasta, run_jobs | |
39 | |
40 FASTA_CHUNK = 500 | |
41 exe = "runWolfPsortSummary" | |
42 | |
43 """ | |
44 Note: I had trouble getting runWolfPsortSummary on the path, so used a wrapper | |
45 python script called runWolfPsortSummary as follows: | |
46 | |
47 #!/usr/bin/env python | |
48 #Wrapper script to call WoLF PSORT from its own directory. | |
49 import os | |
50 import sys | |
51 import subprocess | |
52 saved_dir = os.path.abspath(os.curdir) | |
53 os.chdir("/opt/WoLFPSORT_package_v0.2/bin") | |
54 args = ["./runWolfPsortSummary"] + sys.argv[1:] | |
55 return_code = subprocess.call(args) | |
56 os.chdir(saved_dir) | |
57 sys.exit(return_code) | |
58 """ | |
59 | |
# Validate the command line: organism, thread count, input FASTA, output table.
if len(sys.argv) != 5:
    stop_err("Require four arguments, organism, threads, input protein FASTA file & output tabular file")

organism = sys.argv[1]
if organism not in ["animal", "plant", "fungi"]:
    stop_err("Organism argument %s is not one of animal, plant, fungi" % organism)

try:
    num_threads = int(sys.argv[2])
except ValueError:
    # Non-numeric input falls through to the range check below, which
    # produces the user-facing error message.
    num_threads = 0
if num_threads < 1:
    # Report the threads argument itself (argv[2]); the original message
    # mistakenly echoed the FASTA filename (argv[3]).
    stop_err("Threads argument %s is not a positive integer" % sys.argv[2])

fasta_file = sys.argv[3]

tabular_file = sys.argv[4]
77 | |
def clean_tabular(raw_handle, out_handle):
    """Clean up WoLF PSORT output to make it tabular.

    raw_handle is an iterable of text lines of the form
    "<id> <comp> <score>, <comp> <score>, ..." (lines starting with "#" are
    comments). For every compartment prediction one tab separated line
    "<id>, <comp>, <score>, <rank>" is written to out_handle, where rank is
    the 1-based position of the prediction within its input line.

    Blank and whitespace-only lines are skipped. A line that has an
    identifier but no predictions, or a prediction that is not exactly
    "<comp> <score>", still raises ValueError so corrupt output is noticed.
    """
    for line in raw_handle:
        # Lines read from a file keep their newline, so a blank line is
        # never an empty string; strip before testing for emptiness.
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        name, data = line.split(None, 1)
        for rank, comp_data in enumerate(data.split(",")):
            comp, score = comp_data.split()
            out_handle.write("%s\t%s\t%s\t%i\n"
                             % (name, comp, score, rank + 1))
88 | |
# Split the input into chunks of FASTA_CHUNK records, then pair every chunk
# with its own raw output file and the shell command that produces it.
fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK)
temp_files = []
jobs = []
for chunk_name in fasta_files:
    raw_name = chunk_name + ".out"
    temp_files.append(raw_name)
    # The tool reads the chunk on stdin and writes its report to stdout.
    jobs.append("%s %s < %s > %s" % (exe, organism, chunk_name, raw_name))
# One output file and one command per chunk.
assert len(fasta_files) == len(temp_files) == len(jobs)
95 | |
def clean_up(file_list):
    """Delete each path in file_list that currently is a regular file.

    Paths that do not exist, or that are not regular files (for example
    directories), are left untouched.
    """
    for candidate in file_list:
        if not os.path.isfile(candidate):
            continue
        os.remove(candidate)
100 | |
if len(jobs) > 1 and num_threads > 1:
    # A small "info" message for Galaxy to show the user.
    sys.stdout.write("Using %i threads for %i tasks\n"
                     % (min(num_threads, len(jobs)), len(jobs)))
results = run_jobs(jobs, num_threads)
assert len(temp_files) == len(jobs)
for temp, cmd in zip(temp_files, jobs):
    error_level = results[cmd]
    # Peek at the first line of the raw output; a missing or unreadable
    # file is treated as empty output.
    try:
        peek_handle = open(temp)
        try:
            output = peek_handle.readline()
        finally:
            peek_handle.close()
    except IOError:
        output = ""
    if error_level or output.lower().startswith("error running"):
        clean_up(fasta_files)
        clean_up(temp_files)
        # A job can print "error running" yet return 0; never pass 0 on.
        # NOTE(review): assumes stop_err's second argument is the process
        # exit status - confirm against seq_analysis_utils.
        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                 error_level or 1)
del results

# Merge the per-chunk results into the final table. The nested try/finally
# blocks (rather than "with") keep the script usable on old Python 2
# releases while still closing every handle and removing the temporary
# files even if reformatting fails part way through.
try:
    out_handle = open(tabular_file, "w")
    try:
        out_handle.write("#ID\tCompartment\tScore\tRank\n")
        for temp in temp_files:
            data_handle = open(temp)
            try:
                clean_tabular(data_handle, out_handle)
            finally:
                data_handle.close()
    finally:
        out_handle.close()
finally:
    clean_up(fasta_files)
    clean_up(temp_files)