Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/seq_analysis_utils.py @ 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:46:42 -0500 |
parents | eb6ac44d4b8e |
children | a19b3ded8f33 |
rev | line source |
---|---|
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
1 """A few useful functions for working with FASTA files and running jobs. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
2 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
3 This module was originally written to hold common code used in both the TMHMM |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
4 and SignalP wrappers in Galaxy. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
5 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
6 Given Galaxy currently supports Python 2.4+ this cannot use the Python module |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
7 multiprocessing so the function run_jobs instead is a simple pool approach |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
8 using just the subprocess library. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
9 """ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
10 import sys |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
11 import os |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
12 import subprocess |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
13 from time import sleep |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
14 |
19 | 15 __version__ = "0.0.2" |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
16 |
try:
    from multiprocessing import cpu_count
except ImportError:
    # No multiprocessing module (e.g. under Python 2.5), so fall back on a
    # local copy of its CPU counting logic.
    def cpu_count():
        """Return the number of CPUs in the system.

        Raises NotImplementedError if the number cannot be determined.
        """
        platform = sys.platform
        if platform == 'win32':
            # Windows exposes the count via an environment variable.
            try:
                count = int(os.environ['NUMBER_OF_PROCESSORS'])
            except (ValueError, KeyError):
                count = 0
        elif platform == 'darwin' or 'bsd' in platform:
            # Ask sysctl; the binary lives under /usr on Mac OS X.
            sysctl = '/sbin/sysctl -n hw.ncpu'
            if platform == 'darwin':
                sysctl = '/usr' + sysctl
            try:
                with os.popen(sysctl) as pipe:
                    count = int(pipe.read())
            except ValueError:
                count = 0
        else:
            try:
                count = os.sysconf('SC_NPROCESSORS_ONLN')
            except (ValueError, OSError, AttributeError):
                count = 0

        if count < 1:
            raise NotImplementedError('cannot determine number of cpus')
        return count
47 | |
48 | |
def thread_count(command_line_arg, default=1):
    """Turn a command line threads argument into a usable thread count.

    Arguments:
     - command_line_arg - the requested number of threads, typically a
       string taken from the command line.
     - default - value used when the argument is missing (None) or cannot
       be parsed as an integer.

    Returns the requested number of threads, capped at the number of CPUs
    reported by cpu_count() when that can be determined.

    Calls sys.exit() with an error message if the resulting number is less
    than one.
    """
    try:
        num = int(command_line_arg)
    except (TypeError, ValueError):
        # Missing (None) or non-numeric argument, fall back on the default.
        num = default
    if num < 1:
        # Wrap in a tuple so that a tuple argument cannot break the formatting.
        sys.exit("Threads argument %r is not a positive integer" % (command_line_arg,))
    # Cap this with the physical limit of the machine,
    try:
        num = min(num, cpu_count())
    except NotImplementedError:
        # CPU count unknown, so trust the requested value.
        pass
    return num
65 | |
66 | |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
def _finish_fasta_record(title, chunks, max_len, truncate):
    """Join the sequence chunks of one record, then truncate and length check it."""
    seq = "".join(chunks)
    if truncate:
        seq = seq[:truncate]
    if max_len and len(seq) > max_len:
        raise ValueError("Sequence %s is length %i, max length %i"
                         % (title.split()[0], len(seq), max_len))
    return title, seq


def fasta_iterator(filename, max_len=None, truncate=None):
    """Simple FASTA parser yielding tuples of (title, sequence) strings.

    Arguments:
     - filename - path of the FASTA file to read.
     - max_len - if set, a ValueError is raised for any sequence longer
       than this (checked after any truncation).
     - truncate - if set, each sequence is cut down to at most this many
       letters.

    The title is the text after the ">" with trailing whitespace removed.
    Blank lines and lines starting with "#" are ignored before the first
    record; any other text before the first record raises a ValueError.

    The input file is closed when parsing finishes, when an exception is
    raised, or when the generator itself is closed.
    """
    title = ""
    chunks = []
    with open(filename) as handle:
        for line in handle:
            if line.startswith(">"):
                if title:
                    yield _finish_fasta_record(title, chunks, max_len, truncate)
                title = line[1:].rstrip()
                chunks = []
            elif title:
                # Collect the pieces and join once, avoids repeated
                # string concatenation on long sequences.
                chunks.append(line.strip())
            elif not line.strip() or line.startswith("#"):
                # Ignore blank lines, and any comment lines
                # between records (starting with hash).
                pass
            else:
                raise ValueError("Bad FASTA line %r" % line)
    if title:
        yield _finish_fasta_record(title, chunks, max_len, truncate)
    # Simply falling off the end finishes the generator. Raising
    # StopIteration explicitly would become a RuntimeError (PEP 479).
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
100 |
19 | 101 |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None):
    """Split FASTA file into sub-files each of at most n sequences.

    Returns a list of the filenames used, which are named
    "<output_filename_base>.<index>.tmp" with the index counting from zero.

    Each sequence can also be truncated (since we only need the start for
    SignalP), and have its description discarded (since we don't usually
    care about it and some tools don't like very long title lines).
    Sequences are written wrapped at 60 letters per line.

    If the parser raises a ValueError (for example a max_len is given and
    a sequence exceeds it), any sub-files already written are removed and
    the exception is re-raised.
    """
    iterator = fasta_iterator(input_filename, max_len, truncate)
    files = []
    try:
        while True:
            # Pull up to n records from the parser for the next sub-file.
            records = []
            while len(records) < n:
                try:
                    records.append(next(iterator))
                except StopIteration:
                    break
            if not records:
                break
            new_filename = "%s.%i.tmp" % (output_filename_base, len(files))
            # The with statement closes the file even if a write fails.
            with open(new_filename, "w") as handle:
                for title, seq in records:
                    if not keep_descr:
                        # Keep just the identifier (first word of the title).
                        title = title.split()[0]
                    handle.write(">%s\n" % title)
                    for start in range(0, len(seq), 60):
                        handle.write(seq[start:start + 60] + "\n")
            files.append(new_filename)
    except ValueError:
        # Max length failure from parser - clean up
        for f in files:
            if os.path.isfile(f):
                os.remove(f)
        # Bare raise preserves the original traceback.
        raise
    for f in files:
        assert os.path.isfile(f), "Missing split file %r (!??)" % f
    return files
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
153 |
19 | 154 |
6
a290c6d4e658
Migrated tool version 0.0.9 from old tool shed archive to new tool shed repository
peterjc
parents:
3
diff
changeset
|
def run_jobs(jobs, threads, pause=10, verbose=False):
    """Takes list of cmd strings, returns dict with error levels.

    Arguments:
     - jobs - list of command line strings. Each one is run via the shell
       (shell=True), so they must come from a trusted source.
     - threads - maximum number of commands to run at the same time.
     - pause - seconds to sleep between checks for finished commands.
     - verbose - if true, report progress on stdout.

    Returns a dictionary mapping each command string to its return code.
    As the commands are the keys, repeated commands share a single entry.

    Raises ValueError if there are jobs to run but threads is below one
    (which would otherwise wait forever without starting anything).
    """
    results = {}
    if threads == 1:
        # Special case this for speed, don't need the waits
        for cmd in jobs:
            results[cmd] = subprocess.call(cmd, shell=True)
        return results
    if jobs and threads < 1:
        raise ValueError("Need at least one thread to run jobs, got %r" % (threads,))
    pending = list(jobs)
    running = []
    while pending or running:
        # See if any have finished. Decide per process (not per command
        # string) so a still running duplicate command is not forgotten.
        still_running = []
        for cmd, process in running:
            return_code = process.poll()  # non-blocking
            if return_code is None:
                still_running.append((cmd, process))
            else:
                results[cmd] = return_code
        running = still_running
        if verbose:
            print("%i jobs pending, %i running, %i completed"
                  % (len(pending), len(running), len(results)))
        # See if we can start any new threads
        while pending and len(running) < threads:
            cmd = pending.pop(0)
            if verbose:
                print(cmd)
            running.append((cmd, subprocess.Popen(cmd, shell=True)))
        # Only wait if there is something left to wait for, otherwise
        # we would sleep needlessly after the final job has finished.
        if running:
            sleep(pause)
    if verbose:
        print("%i jobs completed" % len(results))
    assert set(jobs) == set(results)
    return results