Mercurial > repos > peterjc > tmhmm_and_signalp
annotate tools/protein_analysis/seq_analysis_utils.py @ 5:0f1c61998b22
Migrated tool version 0.0.8 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 18:06:27 -0400 |
parents | f3b373a41f81 |
children | a290c6d4e658 |
rev | line source |
---|---|
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
1 """A few useful functions for working with FASTA files and running jobs. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
2 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
3 This module was originally written to hold common code used in both the TMHMM |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
4 and SignalP wrappers in Galaxy. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
5 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
6 Given Galaxy currently supports Python 2.4+ this cannot use the Python module |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
7 multiprocessing so the function run_jobs instead is a simple pool approach |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
8 using just the subprocess library. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
9 """ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
10 import sys |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
11 import os |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
12 import subprocess |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
13 from time import sleep |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
14 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
15 __version__ = "0.0.1" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
16 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
17 def stop_err(msg, error_level=1): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
18 """Print error message to stdout and quit with given error level.""" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
19 sys.stderr.write("%s\n" % msg) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
20 sys.exit(error_level) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
21 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
22 def fasta_iterator(filename, max_len=None, truncate=None): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
23 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
24 handle = open(filename) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
25 title, seq = "", "" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
26 for line in handle: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
27 if line.startswith(">"): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
28 if title: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
29 if truncate: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
30 seq = seq[:truncate] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
31 if max_len and len(seq) > max_len: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
32 raise ValueError("Sequence %s is length %i, max length %i" \ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
33 % (title.split()[0], len(seq), max_len)) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
34 yield title, seq |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
35 title = line[1:].rstrip() |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
36 seq = "" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
37 elif title: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
38 seq += line.strip() |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
39 elif not line.strip() or line.startswith("#"): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
40 #Ignore blank lines, and any comment lines |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
41 #between records (starting with hash). |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
42 pass |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
43 else: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
44 raise ValueError("Bad FASTA line %r" % line) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
45 handle.close() |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
46 if title: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
47 if truncate: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
48 seq = seq[:truncate] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
49 if max_len and len(seq) > max_len: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
50 raise ValueError("Sequence %s is length %i, max length %i" \ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
51 % (title.split()[0], len(seq), max_len)) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
52 yield title, seq |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
53 raise StopIteration |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
54 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
55 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
56 """Split FASTA file into sub-files each of at most n sequences. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
57 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
58 Returns a list of the filenames used (based on the input filename). |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
59 Each sequence can also be truncated (since we only need the start for |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
60 SignalP), and have its description discarded (since we don't usually |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
61 care about it and some tools don't like very long title lines). |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
62 |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
63 If a max_len is given and any sequence exceeds it no temp files are |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
64 created and an exception is raised. |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
65 """ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
66 iterator = fasta_iterator(input_filename, max_len, truncate) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
67 files = [] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
68 try: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
69 while True: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
70 records = [] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
71 for i in range(n): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
72 try: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
73 records.append(iterator.next()) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
74 except StopIteration: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
75 break |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
76 if not records: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
77 break |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
78 new_filename = "%s.%i.tmp" % (output_filename_base, len(files)) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
79 handle = open(new_filename, "w") |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
80 if keep_descr: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
81 for title, seq in records: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
82 handle.write(">%s\n" % title) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
83 for i in range(0, len(seq), 60): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
84 handle.write(seq[i:i+60] + "\n") |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
85 else: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
86 for title, seq in records: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
87 handle.write(">%s\n" % title.split()[0]) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
88 for i in range(0, len(seq), 60): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
89 handle.write(seq[i:i+60] + "\n") |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
90 handle.close() |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
91 files.append(new_filename) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
92 #print "%i records in %s" % (len(records), new_filename) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
93 except ValueError, err: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
94 #Max length failure from parser - clean up |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
95 try: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
96 handle.close() |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
97 except: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
98 pass |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
99 for f in files: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
100 if os.path.isfile(f): |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
101 os.remove(f) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
102 raise err |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
103 return files |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
104 |
3
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
105 def run_jobs(jobs, threads, verbose=False): |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
106 """Takes list of cmd strings, returns dict with error levels.""" |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
107 pending = jobs[:] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
108 running = [] |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
109 results = {} |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
110 while pending or running: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
111 #See if any have finished |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
112 for (cmd, process) in running: |
3
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
113 return_code = process.poll() #non-blocking |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
114 if return_code is not None: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
115 results[cmd] = return_code |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
116 running = [(cmd, process) for (cmd, process) in running \ |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
117 if cmd not in results] |
3
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
118 if verbose: |
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
119 print "%i jobs pending, %i running, %i completed" \ |
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
120 % (len(pending), len(running), len(results)) |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
121 #See if we can start any new threads |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
122 while pending and len(running) < threads: |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
123 cmd = pending.pop(0) |
3
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
124 if verbose: |
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
125 print cmd |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
126 process = subprocess.Popen(cmd, shell=True) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
127 running.append((cmd, process)) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
128 #Loop... |
3
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
129 sleep(10) |
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
130 if verbose: |
f3b373a41f81
Migrated tool version 0.0.6 from old tool shed archive to new tool shed repository
peterjc
parents:
0
diff
changeset
|
131 print "%i jobs completed" % len(results) |
0
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
132 assert set(jobs) == set(results) |
bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
peterjc
parents:
diff
changeset
|
133 return results |