Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/seq_analysis_utils.py @ 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 08:21:06 +0000 |
parents | a19b3ded8f33 |
children | e1996f0f4e85 |
comparison
equal
deleted
inserted
replaced
20:a19b3ded8f33 | 21:238eae32483c |
---|---|
14 import subprocess | 14 import subprocess |
15 import sys | 15 import sys |
16 | 16 |
17 from time import sleep | 17 from time import sleep |
18 | 18 |
19 __version__ = "0.0.2" | 19 if sys.version_info[0] < 3: |
 | 20 range = xrange # noqa: F821 |
 | 21 |
 | 22 __version__ = "0.0.4" |
20 | 23 |
21 try: | 24 try: |
22 from multiprocessing import cpu_count | 25 from multiprocessing import cpu_count |
23 except ImportError: | 26 except ImportError: |
24 # Must be under Python 2.5, this is copied from multiprocessing: | 27 # Must be under Python 2.5, this is copied from multiprocessing: |
25 def cpu_count(): | 28 def cpu_count(): |
26 """Returns the number of CPUs in the system.""" | 29 """Return the number of CPUs in the system.""" |
27 if sys.platform == 'win32': | 30 if sys.platform == "win32": |
28 try: | 31 try: |
29 num = int(os.environ['NUMBER_OF_PROCESSORS']) | 32 num = int(os.environ["NUMBER_OF_PROCESSORS"]) |
30 except (ValueError, KeyError): | 33 except (ValueError, KeyError): |
31 num = 0 | 34 num = 0 |
32 elif 'bsd' in sys.platform or sys.platform == 'darwin': | 35 elif "bsd" in sys.platform or sys.platform == "darwin": |
33 comm = '/sbin/sysctl -n hw.ncpu' | 36 comm = "/sbin/sysctl -n hw.ncpu" |
34 if sys.platform == 'darwin': | 37 if sys.platform == "darwin": |
35 comm = '/usr' + comm | 38 comm = "/usr" + comm |
36 try: | 39 try: |
37 with os.popen(comm) as p: | 40 with os.popen(comm) as p: |
38 num = int(p.read()) | 41 num = int(p.read()) |
39 except ValueError: | 42 except ValueError: |
40 num = 0 | 43 num = 0 |
41 else: | 44 else: |
42 try: | 45 try: |
43 num = os.sysconf('SC_NPROCESSORS_ONLN') | 46 num = os.sysconf("SC_NPROCESSORS_ONLN") |
44 except (ValueError, OSError, AttributeError): | 47 except (ValueError, OSError, AttributeError): |
45 num = 0 | 48 num = 0 |
46 | 49 |
47 if num >= 1: | 50 if num >= 1: |
48 return num | 51 return num |
49 else: | 52 else: |
50 raise NotImplementedError('cannot determine number of cpus') | 53 raise NotImplementedError("cannot determine number of cpus") |
51 | 54 |
52 | 55 |
53 def thread_count(command_line_arg, default=1): | 56 def thread_count(command_line_arg, default=1): |
54 """Determine number of threads to use from the command line args.""" | 57 """Determine number of threads to use from the command line args.""" |
55 try: | 58 try: |
68 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) | 71 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) |
69 return num | 72 return num |
70 | 73 |
71 | 74 |
72 def fasta_iterator(filename, max_len=None, truncate=None): | 75 def fasta_iterator(filename, max_len=None, truncate=None): |
73 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" | 76 """Parse FASTA file yielding tuples of (name, sequence).""" |
74 handle = open(filename) | 77 handle = open(filename) |
75 title, seq = "", "" | 78 title, seq = "", "" |
76 for line in handle: | 79 for line in handle: |
77 if line.startswith(">"): | 80 if line.startswith(">"): |
78 if title: | 81 if title: |
79 if truncate: | 82 if truncate: |
80 seq = seq[:truncate] | 83 seq = seq[:truncate] |
81 if max_len and len(seq) > max_len: | 84 if max_len and len(seq) > max_len: |
82 raise ValueError("Sequence %s is length %i, max length %i" | 85 raise ValueError( |
83 % (title.split()[0], len(seq), max_len)) | 86 "Sequence %s is length %i, max length %i" |
 | 87 % (title.split()[0], len(seq), max_len) |
 | 88 ) |
84 yield title, seq | 89 yield title, seq |
85 title = line[1:].rstrip() | 90 title = line[1:].rstrip() |
86 seq = "" | 91 seq = "" |
87 elif title: | 92 elif title: |
88 seq += line.strip() | 93 seq += line.strip() |
96 handle.close() | 101 handle.close() |
97 if title: | 102 if title: |
98 if truncate: | 103 if truncate: |
99 seq = seq[:truncate] | 104 seq = seq[:truncate] |
100 if max_len and len(seq) > max_len: | 105 if max_len and len(seq) > max_len: |
101 raise ValueError("Sequence %s is length %i, max length %i" | 106 raise ValueError( |
102 % (title.split()[0], len(seq), max_len)) | 107 "Sequence %s is length %i, max length %i" |
 | 108 % (title.split()[0], len(seq), max_len) |
 | 109 ) |
103 yield title, seq | 110 yield title, seq |
104 raise StopIteration | 111 raise StopIteration |
105 | 112 |
106 | 113 |
107 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): | 114 def split_fasta( |
 | 115 input_filename, |
 | 116 output_filename_base, |
 | 117 n=500, |
 | 118 truncate=None, |
 | 119 keep_descr=False, |
 | 120 max_len=None, |
 | 121 ): |
108 """Split FASTA file into sub-files each of at most n sequences. | 122 """Split FASTA file into sub-files each of at most n sequences. |
109 | 123 |
110 Returns a list of the filenames used (based on the input filename). | 124 Returns a list of the filenames used (based on the input filename). |
111 Each sequence can also be truncated (since we only need the start for | 125 Each sequence can also be truncated (since we only need the start for |
112 SignalP), and have its description discarded (since we don't usually | 126 SignalP), and have its description discarded (since we don't usually |
120 try: | 134 try: |
121 while True: | 135 while True: |
122 records = [] | 136 records = [] |
123 for i in range(n): | 137 for i in range(n): |
124 try: | 138 try: |
125 records.append(iterator.next()) | 139 records.append(next(iterator)) |
126 except StopIteration: | 140 except StopIteration: |
127 break | 141 break |
128 if not records: | 142 if not records: |
129 break | 143 break |
130 new_filename = "%s.%i.tmp" % (output_filename_base, len(files)) | 144 new_filename = "%s.%i.tmp" % (output_filename_base, len(files)) |
131 handle = open(new_filename, "w") | 145 handle = open(new_filename, "w") |
132 if keep_descr: | 146 if keep_descr: |
133 for title, seq in records: | 147 for title, seq in records: |
134 handle.write(">%s\n" % title) | 148 handle.write(">%s\n" % title) |
135 for i in range(0, len(seq), 60): | 149 for i in range(0, len(seq), 60): |
136 handle.write(seq[i:i + 60] + "\n") | 150 handle.write(seq[i : i + 60] + "\n") |
137 else: | 151 else: |
138 for title, seq in records: | 152 for title, seq in records: |
139 handle.write(">%s\n" % title.split()[0]) | 153 handle.write(">%s\n" % title.split()[0]) |
140 for i in range(0, len(seq), 60): | 154 for i in range(0, len(seq), 60): |
141 handle.write(seq[i:i + 60] + "\n") | 155 handle.write(seq[i : i + 60] + "\n") |
142 handle.close() | 156 handle.close() |
143 files.append(new_filename) | 157 files.append(new_filename) |
144 # print "%i records in %s" % (len(records), new_filename) | 158 # print "%i records in %s" % (len(records), new_filename) |
145 except ValueError as err: | 159 except ValueError as err: |
146 # Max length failure from parser - clean up | 160 # Max length failure from parser - clean up |
156 assert os.path.isfile(f), "Missing split file %r (!??)" % f | 170 assert os.path.isfile(f), "Missing split file %r (!??)" % f |
157 return files | 171 return files |
158 | 172 |
159 | 173 |
160 def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True): | 174 def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True): |
161 """Takes list of cmd strings, returns dict with error levels.""" | 175 """Take list of cmd strings, return dict with error levels.""" |
162 pending = jobs[:] | 176 pending = jobs[:] |
163 running = [] | 177 running = [] |
164 results = {} | 178 results = {} |
165 skipped = [] | 179 skipped = [] |
166 if threads == 1: | 180 if threads == 1: |
175 return_code = process.poll() # non-blocking | 189 return_code = process.poll() # non-blocking |
176 if return_code is not None: | 190 if return_code is not None: |
177 results[cmd] = return_code | 191 results[cmd] = return_code |
178 if return_code: | 192 if return_code: |
179 failed = True | 193 failed = True |
180 running = [(cmd, process) for (cmd, process) in running | 194 running = [(cmd, process) for (cmd, process) in running if cmd not in results] |
181 if cmd not in results] | |
182 if verbose: | 195 if verbose: |
183 print("%i jobs pending, %i running, %i completed" % | 196 print( |
184 (len(pending), len(running), len(results))) | 197 "%i jobs pending, %i running, %i completed" |
 | 198 % (len(pending), len(running), len(results)) |
 | 199 ) |
185 # See if we can start any new threads | 200 # See if we can start any new threads |
186 if pending and failed and fast_fail: | 201 if pending and failed and fast_fail: |
187 # Don't start any more jobs | 202 # Don't start any more jobs |
188 if verbose: | 203 if verbose: |
189 print("Failed, will not start remaining %i jobs" % len(pending)) | 204 print("Failed, will not start remaining %i jobs" % len(pending)) |