comparison tools/protein_analysis/seq_analysis_utils.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children e1996f0f4e85
comparison
equal deleted inserted replaced
20:a19b3ded8f33 21:238eae32483c
14 import subprocess 14 import subprocess
15 import sys 15 import sys
16 16
17 from time import sleep 17 from time import sleep
18 18
19 __version__ = "0.0.2" 19 if sys.version_info[0] < 3:
20 range = xrange # noqa: F821
21
22 __version__ = "0.0.4"
20 23
21 try: 24 try:
22 from multiprocessing import cpu_count 25 from multiprocessing import cpu_count
23 except ImportError: 26 except ImportError:
24 # Must be under Python 2.5, this is copied from multiprocessing: 27 # Must be under Python 2.5, this is copied from multiprocessing:
25 def cpu_count(): 28 def cpu_count():
26 """Returns the number of CPUs in the system.""" 29 """Return the number of CPUs in the system."""
27 if sys.platform == 'win32': 30 if sys.platform == "win32":
28 try: 31 try:
29 num = int(os.environ['NUMBER_OF_PROCESSORS']) 32 num = int(os.environ["NUMBER_OF_PROCESSORS"])
30 except (ValueError, KeyError): 33 except (ValueError, KeyError):
31 num = 0 34 num = 0
32 elif 'bsd' in sys.platform or sys.platform == 'darwin': 35 elif "bsd" in sys.platform or sys.platform == "darwin":
33 comm = '/sbin/sysctl -n hw.ncpu' 36 comm = "/sbin/sysctl -n hw.ncpu"
34 if sys.platform == 'darwin': 37 if sys.platform == "darwin":
35 comm = '/usr' + comm 38 comm = "/usr" + comm
36 try: 39 try:
37 with os.popen(comm) as p: 40 with os.popen(comm) as p:
38 num = int(p.read()) 41 num = int(p.read())
39 except ValueError: 42 except ValueError:
40 num = 0 43 num = 0
41 else: 44 else:
42 try: 45 try:
43 num = os.sysconf('SC_NPROCESSORS_ONLN') 46 num = os.sysconf("SC_NPROCESSORS_ONLN")
44 except (ValueError, OSError, AttributeError): 47 except (ValueError, OSError, AttributeError):
45 num = 0 48 num = 0
46 49
47 if num >= 1: 50 if num >= 1:
48 return num 51 return num
49 else: 52 else:
50 raise NotImplementedError('cannot determine number of cpus') 53 raise NotImplementedError("cannot determine number of cpus")
51 54
52 55
53 def thread_count(command_line_arg, default=1): 56 def thread_count(command_line_arg, default=1):
54 """Determine number of threads to use from the command line args.""" 57 """Determine number of threads to use from the command line args."""
55 try: 58 try:
68 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname)) 71 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname))
69 return num 72 return num
70 73
71 74
72 def fasta_iterator(filename, max_len=None, truncate=None): 75 def fasta_iterator(filename, max_len=None, truncate=None):
73 """Simple FASTA parser yielding tuples of (title, sequence) strings.""" 76 """Parse FASTA file yielding tuples of (name, sequence)."""
74 handle = open(filename) 77 handle = open(filename)
75 title, seq = "", "" 78 title, seq = "", ""
76 for line in handle: 79 for line in handle:
77 if line.startswith(">"): 80 if line.startswith(">"):
78 if title: 81 if title:
79 if truncate: 82 if truncate:
80 seq = seq[:truncate] 83 seq = seq[:truncate]
81 if max_len and len(seq) > max_len: 84 if max_len and len(seq) > max_len:
82 raise ValueError("Sequence %s is length %i, max length %i" 85 raise ValueError(
83 % (title.split()[0], len(seq), max_len)) 86 "Sequence %s is length %i, max length %i"
87 % (title.split()[0], len(seq), max_len)
88 )
84 yield title, seq 89 yield title, seq
85 title = line[1:].rstrip() 90 title = line[1:].rstrip()
86 seq = "" 91 seq = ""
87 elif title: 92 elif title:
88 seq += line.strip() 93 seq += line.strip()
96 handle.close() 101 handle.close()
97 if title: 102 if title:
98 if truncate: 103 if truncate:
99 seq = seq[:truncate] 104 seq = seq[:truncate]
100 if max_len and len(seq) > max_len: 105 if max_len and len(seq) > max_len:
101 raise ValueError("Sequence %s is length %i, max length %i" 106 raise ValueError(
102 % (title.split()[0], len(seq), max_len)) 107 "Sequence %s is length %i, max length %i"
108 % (title.split()[0], len(seq), max_len)
109 )
103 yield title, seq 110 yield title, seq
104 raise StopIteration 111 raise StopIteration
105 112
106 113
107 def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None): 114 def split_fasta(
115 input_filename,
116 output_filename_base,
117 n=500,
118 truncate=None,
119 keep_descr=False,
120 max_len=None,
121 ):
108 """Split FASTA file into sub-files each of at most n sequences. 122 """Split FASTA file into sub-files each of at most n sequences.
109 123
110 Returns a list of the filenames used (based on the input filename). 124 Returns a list of the filenames used (based on the input filename).
111 Each sequence can also be truncated (since we only need the start for 125 Each sequence can also be truncated (since we only need the start for
112 SignalP), and have its description discarded (since we don't usually 126 SignalP), and have its description discarded (since we don't usually
120 try: 134 try:
121 while True: 135 while True:
122 records = [] 136 records = []
123 for i in range(n): 137 for i in range(n):
124 try: 138 try:
125 records.append(iterator.next()) 139 records.append(next(iterator))
126 except StopIteration: 140 except StopIteration:
127 break 141 break
128 if not records: 142 if not records:
129 break 143 break
130 new_filename = "%s.%i.tmp" % (output_filename_base, len(files)) 144 new_filename = "%s.%i.tmp" % (output_filename_base, len(files))
131 handle = open(new_filename, "w") 145 handle = open(new_filename, "w")
132 if keep_descr: 146 if keep_descr:
133 for title, seq in records: 147 for title, seq in records:
134 handle.write(">%s\n" % title) 148 handle.write(">%s\n" % title)
135 for i in range(0, len(seq), 60): 149 for i in range(0, len(seq), 60):
136 handle.write(seq[i:i + 60] + "\n") 150 handle.write(seq[i : i + 60] + "\n")
137 else: 151 else:
138 for title, seq in records: 152 for title, seq in records:
139 handle.write(">%s\n" % title.split()[0]) 153 handle.write(">%s\n" % title.split()[0])
140 for i in range(0, len(seq), 60): 154 for i in range(0, len(seq), 60):
141 handle.write(seq[i:i + 60] + "\n") 155 handle.write(seq[i : i + 60] + "\n")
142 handle.close() 156 handle.close()
143 files.append(new_filename) 157 files.append(new_filename)
144 # print "%i records in %s" % (len(records), new_filename) 158 # print "%i records in %s" % (len(records), new_filename)
145 except ValueError as err: 159 except ValueError as err:
146 # Max length failure from parser - clean up 160 # Max length failure from parser - clean up
156 assert os.path.isfile(f), "Missing split file %r (!??)" % f 170 assert os.path.isfile(f), "Missing split file %r (!??)" % f
157 return files 171 return files
158 172
159 173
160 def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True): 174 def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True):
161 """Takes list of cmd strings, returns dict with error levels.""" 175 """Take list of cmd strings, return dict with error levels."""
162 pending = jobs[:] 176 pending = jobs[:]
163 running = [] 177 running = []
164 results = {} 178 results = {}
165 skipped = [] 179 skipped = []
166 if threads == 1: 180 if threads == 1:
175 return_code = process.poll() # non-blocking 189 return_code = process.poll() # non-blocking
176 if return_code is not None: 190 if return_code is not None:
177 results[cmd] = return_code 191 results[cmd] = return_code
178 if return_code: 192 if return_code:
179 failed = True 193 failed = True
180 running = [(cmd, process) for (cmd, process) in running 194 running = [(cmd, process) for (cmd, process) in running if cmd not in results]
181 if cmd not in results]
182 if verbose: 195 if verbose:
183 print("%i jobs pending, %i running, %i completed" % 196 print(
184 (len(pending), len(running), len(results))) 197 "%i jobs pending, %i running, %i completed"
198 % (len(pending), len(running), len(results))
199 )
185 # See if we can start any new threads 200 # See if we can start any new threads
186 if pending and failed and fast_fail: 201 if pending and failed and fast_fail:
187 # Don't start any more jobs 202 # Don't start any more jobs
188 if verbose: 203 if verbose:
189 print("Failed, will not start remaining %i jobs" % len(pending)) 204 print("Failed, will not start remaining %i jobs" % len(pending))