tmhmm_and_signalp: tools/protein_analysis/seq_analysis

comparison tools/protein_analysis/seq_analysis_utils.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"

author	peterjc
date	Thu, 17 Jun 2021 08:21:06 +0000
parents	a19b3ded8f33
children	e1996f0f4e85

comparison

equal deleted inserted replaced

-:a19b3ded8f33
+:238eae32483c
 import subprocess
 import sys
 from time import sleep
-__version__ = "0.0.2"
+if sys.version_info[0] < 3:
+range = xrange  # noqa: F821
+__version__ = "0.0.4"
 try:
 from multiprocessing import cpu_count
 except ImportError:
 # Must be under Python 2.5, this is copied from multiprocessing:
 def cpu_count():
-"""Returns the number of CPUs in the system."""
+"""Return the number of CPUs in the system."""
-if sys.platform == 'win32':
+if sys.platform == "win32":
 try:
-num = int(os.environ['NUMBER_OF_PROCESSORS'])
+num = int(os.environ["NUMBER_OF_PROCESSORS"])
 except (ValueError, KeyError):
 num = 0
-elif 'bsd' in sys.platform or sys.platform == 'darwin':
+elif "bsd" in sys.platform or sys.platform == "darwin":
-comm = '/sbin/sysctl -n hw.ncpu'
+comm = "/sbin/sysctl -n hw.ncpu"
-if sys.platform == 'darwin':
+if sys.platform == "darwin":
-comm = '/usr' + comm
+comm = "/usr" + comm
 try:
 with os.popen(comm) as p:
 num = int(p.read())
 except ValueError:
 num = 0
 else:
 try:
-num = os.sysconf('SC_NPROCESSORS_ONLN')
+num = os.sysconf("SC_NPROCESSORS_ONLN")
 except (ValueError, OSError, AttributeError):
 num = 0
 if num >= 1:
 return num
 else:
-raise NotImplementedError('cannot determine number of cpus')
+raise NotImplementedError("cannot determine number of cpus")
 def thread_count(command_line_arg, default=1):
 """Determine number of threads to use from the command line args."""
 try:
 # sys.stderr.write("Using %i cores on %s\n" % (num, hostname))
 return num
 def fasta_iterator(filename, max_len=None, truncate=None):
-"""Simple FASTA parser yielding tuples of (title, sequence) strings."""
+"""Parse FASTA file yielding tuples of (name, sequence)."""
 handle = open(filename)
 title, seq = "", ""
 for line in handle:
 if line.startswith(">"):
 if title:
 if truncate:
 seq = seq[:truncate]
 if max_len and len(seq) > max_len:
-raise ValueError("Sequence %s is length %i, max length %i"
+raise ValueError(
-% (title.split()[0], len(seq), max_len))
+"Sequence %s is length %i, max length %i"
+% (title.split()[0], len(seq), max_len)
+)
 yield title, seq
 title = line[1:].rstrip()
 seq = ""
 elif title:
 seq += line.strip()
 handle.close()
 if title:
 if truncate:
 seq = seq[:truncate]
 if max_len and len(seq) > max_len:
-raise ValueError("Sequence %s is length %i, max length %i"
+raise ValueError(
-% (title.split()[0], len(seq), max_len))
+"Sequence %s is length %i, max length %i"
+% (title.split()[0], len(seq), max_len)
+)
 yield title, seq
 raise StopIteration
-def split_fasta(input_filename, output_filename_base, n=500, truncate=None, keep_descr=False, max_len=None):
+def split_fasta(
+input_filename,
+output_filename_base,
+n=500,
+truncate=None,
+keep_descr=False,
+max_len=None,
+):
 """Split FASTA file into sub-files each of at most n sequences.
 Returns a list of the filenames used (based on the input filename).
 Each sequence can also be truncated (since we only need the start for
 SignalP), and have its description discarded (since we don't usually
 try:
 while True:
 records = []
 for i in range(n):
 try:
-records.append(iterator.next())
+records.append(next(iterator))
 except StopIteration:
 break
 if not records:
 break
 new_filename = "%s.%i.tmp" % (output_filename_base, len(files))
 handle = open(new_filename, "w")
 if keep_descr:
 for title, seq in records:
 handle.write(">%s\n" % title)
 for i in range(0, len(seq), 60):
-handle.write(seq[i:i + 60] + "\n")
+handle.write(seq[i : i + 60] + "\n")
 else:
 for title, seq in records:
 handle.write(">%s\n" % title.split()[0])
 for i in range(0, len(seq), 60):
-handle.write(seq[i:i + 60] + "\n")
+handle.write(seq[i : i + 60] + "\n")
 handle.close()
 files.append(new_filename)
 # print "%i records in %s" % (len(records), new_filename)
 except ValueError as err:
 # Max length failure from parser - clean up
 assert os.path.isfile(f), "Missing split file %r (!??)" % f
 return files
 def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True):
-"""Takes list of cmd strings, returns dict with error levels."""
+"""Take list of cmd strings, return dict with error levels."""
 pending = jobs[:]
 running = []
 results = {}
 skipped = []
 if threads == 1:
 return_code = process.poll()  # non-blocking
 if return_code is not None:
 results[cmd] = return_code
 if return_code:
 failed = True
-running = [(cmd, process) for (cmd, process) in running
+running = [(cmd, process) for (cmd, process) in running if cmd not in results]
-if cmd not in results]
 if verbose:
-print("%i jobs pending, %i running, %i completed" %
+print(
-(len(pending), len(running), len(results)))
+"%i jobs pending, %i running, %i completed"
+% (len(pending), len(running), len(results))
+)
 # See if we can start any new threads
 if pending and failed and fast_fail:
 # Don't start any more jobs
 if verbose:
 print("Failed, will not start remaining %i jobs" % len(pending))

Mercurial > repos > peterjc > tmhmm_and_signalp

comparison tools/protein_analysis/seq_analysis_utils.py @ 21:238eae32483c draft