diff tools/protein_analysis/seq_analysis_utils.py @ 20:a19b3ded8f33 draft

v0.2.11 Job splitting fast-fail; RXLR tools supports HMMER2 from BioConda; Capture more version information; misc internal changes
author peterjc
date Thu, 21 Sep 2017 11:35:20 -0400
parents f3ecd80850e2
children 238eae32483c
line wrap: on
line diff
--- a/tools/protein_analysis/seq_analysis_utils.py	Wed Feb 01 09:46:42 2017 -0500
+++ b/tools/protein_analysis/seq_analysis_utils.py	Thu Sep 21 11:35:20 2017 -0400
@@ -7,9 +7,13 @@
 multiprocessing so the function run_jobs instead is a simple pool approach
 using just the subprocess library.
 """
-import sys
+
+from __future__ import print_function
+
 import os
 import subprocess
+import sys
+
 from time import sleep
 
 __version__ = "0.0.2"
@@ -47,6 +51,7 @@
 
 
 def thread_count(command_line_arg, default=1):
+    """Determine number of threads to use from the command line args."""
     try:
         num = int(command_line_arg)
     except ValueError:
@@ -137,7 +142,7 @@
             handle.close()
             files.append(new_filename)
             # print "%i records in %s" % (len(records), new_filename)
-    except ValueError, err:
+    except ValueError as err:
         # Max length failure from parser - clean up
         try:
             handle.close()
@@ -152,37 +157,47 @@
     return files
 
 
-def run_jobs(jobs, threads, pause=10, verbose=False):
+def run_jobs(jobs, threads, pause=10, verbose=False, fast_fail=True):
     """Takes list of cmd strings, returns dict with error levels."""
     pending = jobs[:]
     running = []
     results = {}
+    skipped = []
     if threads == 1:
         # Special case this for speed, don't need the waits
         for cmd in jobs:
             results[cmd] = subprocess.call(cmd, shell=True)
         return results
+    failed = False
     while pending or running:
         # See if any have finished
         for (cmd, process) in running:
             return_code = process.poll()  # non-blocking
             if return_code is not None:
                 results[cmd] = return_code
+                if return_code:
+                    failed = True
         running = [(cmd, process) for (cmd, process) in running
                    if cmd not in results]
         if verbose:
-            print "%i jobs pending, %i running, %i completed" \
-                  % (len(pending), len(running), len(results))
+            print("%i jobs pending, %i running, %i completed" %
+                  (len(pending), len(running), len(results)))
         # See if we can start any new threads
+        if pending and failed and fast_fail:
+            # Don't start any more jobs
+            if verbose:
+                print("Failed, will not start remaining %i jobs" % len(pending))
+            skipped = pending
+            pending = []
         while pending and len(running) < threads:
             cmd = pending.pop(0)
             if verbose:
-                print cmd
+                print(cmd)
             process = subprocess.Popen(cmd, shell=True)
             running.append((cmd, process))
         # Loop...
         sleep(pause)
     if verbose:
-        print "%i jobs completed" % len(results)
-    assert set(jobs) == set(results)
+        print("%i jobs completed" % len(results))
+    assert set(jobs) == set(results).union(skipped)
     return results