diff tools/protein_analysis/promoter2.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (a DNA tool) and enables use of Galaxy's <parallelism> tag for the SignalP, TMHMM and Promoter wrappers.
author peterjc
date Mon, 30 Jul 2012 10:25:07 -0400
parents
children 976a5f2833cd
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/protein_analysis/promoter2.py	Mon Jul 30 10:25:07 2012 -0400
@@ -0,0 +1,163 @@
+#!/usr/bin/env python
+"""Wrapper for Promoter 2.0 for use in Galaxy.
+
+This script takes exactly three command line arguments:
+ * number of threads
+ * an input DNA FASTA filename
+ * output tabular filename.
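+
+For example, to use eight threads (the file names here are purely illustrative):
+
+  promoter2.py 8 sequences.fasta predictions.tabular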
+
+It calls the Promoter 2.0 binary directly (e.g. .../promoter-2.0/bin/promoter_Linux),
+bypassing the Perl wrapper script 'promoter', which imposes a significant
+performance overhead for no benefit here (we don't need the HTML output,
+for example).
+
+The main feature is that this Python wrapper script parses the bespoke
+tabular output from Promoter 2.0 and reformats it into a Galaxy friendly
+tab separated table.
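+
+For example, the reformatted (tab separated) output looks like this, where the
+identifier and numbers are illustrative and the likelihood text is one of
+Promoter 2.0's own categories:
+
+  #Identifier    Position    Score    Likelihood
+  seq1           1234        1.051    Highly likely prediction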
+
+Additionally, in order to take advantage of multiple cores, the input FASTA
+file is broken into chunks and multiple copies of the promoter binary are
+run at once. This can be used in combination with the job-splitting
+available in Galaxy.
+
+Note that rewriting the FASTA input file allows us to avoid a bug in
+Promoter 2.0 with long descriptions in the FASTA header line (over 200
+characters), which produces stray fragments of the description in the
+output file, making parsing non-trivial.
+
+TODO - Automatically extract the sequence containing a promoter prediction?
+"""
+import sys
+import os
+import commands
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs
+
+FASTA_CHUNK = 500
+
+if len(sys.argv) != 4:
+    stop_err("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
+             "Got %i arguments." % (len(sys.argv)-1))
+try:
+    num_threads = int(sys.argv[1])
+except ValueError:
+    num_threads = 1 #Default, e.g. "$NSLOTS" was passed but the environment variable was not defined
+if num_threads < 1:
+    stop_err("Threads argument %s is not a positive integer" % sys.argv[1])
+
+fasta_file = os.path.abspath(sys.argv[2])
+tabular_file = os.path.abspath(sys.argv[3])
+
+tmp_dir = tempfile.mkdtemp()
+
+def get_path_and_binary():
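+    #The 'promoter' wrapper script found on the PATH is expected to contain a
+    #line such as "setenv PROM /opt/promoter-2.0" (path illustrative), which is
+    #parsed below to locate the installation directory and hence the
+    #per-platform binary, e.g. bin/promoter_Linux.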
+    platform = commands.getoutput("uname") #e.g. Linux
+    shell_script = commands.getoutput("which promoter")
+    if not os.path.isfile(shell_script):
+        stop_err("ERROR: Missing promoter executable shell script")
+    path = None
+    for line in open(shell_script):
+        if line.startswith("setenv"): #could then be tab or space!
+            parts = line.rstrip().split(None, 2)
+            if parts[0] == "setenv" and parts[1] == "PROM":
+                path = parts[2]
+    if not path:
+        stop_err("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
+    if not os.path.isdir(path):
+        stop_err("ERROR: %r is not a directory" % path)
+    bin = "%s/bin/promoter_%s" % (path, platform)
+    if not os.path.isfile(bin):
+        stop_err("ERROR: Missing promoter binary %r" % bin)
+    return path, bin
+
+def make_tabular(raw_handle, out_handle):
+    """Parse text output into tabular, return query count."""
+    identifier = None
+    queries = 0
+    #out_handle.write("#Identifier\tPosition\tScore\tLikelihood\n")
+    for line in raw_handle:
+        #print repr(line)
+        if not line.strip() or line == "Promoter prediction:\n":
+            pass
+        elif line[0] != " ":
+            identifier = line.strip().replace("\t", " ").split(None,1)[0]
+            queries += 1
+        elif line == "  No promoter predicted\n":
+            #End of a record
+            identifier = None
+        elif line == "  Position  Score  Likelihood\n":
+            assert identifier
+        else:
+            try:
+                position, score, likelihood = line.strip().split(None,2)
+            except ValueError:
+                print "WARNING: Problem with line: %r" % line
+                continue
+                #stop_err("ERROR: Problem with line: %r" % line)
+            if likelihood not in ["ignored",
+                                  "Marginal prediction",
+                                  "Medium likely prediction",
+                                  "Highly likely prediction"]:
+                stop_err("ERROR: Problem with line: %r" % line)
+            out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
+    #out.close()
+    return queries
+
+working_dir, bin = get_path_and_binary()
+
+if not os.path.isfile(fasta_file):
+   stop_err("ERROR: Missing input FASTA file %r" % fasta_file)
+
+#Note that if the input FASTA file contains no sequences,
+#split_fasta returns an empty list (i.e. zero temp files).
+#We deliberately omit the FASTA descriptions to avoid a
+#bug in promoter2 with descriptions over 200 characters.
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False)
+temp_files = [f+".out" for f in fasta_files]
+jobs = ["%s %s > %s" % (bin, fasta, temp)
+        for fasta, temp in zip(fasta_files, temp_files)]
+
+def clean_up(file_list):
+    for f in file_list:
+        if os.path.isfile(f):
+            os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
+
+if len(jobs) > 1 and num_threads > 1:
+    #A small "info" message for Galaxy to show the user.
+    print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
+cur_dir = os.path.abspath(os.curdir)
+os.chdir(working_dir)
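+#run_jobs returns a dictionary mapping each command line string to its return
+#code, which is checked below to detect failed tasks.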
+results = run_jobs(jobs, num_threads)
+os.chdir(cur_dir)
+for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
+    error_level = results[cmd]
+    if error_level:
+        try:
+            output = open(temp).readline()
+        except IOError:
+            output = ""
+        clean_up(fasta_files + temp_files)
+        stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
+                 error_level)
+
+del results
+del jobs
+
+out_handle = open(tabular_file, "w")
+out_handle.write("#Identifier\tDescription\tPosition\tScore\tLikelihood\n")
+queries = 0
+for temp in temp_files:
+    data_handle = open(temp)
+    count = make_tabular(data_handle, out_handle)
+    data_handle.close()
+    if not count:
+        clean_up(fasta_files + temp_files)
+        stop_err("No output from promoter2")
+    queries += count
+out_handle.close()
+
+clean_up(fasta_files + temp_files)
+print "Results for %i queries" % queries