diff tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.
author peterjc
date Mon, 30 Jul 2012 10:25:07 -0400
parents 0f1c61998b22
children e52220a9ddad
line wrap: on
line diff
--- a/tools/protein_analysis/signalp3.py	Tue Jun 07 18:07:09 2011 -0400
+++ b/tools/protein_analysis/signalp3.py	Mon Jul 30 10:25:07 2012 -0400
@@ -4,10 +4,14 @@
 This script takes exactly five command line arguments:
  * the organism type (euk, gram+ or gram-)
  * length to truncate sequences to (integer)
- * number of threads to use (integer)
+ * number of threads to use (integer, defaults to one)
  * an input protein FASTA filename
  * output tabular filename.
 
+There are two further optional arguments
+ * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax)
+ * output GFF3 filename
+
 It then calls the standalone SignalP v3.0 program (not the webservice)
 requesting the short output (one line per protein) using both NN and HMM
 for predictions.
@@ -41,16 +45,27 @@
 run multiple copies of TMHMM in parallel. I would normally use Python's
 multiprocessing library in this situation but it requires at least Python 2.6
 and at the time of writing Galaxy still supports Python 2.4.
+
+Note that this is somewhat redundant with job-splitting available in Galaxy
+itself (see the SignalP XML file for settings).
+
+Finally, you can opt to have a GFF3 file produced which will describe the
+predicted signal peptide and mature peptide for each protein (using one of
+the predictors which gives a cleavage site). *WORK IN PROGRESS*
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator
 
 FASTA_CHUNK = 500
 MAX_LEN = 6000 #Found by trial and error
 
-if len(sys.argv) != 6:
-   stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file")
+if len(sys.argv) not in  [6,8]:
+   stop_err("Require five (or 7) arguments, organism, truncate, threads, "
+            "input protein FASTA file & output tabular file (plus "
+            "optionally cut method and GFF3 output file). "
+            "Got %i arguments." % (len(sys.argv)-1))
 
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
@@ -66,7 +81,7 @@
 try:
    num_threads = int(sys.argv[3])
 except:
-   num_threads = 0
+   num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
 if num_threads < 1:
    stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
 
@@ -74,8 +89,27 @@
 
 tabular_file = sys.argv[5]
 
-def clean_tabular(raw_handle, out_handle):
+if len(sys.argv) == 8:
+   cut_method = sys.argv[6]
+   if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
+      stop_err("Invalid cut method %r" % cut_method)
+   gff3_file = sys.argv[7]
+else:
+   cut_method = None
+   gff3_file = None
+
+
+tmp_dir = tempfile.mkdtemp()
+
+def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
     """Clean up SignalP output to make it tabular."""
+    if cut_method:
+       cut_col = {"NN_Cmax" : 2,
+                  "NN_Ymax" : 5,
+                  "NN_Smax" : 8,
+                  "HMM_Cmax" : 16}[cut_method]
+    else:
+       cut_col = None
     for line in raw_handle:
         if not line or line.startswith("#"):
             continue
@@ -87,7 +121,59 @@
         parts = parts[14:15] + parts[1:14] + parts[15:]
         out_handle.write("\t".join(parts) + "\n")
 
-fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
+def make_gff(fasta_file, tabular_file, gff_file, cut_method):
+    cut_col, score_col = {"NN_Cmax" : (2,1),
+                          "NN_Ymax" : (5,4),
+                          "NN_Smax" : (8,7),
+                          "HMM_Cmax" : (16,15),
+                          }[cut_method]
+
+    source = "SignalP"
+    strand = "." #not stranded
+    phase = "." #not phased
+    tags = "Note=%s" % cut_method
+    
+    tab_handle = open(tabular_file)
+    line = tab_handle.readline()
+    assert line.startswith("#ID\t"), line
+
+    gff_handle = open(gff_file, "w")
+    gff_handle.write("##gff-version 3\n")
+
+    for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
+        parts = line.rstrip("\n").split("\t")
+        seqid = parts[0]
+        assert title.startswith(seqid), "%s vs %s" % (seqid, title)
+        if len(seq)==0:
+            #Is it possible to have a zero length reference in GFF3?
+            continue
+        cut = int(parts[cut_col])
+        if cut == 0:
+            assert cut_method == "HMM_Cmax", cut_method
+            #TODO - Why does it do this?
+            cut = 1
+        assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
+        score = parts[score_col]
+        gff_handle.write("##sequence-region %s %i %i\n" \
+                          % (seqid, 1, len(seq)))
+        #If the cut is at the very begining, there is no signal peptide!
+        if cut > 1:
+            #signal_peptide = SO:0000418
+            gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+                             % (seqid, source,
+                                "signal_peptide", 1, cut-1,
+                                score, strand, phase, tags))
+        #mature_protein_region = SO:0000419
+        gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+                         % (seqid, source,
+                            "mature_protein_region", cut, len(seq),
+                            score, strand, phase, tags))
+        tab_handle.close()
+    gff_handle.close()
+
+
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
+                          n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
 temp_files = [f+".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
@@ -98,6 +184,10 @@
     for f in file_list:
         if os.path.isfile(f):
             os.remove(f)
+    try:
+        os.rmdir(tmp_dir)
+    except:
+        pass
 
 if len(jobs) > 1 and num_threads > 1:
     #A small "info" message for Galaxy to show the user.
@@ -109,10 +199,9 @@
     try:
         output = open(temp).readline()
     except IOError:
-        output = ""
+        output = "(no output)"
     if error_level or output.lower().startswith("error running"):
-        clean_up(fasta_files)
-        clean_up(temp_files)
+        clean_up(fasta_files + temp_files)
         stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
                  error_level)
 del results
@@ -133,5 +222,8 @@
     data_handle.close()
 out_handle.close()
 
-clean_up(fasta_files)
-clean_up(temp_files)
+#GFF3:
+if cut_method:
+   make_gff(fasta_file, tabular_file, gff3_file, cut_method)
+
+clean_up(fasta_files + temp_files)