tmhmm_and_signalp: tools/protein_analysis/signalp3.py comparison

comparison tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft

Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM X Promoter wrappers.

author	peterjc
date	Mon, 30 Jul 2012 10:25:07 -0400
parents	0f1c61998b22
children	e52220a9ddad

comparison

equal deleted inserted replaced

-:a290c6d4e658
+:9b45a8743100
 """Wrapper for SignalP v3.0 for use in Galaxy.
 This script takes exactly five command line arguments:
 * the organism type (euk, gram+ or gram-)
 * length to truncate sequences to (integer)
-* number of threads to use (integer)
+* number of threads to use (integer, defaults to one)
 * an input protein FASTA filename
 * output tabular filename.
+There are two further optional arguments
+* cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax)
+* output GFF3 filename
 It then calls the standalone SignalP v3.0 program (not the webservice)
 requesting the short output (one line per protein) using both NN and HMM
 for predictions.
 The third major feature is taking advantage of multiple cores (since SignalP
 v3.0 itself is single threaded) by using the individual FASTA input files to
 run multiple copies of TMHMM in parallel. I would normally use Python's
 multiprocessing library in this situation but it requires at least Python 2.6
 and at the time of writing Galaxy still supports Python 2.4.
+Note that this is somewhat redundant with job-splitting available in Galaxy
+itself (see the SignalP XML file for settings).
+Finally, you can opt to have a GFF3 file produced which will describe the
+predicted signal peptide and mature peptide for each protein (using one of
+the predictors which gives a cleavage site). *WORK IN PROGRESS*
 """
 import sys
 import os
-from seq_analysis_utils import stop_err, split_fasta, run_jobs
+import tempfile
+from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator
 FASTA_CHUNK = 500
 MAX_LEN = 6000 #Found by trial and error
-if len(sys.argv) != 6:
+if len(sys.argv) not in  [6,8]:
-stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file")
+stop_err("Require five (or 7) arguments, organism, truncate, threads, "
+"input protein FASTA file & output tabular file (plus "
+"optionally cut method and GFF3 output file). "
+"Got %i arguments." % (len(sys.argv)-1))
 organism = sys.argv[1]
 if organism not in ["euk", "gram+", "gram-"]:
 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism)
 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2])
 try:
 num_threads = int(sys.argv[3])
 except:
-num_threads = 0
+num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined
 if num_threads < 1:
 stop_err("Threads argument %s is not a positive integer" % sys.argv[3])
 fasta_file = sys.argv[4]
 tabular_file = sys.argv[5]
-def clean_tabular(raw_handle, out_handle):
+if len(sys.argv) == 8:
+cut_method = sys.argv[6]
+if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]:
+stop_err("Invalid cut method %r" % cut_method)
+gff3_file = sys.argv[7]
+else:
+cut_method = None
+gff3_file = None
+tmp_dir = tempfile.mkdtemp()
+def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None):
 """Clean up SignalP output to make it tabular."""
+if cut_method:
+cut_col = {"NN_Cmax" : 2,
+"NN_Ymax" : 5,
+"NN_Smax" : 8,
+"HMM_Cmax" : 16}[cut_method]
+else:
+cut_col = None
 for line in raw_handle:
 if not line or line.startswith("#"):
 continue
 parts = line.rstrip("\r\n").split()
 assert len(parts)==21, repr(line)
 #Remove redundant truncated name column (col 0)
 #and put full name at start (col 14)
 parts = parts[14:15] + parts[1:14] + parts[15:]
 out_handle.write("\t".join(parts) + "\n")
-fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
+def make_gff(fasta_file, tabular_file, gff_file, cut_method):
+cut_col, score_col = {"NN_Cmax" : (2,1),
+"NN_Ymax" : (5,4),
+"NN_Smax" : (8,7),
+"HMM_Cmax" : (16,15),
+}[cut_method]
+source = "SignalP"
+strand = "." #not stranded
+phase = "." #not phased
+tags = "Note=%s" % cut_method
+tab_handle = open(tabular_file)
+line = tab_handle.readline()
+assert line.startswith("#ID\t"), line
+gff_handle = open(gff_file, "w")
+gff_handle.write("##gff-version 3\n")
+for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle):
+parts = line.rstrip("\n").split("\t")
+seqid = parts[0]
+assert title.startswith(seqid), "%s vs %s" % (seqid, title)
+if len(seq)==0:
+#Is it possible to have a zero length reference in GFF3?
+continue
+cut = int(parts[cut_col])
+if cut == 0:
+assert cut_method == "HMM_Cmax", cut_method
+#TODO - Why does it do this?
+cut = 1
+assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq))
+score = parts[score_col]
+gff_handle.write("##sequence-region %s %i %i\n" \
+% (seqid, 1, len(seq)))
+#If the cut is at the very begining, there is no signal peptide!
+if cut > 1:
+#signal_peptide = SO:0000418
+gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+% (seqid, source,
+"signal_peptide", 1, cut-1,
+score, strand, phase, tags))
+#mature_protein_region = SO:0000419
+gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \
+% (seqid, source,
+"mature_protein_region", cut, len(seq),
+score, strand, phase, tags))
+tab_handle.close()
+gff_handle.close()
+fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"),
+n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN)
 temp_files = [f+".out" for f in fasta_files]
 assert len(fasta_files) == len(temp_files)
 jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp)
 for (fasta, temp) in zip(fasta_files, temp_files)]
 assert len(fasta_files) == len(temp_files) == len(jobs)
 def clean_up(file_list):
 for f in file_list:
 if os.path.isfile(f):
 os.remove(f)
+try:
+os.rmdir(tmp_dir)
+except:
+pass
 if len(jobs) > 1 and num_threads > 1:
 #A small "info" message for Galaxy to show the user.
 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
 results = run_jobs(jobs, num_threads)
 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
 error_level = results[cmd]
 try:
 output = open(temp).readline()
 except IOError:
-output = ""
+output = "(no output)"
 if error_level or output.lower().startswith("error running"):
-clean_up(fasta_files)
+clean_up(fasta_files + temp_files)
-clean_up(temp_files)
 stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
 error_level)
 del results
 out_handle = open(tabular_file, "w")
 data_handle = open(temp)
 clean_tabular(data_handle, out_handle)
 data_handle.close()
 out_handle.close()
-clean_up(fasta_files)
+#GFF3:
-clean_up(temp_files)
+if cut_method:
+make_gff(fasta_file, tabular_file, gff3_file, cut_method)
+clean_up(fasta_files + temp_files)

Mercurial > repos > peterjc > tmhmm_and_signalp

comparison tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft