Mercurial > repos > peterjc > tmhmm_and_signalp
diff tools/protein_analysis/signalp3.py @ 7:9b45a8743100 draft
Uploaded v0.1.0, which adds a wrapper for Promoter 2.0 (DNA tool) and enables use of Galaxy's <parallelism> tag for SignalP, TMHMM and Promoter wrappers.
author | peterjc |
---|---|
date | Mon, 30 Jul 2012 10:25:07 -0400 |
parents | 0f1c61998b22 |
children | e52220a9ddad |
line wrap: on
line diff
--- a/tools/protein_analysis/signalp3.py Tue Jun 07 18:07:09 2011 -0400 +++ b/tools/protein_analysis/signalp3.py Mon Jul 30 10:25:07 2012 -0400 @@ -4,10 +4,14 @@ This script takes exactly five command line arguments: * the organism type (euk, gram+ or gram-) * length to truncate sequences to (integer) - * number of threads to use (integer) + * number of threads to use (integer, defaults to one) * an input protein FASTA filename * output tabular filename. +There are two further optional arguments + * cut type (NN_Cmax, NN_Ymax, NN_Smax or HMM_Cmax) + * output GFF3 filename + It then calls the standalone SignalP v3.0 program (not the webservice) requesting the short output (one line per protein) using both NN and HMM for predictions. @@ -41,16 +45,27 @@ run multiple copies of TMHMM in parallel. I would normally use Python's multiprocessing library in this situation but it requires at least Python 2.6 and at the time of writing Galaxy still supports Python 2.4. + +Note that this is somewhat redundant with job-splitting available in Galaxy +itself (see the SignalP XML file for settings). + +Finally, you can opt to have a GFF3 file produced which will describe the +predicted signal peptide and mature peptide for each protein (using one of +the predictors which gives a cleavage site). *WORK IN PROGRESS* """ import sys import os -from seq_analysis_utils import stop_err, split_fasta, run_jobs +import tempfile +from seq_analysis_utils import stop_err, split_fasta, run_jobs, fasta_iterator FASTA_CHUNK = 500 MAX_LEN = 6000 #Found by trial and error -if len(sys.argv) != 6: - stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") +if len(sys.argv) not in [6,8]: + stop_err("Require five (or 7) arguments, organism, truncate, threads, " + "input protein FASTA file & output tabular file (plus " + "optionally cut method and GFF3 output file). " + "Got %i arguments." 
% (len(sys.argv)-1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: @@ -66,7 +81,7 @@ try: num_threads = int(sys.argv[3]) except: - num_threads = 0 + num_threads = 1 #Default, e.g. used "$NSLOTS" and environment variable not defined if num_threads < 1: stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) @@ -74,8 +89,27 @@ tabular_file = sys.argv[5] -def clean_tabular(raw_handle, out_handle): +if len(sys.argv) == 8: + cut_method = sys.argv[6] + if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: + stop_err("Invalid cut method %r" % cut_method) + gff3_file = sys.argv[7] +else: + cut_method = None + gff3_file = None + + +tmp_dir = tempfile.mkdtemp() + +def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): """Clean up SignalP output to make it tabular.""" + if cut_method: + cut_col = {"NN_Cmax" : 2, + "NN_Ymax" : 5, + "NN_Smax" : 8, + "HMM_Cmax" : 16}[cut_method] + else: + cut_col = None for line in raw_handle: if not line or line.startswith("#"): continue @@ -87,7 +121,59 @@ parts = parts[14:15] + parts[1:14] + parts[15:] out_handle.write("\t".join(parts) + "\n") -fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) +def make_gff(fasta_file, tabular_file, gff_file, cut_method): + cut_col, score_col = {"NN_Cmax" : (2,1), + "NN_Ymax" : (5,4), + "NN_Smax" : (8,7), + "HMM_Cmax" : (16,15), + }[cut_method] + + source = "SignalP" + strand = "." #not stranded + phase = "." 
#not phased + tags = "Note=%s" % cut_method + + tab_handle = open(tabular_file) + line = tab_handle.readline() + assert line.startswith("#ID\t"), line + + gff_handle = open(gff_file, "w") + gff_handle.write("##gff-version 3\n") + + for (title, seq), line in zip(fasta_iterator(fasta_file), tab_handle): + parts = line.rstrip("\n").split("\t") + seqid = parts[0] + assert title.startswith(seqid), "%s vs %s" % (seqid, title) + if len(seq)==0: + #Is it possible to have a zero length reference in GFF3? + continue + cut = int(parts[cut_col]) + if cut == 0: + assert cut_method == "HMM_Cmax", cut_method + #TODO - Why does it do this? + cut = 1 + assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) + score = parts[score_col] + gff_handle.write("##sequence-region %s %i %i\n" \ + % (seqid, 1, len(seq))) + #If the cut is at the very begining, there is no signal peptide! + if cut > 1: + #signal_peptide = SO:0000418 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + % (seqid, source, + "signal_peptide", 1, cut-1, + score, strand, phase, tags)) + #mature_protein_region = SO:0000419 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + % (seqid, source, + "mature_protein_region", cut, len(seq), + score, strand, phase, tags)) + tab_handle.close() + gff_handle.close() + + +fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), + n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) temp_files = [f+".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) @@ -98,6 +184,10 @@ for f in file_list: if os.path.isfile(f): os.remove(f) + try: + os.rmdir(tmp_dir) + except: + pass if len(jobs) > 1 and num_threads > 1: #A small "info" message for Galaxy to show the user. 
@@ -109,10 +199,9 @@ try: output = open(temp).readline() except IOError: - output = "" + output = "(no output)" if error_level or output.lower().startswith("error running"): - clean_up(fasta_files) - clean_up(temp_files) + clean_up(fasta_files + temp_files) stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results @@ -133,5 +222,8 @@ data_handle.close() out_handle.close() -clean_up(fasta_files) -clean_up(temp_files) +#GFF3: +if cut_method: + make_gff(fasta_file, tabular_file, gff3_file, cut_method) + +clean_up(fasta_files + temp_files)