Mercurial > repos > peterjc > tmhmm_and_signalp
diff tools/protein_analysis/signalp3.py @ 0:bca9bc7fdaef
Migrated tool version 0.0.1 from old tool shed archive to new tool shed repository
author | peterjc |
---|---|
date | Tue, 07 Jun 2011 18:03:34 -0400 |
parents | |
children | 0f1c61998b22 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/protein_analysis/signalp3.py Tue Jun 07 18:03:34 2011 -0400 @@ -0,0 +1,137 @@ +#!/usr/bin/env python +"""Wrapper for SignalP v3.0 for use in Galaxy. + +This script takes exactly fives command line arguments: + * the organism type (euk, gram+ or gram-) + * length to truncate sequences to (integer) + * number of threads to use (integer) + * an input protein FASTA filename + * output tabular filename. + +It then calls the standalone SignalP v3.0 program (not the webservice) +requesting the short output (one line per protein) using both NN and HMM +for predictions. + +First major feature is cleaning up the output. The raw output from SignalP +v3.0 looks like this (21 columns space separated): + +# SignalP-NN euk predictions # SignalP-HMM euk predictions +# name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? +gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N +gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N +gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N +gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N + +In order to make it easier to use in Galaxy, this wrapper script reformats +this to use tab separators. Also it removes the redundant truncated name +column, and assigns unique column names in the header: + +#ID NN_Cmax_score NN_Cmax_pos NN_Cmax_pred NN_Ymax_score NN_Ymax_pos NN_Ymax_pred NN_Smax_score NN_Smax_pos NN_Smax_pred NN_Smean_score NN_Smean_pred NN_D_score NN_D_pred HMM_bang HMM_Cmax_score HMM_Cmax_pos HMM_Cmax_pred HMM_Sprob_score HMM_Sprob_pred +gi|2781234|pdb|1JLY|B 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N Q 0.000 17 N 0.000 N +gi|4959044|gb|AAD34209.1|AF069992_1 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N Q 0.000 0 N 0.000 N +gi|671626|emb|CAA85685.1| 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N Q 0.000 0 N 0.000 N +gi|3298468|dbj|BAA31520.1| 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N Q 0.066 24 N 0.139 N + +The second major feature is overcoming SignalP's built in limit of 4000 +sequences by breaking up the input FASTA file into chunks. This also allows +us to pre-trim the sequences since SignalP only needs their starts. + +The third major feature is taking advantage of multiple cores (since SignalP +v3.0 itself is single threaded) by using the individual FASTA input files to +run multiple copies of TMHMM in parallel. I would normally use Python's +multiprocessing library in this situation but it requires at least Python 2.6 +and at the time of writing Galaxy still supports Python 2.4. +""" +import sys +import os +from seq_analysis_utils import stop_err, split_fasta, run_jobs + +FASTA_CHUNK = 500 +MAX_LEN = 6000 #Found by trial and error + +if len(sys.argv) != 6: + stop_err("Require five arguments, organism, truncate, threads, input protein FASTA file & output tabular file") + +organism = sys.argv[1] +if organism not in ["euk", "gram+", "gram-"]: + stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) + +try: + truncate = int(sys.argv[2]) +except: + truncate = 0 +if truncate < 0: + stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) + +try: + num_threads = int(sys.argv[3]) +except: + num_threads = 0 +if num_threads < 1: + stop_err("Threads argument %s is not a positive integer" % sys.argv[3]) + +fasta_file = sys.argv[4] + +tabular_file = sys.argv[5] + +def clean_tabular(raw_handle, out_handle): + """Clean up SignalP output to make it tabular.""" + for line in raw_handle: + if not line or line.startswith("#"): + continue + parts = line.rstrip("\r\n").split() + assert len(parts)==21, repr(line) + assert parts[14].startswith(parts[0]) + #Remove redundant truncated name column (col 0) + #and put full name at start (col 14) + parts = parts[14:15] + parts[1:14] + parts[15:] + out_handle.write("\t".join(parts) + "\n") + +fasta_files = split_fasta(fasta_file, tabular_file, n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) +temp_files = [f+".out" for f in fasta_files] +assert len(fasta_files) == len(temp_files) +jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) + for (fasta, temp) in zip(fasta_files, temp_files)] +assert len(fasta_files) == len(temp_files) == len(jobs) + +def clean_up(file_list): + for f in file_list: + if os.path.isfile(f): + os.remove(f) + +if len(jobs) > 1 and num_threads > 1: + #A small "info" message for Galaxy to show the user. + print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) +results = run_jobs(jobs, num_threads) +assert len(fasta_files) == len(temp_files) == len(jobs) +for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): + error_level = results[cmd] + try: + output = open(temp).readline() + except IOError: + output = "" + if error_level or output.lower().startswith("error running"): + clean_up(fasta_files) + clean_up(temp_files) + stop_err("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + error_level) +del results + +out_handle = open(tabular_file, "w") +fields = ["ID"] +#NN results: +for name in ["Cmax", "Ymax", "Smax"]: + fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) +fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) +#HMM results: +fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", + "HMM_Sprob_score", "HMM_Sprob_pred"]) +out_handle.write("#" + "\t".join(fields) + "\n") +for temp in temp_files: + data_handle = open(temp) + clean_tabular(data_handle, out_handle) + data_handle.close() +out_handle.close() + +clean_up(fasta_files) +clean_up(temp_files)