Mercurial > repos > peterjc > tmhmm_and_signalp
diff tools/protein_analysis/signalp3.py @ 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:46:42 -0500 |
parents | eb6ac44d4b8e |
children | a19b3ded8f33 |
line wrap: on
line diff
--- a/tools/protein_analysis/signalp3.py Tue Sep 01 09:56:36 2015 -0400 +++ b/tools/protein_analysis/signalp3.py Wed Feb 01 09:46:42 2017 -0500 @@ -21,9 +21,9 @@ # SignalP-NN euk predictions # SignalP-HMM euk predictions # name Cmax pos ? Ymax pos ? Smax pos ? Smean ? D ? # name ! Cmax pos ? Sprob ? -gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N -gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N -gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N +gi|2781234|pdb|1JLY| 0.061 17 N 0.043 17 N 0.199 1 N 0.067 N 0.055 N gi|2781234|pdb|1JLY|B Q 0.000 17 N 0.000 N +gi|4959044|gb|AAD342 0.099 191 N 0.012 38 N 0.023 12 N 0.014 N 0.013 N gi|4959044|gb|AAD34209.1|AF069992_1 Q 0.000 0 N 0.000 N +gi|671626|emb|CAA856 0.139 381 N 0.020 8 N 0.121 4 N 0.067 N 0.044 N gi|671626|emb|CAA85685.1| Q 0.000 0 N 0.000 N gi|3298468|dbj|BAA31 0.208 24 N 0.184 38 N 0.980 32 Y 0.613 Y 0.398 N gi|3298468|dbj|BAA31520.1| Q 0.066 24 N 0.139 N In order to make it easier to use in Galaxy, this wrapper script reformats @@ -56,28 +56,28 @@ import sys import os import tempfile -from seq_analysis_utils import sys_exit, split_fasta, fasta_iterator +from seq_analysis_utils import split_fasta, fasta_iterator from seq_analysis_utils import run_jobs, thread_count FASTA_CHUNK = 500 -MAX_LEN = 6000 #Found by trial and error +MAX_LEN = 6000 # Found by trial and error -if len(sys.argv) not in [6,8]: - sys_exit("Require five (or 7) arguments, organism, truncate, threads, " +if len(sys.argv) not in [6, 8]: + sys.exit("Require five (or 7) arguments, organism, truncate, threads, " "input protein FASTA file & output tabular file (plus " "optionally cut method and GFF3 output file). " - "Got %i arguments." % (len(sys.argv)-1)) + "Got %i arguments." % (len(sys.argv) - 1)) organism = sys.argv[1] if organism not in ["euk", "gram+", "gram-"]: - sys_exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) + sys.exit("Organism argument %s is not one of euk, gram+ or gram-" % organism) try: truncate = int(sys.argv[2]) -except: +except ValueError: truncate = 0 if truncate < 0: - sys_exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) + sys.exit("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) num_threads = thread_count(sys.argv[3], default=4) fasta_file = sys.argv[4] @@ -86,7 +86,7 @@ if len(sys.argv) == 8: cut_method = sys.argv[6] if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: - sys_exit("Invalid cut method %r" % cut_method) + sys.exit("Invalid cut method %r" % cut_method) gff3_file = sys.argv[7] else: cut_method = None @@ -95,39 +95,41 @@ tmp_dir = tempfile.mkdtemp() + def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): """Clean up SignalP output to make it tabular.""" if cut_method: - cut_col = {"NN_Cmax" : 2, - "NN_Ymax" : 5, - "NN_Smax" : 8, - "HMM_Cmax" : 16}[cut_method] + cut_col = {"NN_Cmax": 2, + "NN_Ymax": 5, + "NN_Smax": 8, + "HMM_Cmax": 16}[cut_method] else: cut_col = None for line in raw_handle: if not line or line.startswith("#"): continue parts = line.rstrip("\r\n").split() - assert len(parts)==21, repr(line) + assert len(parts) == 21, repr(line) assert parts[14].startswith(parts[0]), \ "Bad entry in SignalP output, ID miss-match:\n%r" % line - #Remove redundant truncated name column (col 0) - #and put full name at start (col 14) + # Remove redundant truncated name column (col 0) + # and put full name at start (col 14) parts = parts[14:15] + parts[1:14] + parts[15:] out_handle.write("\t".join(parts) + "\n") + def make_gff(fasta_file, tabular_file, gff_file, cut_method): - cut_col, score_col = {"NN_Cmax" : (2,1), - "NN_Ymax" : (5,4), - "NN_Smax" : (8,7), - "HMM_Cmax" : (16,15), + cut_col, score_col = {"NN_Cmax": (2, 1), + "NN_Ymax": (5, 4), + "NN_Smax": (8, 7), + "HMM_Cmax": (16, 15), }[cut_method] source = "SignalP" - strand = "." #not stranded - phase = "." #not phased + strand = "." # not stranded + phase = "." # not phased tags = "Note=%s" % cut_method - + tab_handle = open(tabular_file) line = tab_handle.readline() assert line.startswith("#ID\t"), line @@ -139,53 +141,55 @@ parts = line.rstrip("\n").split("\t") seqid = parts[0] assert title.startswith(seqid), "%s vs %s" % (seqid, title) - if len(seq)==0: - #Is it possible to have a zero length reference in GFF3? + if not seq: + # Is it possible to have a zero length reference in GFF3? continue cut = int(parts[cut_col]) if cut == 0: assert cut_method == "HMM_Cmax", cut_method - #TODO - Why does it do this? + # TODO - Why does it do this? cut = 1 assert 1 <= cut <= len(seq), "%i for %s len %i" % (cut, seqid, len(seq)) score = parts[score_col] - gff_handle.write("##sequence-region %s %i %i\n" \ + gff_handle.write("##sequence-region %s %i %i\n" % (seqid, 1, len(seq))) - #If the cut is at the very begining, there is no signal peptide! + # If the cut is at the very begining, there is no signal peptide! if cut > 1: - #signal_peptide = SO:0000418 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + # signal_peptide = SO:0000418 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" % (seqid, source, - "signal_peptide", 1, cut-1, + "signal_peptide", 1, cut - 1, score, strand, phase, tags)) - #mature_protein_region = SO:0000419 - gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" \ + # mature_protein_region = SO:0000419 + gff_handle.write("%s\t%s\t%s\t%i\t%i\t%s\t%s\t%s\t%s\n" % (seqid, source, "mature_protein_region", cut, len(seq), score, strand, phase, tags)) - tab_handle.close() + tab_handle.close() gff_handle.close() fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "signalp"), n=FASTA_CHUNK, truncate=truncate, max_len=MAX_LEN) -temp_files = [f+".out" for f in fasta_files] +temp_files = [f + ".out" for f in fasta_files] assert len(fasta_files) == len(temp_files) jobs = ["signalp -short -t %s %s > %s" % (organism, fasta, temp) for (fasta, temp) in zip(fasta_files, temp_files)] assert len(fasta_files) == len(temp_files) == len(jobs) + def clean_up(file_list): + """Remove temp files, and if possible the temp directory.""" for f in file_list: if os.path.isfile(f): os.remove(f) try: os.rmdir(tmp_dir) - except: + except Exception: pass if len(jobs) > 1 and num_threads > 1: - #A small "info" message for Galaxy to show the user. + # A small "info" message for Galaxy to show the user. print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) results = run_jobs(jobs, num_threads) assert len(fasta_files) == len(temp_files) == len(jobs) @@ -197,17 +201,17 @@ output = "(no output)" if error_level or output.lower().startswith("error running"): clean_up(fasta_files + temp_files) - sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), + sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), error_level) del results out_handle = open(tabular_file, "w") fields = ["ID"] -#NN results: +# NN results: for name in ["Cmax", "Ymax", "Smax"]: - fields.extend(["NN_%s_score"%name, "NN_%s_pos"%name, "NN_%s_pred"%name]) + fields.extend(["NN_%s_score" % name, "NN_%s_pos" % name, "NN_%s_pred" % name]) fields.extend(["NN_Smean_score", "NN_Smean_pred", "NN_D_score", "NN_D_pred"]) -#HMM results: +# HMM results: fields.extend(["HMM_type", "HMM_Cmax_score", "HMM_Cmax_pos", "HMM_Cmax_pred", "HMM_Sprob_score", "HMM_Sprob_pred"]) out_handle.write("#" + "\t".join(fields) + "\n") @@ -217,7 +221,7 @@ data_handle.close() out_handle.close() -#GFF3: +# GFF3: if cut_method: make_gff(fasta_file, tabular_file, gff3_file, cut_method)