Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/signalp3.py @ 11:99b82a2b1272 draft
Uploaded v0.2.0 which added PSORTb wrapper (written with Konrad Paszkiewicz)
| author | peterjc |
|---|---|
| date | Wed, 03 Apr 2013 10:49:10 -0400 |
| parents | e52220a9ddad |
| children | eb6ac44d4b8e |
comparison
equal
deleted
inserted
replaced
| 10:09ff180d1615 | 11:99b82a2b1272 |
|---|---|
| 61 | 61 |
| 62 FASTA_CHUNK = 500 | 62 FASTA_CHUNK = 500 |
| 63 MAX_LEN = 6000 #Found by trial and error | 63 MAX_LEN = 6000 #Found by trial and error |
| 64 | 64 |
| 65 if len(sys.argv) not in [6,8]: | 65 if len(sys.argv) not in [6,8]: |
| 66 stop_err("Require five (or 7) arguments, organism, truncate, threads, " | 66 stop_err("Require five (or 7) arguments, organism, truncate, threads, " |
| 67 "input protein FASTA file & output tabular file (plus " | 67 "input protein FASTA file & output tabular file (plus " |
| 68 "optionally cut method and GFF3 output file). " | 68 "optionally cut method and GFF3 output file). " |
| 69 "Got %i arguments." % (len(sys.argv)-1)) | 69 "Got %i arguments." % (len(sys.argv)-1)) |
| 70 | 70 |
| 71 organism = sys.argv[1] | 71 organism = sys.argv[1] |
| 72 if organism not in ["euk", "gram+", "gram-"]: | 72 if organism not in ["euk", "gram+", "gram-"]: |
| 73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) | 73 stop_err("Organism argument %s is not one of euk, gram+ or gram-" % organism) |
| 74 | 74 |
| 75 try: | 75 try: |
| 76 truncate = int(sys.argv[2]) | 76 truncate = int(sys.argv[2]) |
| 77 except: | 77 except: |
| 78 truncate = 0 | 78 truncate = 0 |
| 79 if truncate < 0: | 79 if truncate < 0: |
| 80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) | 80 stop_err("Truncate argument %s is not a positive integer (or zero)" % sys.argv[2]) |
| 81 | 81 |
| 82 num_threads = thread_count(sys.argv[3], default=4) | 82 num_threads = thread_count(sys.argv[3], default=4) |
| 83 fasta_file = sys.argv[4] | 83 fasta_file = sys.argv[4] |
| 84 tabular_file = sys.argv[5] | 84 tabular_file = sys.argv[5] |
| 85 | 85 |
| 86 if len(sys.argv) == 8: | 86 if len(sys.argv) == 8: |
| 87 cut_method = sys.argv[6] | 87 cut_method = sys.argv[6] |
| 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: | 88 if cut_method not in ["NN_Cmax", "NN_Ymax", "NN_Smax", "HMM_Cmax"]: |
| 89 stop_err("Invalid cut method %r" % cut_method) | 89 stop_err("Invalid cut method %r" % cut_method) |
| 90 gff3_file = sys.argv[7] | 90 gff3_file = sys.argv[7] |
| 91 else: | 91 else: |
| 92 cut_method = None | 92 cut_method = None |
| 93 gff3_file = None | 93 gff3_file = None |
| 94 | 94 |
| 95 | 95 |
| 96 tmp_dir = tempfile.mkdtemp() | 96 tmp_dir = tempfile.mkdtemp() |
| 97 | 97 |
| 98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): | 98 def clean_tabular(raw_handle, out_handle, gff_handle=None, cut_method=None): |
| 99 """Clean up SignalP output to make it tabular.""" | 99 """Clean up SignalP output to make it tabular.""" |
| 100 if cut_method: | 100 if cut_method: |
| 101 cut_col = {"NN_Cmax" : 2, | 101 cut_col = {"NN_Cmax" : 2, |
| 102 "NN_Ymax" : 5, | 102 "NN_Ymax" : 5, |
| 103 "NN_Smax" : 8, | 103 "NN_Smax" : 8, |
| 104 "HMM_Cmax" : 16}[cut_method] | 104 "HMM_Cmax" : 16}[cut_method] |
| 105 else: | 105 else: |
| 106 cut_col = None | 106 cut_col = None |
| 107 for line in raw_handle: | 107 for line in raw_handle: |
| 108 if not line or line.startswith("#"): | 108 if not line or line.startswith("#"): |
| 109 continue | 109 continue |
| 110 parts = line.rstrip("\r\n").split() | 110 parts = line.rstrip("\r\n").split() |
| 111 assert len(parts)==21, repr(line) | 111 assert len(parts)==21, repr(line) |
| 112 assert parts[14].startswith(parts[0]) | 112 assert parts[14].startswith(parts[0]), \ |
| | 113 "Bad entry in SignalP output, ID miss-match:\n%r" % line |
| 113 #Remove redundant truncated name column (col 0) | 114 #Remove redundant truncated name column (col 0) |
| 114 #and put full name at start (col 14) | 115 #and put full name at start (col 14) |
| 115 parts = parts[14:15] + parts[1:14] + parts[15:] | 116 parts = parts[14:15] + parts[1:14] + parts[15:] |
| 116 out_handle.write("\t".join(parts) + "\n") | 117 out_handle.write("\t".join(parts) + "\n") |
| 117 | 118 |
| 216 data_handle.close() | 217 data_handle.close() |
| 217 out_handle.close() | 218 out_handle.close() |
| 218 | 219 |
| 219 #GFF3: | 220 #GFF3: |
| 220 if cut_method: | 221 if cut_method: |
| 221 make_gff(fasta_file, tabular_file, gff3_file, cut_method) | 222 make_gff(fasta_file, tabular_file, gff3_file, cut_method) |
| 222 | 223 |
| 223 clean_up(fasta_files + temp_files) | 224 clean_up(fasta_files + temp_files) |
