Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/psortb.py @ 21:238eae32483c draft
"Check this is up to date with all 2020 changes (black etc)"
author | peterjc |
---|---|
date | Thu, 17 Jun 2021 08:21:06 +0000 |
parents | a19b3ded8f33 |
children |
comparison
equal
deleted
inserted
replaced
20:a19b3ded8f33 | 21:238eae32483c |
---|---|
35 if "-v" in sys.argv or "--version" in sys.argv: | 35 if "-v" in sys.argv or "--version" in sys.argv: |
36 """Return underlying PSORTb's version""" | 36 """Return underlying PSORTb's version""" |
37 sys.exit(os.system("psort --version")) | 37 sys.exit(os.system("psort --version")) |
38 | 38 |
39 if len(sys.argv) != 8: | 39 if len(sys.argv) != 8: |
40 sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " | 40 sys.exit( |
41 "output (e.g. terse/normal/long), cutoff, divergent, input protein " | 41 "Require 7 arguments, number of threads (int), type (e.g. archaea), " |
42 "FASTA file & output tabular file") | 42 "output (e.g. terse/normal/long), cutoff, divergent, input protein " |
43 "FASTA file & output tabular file" | |
44 ) | |
43 | 45 |
44 num_threads = thread_count(sys.argv[1], default=4) | 46 num_threads = thread_count(sys.argv[1], default=4) |
45 org_type = sys.argv[2] | 47 org_type = sys.argv[2] |
46 out_type = sys.argv[3] | 48 out_type = sys.argv[3] |
47 cutoff = sys.argv[4] | 49 cutoff = sys.argv[4] |
56 divergent = "" | 58 divergent = "" |
57 fasta_file = sys.argv[6] | 59 fasta_file = sys.argv[6] |
58 tabular_file = sys.argv[7] | 60 tabular_file = sys.argv[7] |
59 | 61 |
60 if out_type == "terse": | 62 if out_type == "terse": |
61 header = ['SeqID', 'Localization', 'Score'] | 63 header = ["SeqID", "Localization", "Score"] |
62 elif out_type == "normal": | 64 elif out_type == "normal": |
63 sys.exit("Normal output not implemented yet, sorry.") | 65 sys.exit("Normal output not implemented yet, sorry.") |
64 elif out_type == "long": | 66 elif out_type == "long": |
65 if org_type == "-n": | 67 if org_type == "-n": |
66 # Gram negative bacteria | 68 # Gram negative bacteria |
67 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', | 69 header = [ |
68 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', | 70 "SeqID", |
69 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', | 71 "CMSVM-_Localization", |
70 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', | 72 "CMSVM-_Details", |
71 'Profile-_Localization', 'Profile-_Details', | 73 "CytoSVM-_Localization", |
72 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', | 74 "CytoSVM-_Details", |
73 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', | 75 "ECSVM-_Localization", |
74 'Signal-_Localization', 'Signal-_Details', | 76 "ECSVM-_Details", |
75 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', | 77 "ModHMM-_Localization", |
76 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 78 "ModHMM-_Details", |
77 'Secondary_Localization', 'PSortb_Version'] | 79 "Motif-_Localization", |
80 "Motif-_Details", | |
81 "OMPMotif-_Localization", | |
82 "OMPMotif-_Details", | |
83 "OMSVM-_Localization", | |
84 "OMSVM-_Details", | |
85 "PPSVM-_Localization", | |
86 "PPSVM-_Details", | |
87 "Profile-_Localization", | |
88 "Profile-_Details", | |
89 "SCL-BLAST-_Localization", | |
90 "SCL-BLAST-_Details", | |
91 "SCL-BLASTe-_Localization", | |
92 "SCL-BLASTe-_Details", | |
93 "Signal-_Localization", | |
94 "Signal-_Details", | |
95 "Cytoplasmic_Score", | |
96 "CytoplasmicMembrane_Score", | |
97 "Periplasmic_Score", | |
98 "OuterMembrane_Score", | |
99 "Extracellular_Score", | |
100 "Final_Localization", | |
101 "Final_Localization_Details", | |
102 "Final_Score", | |
103 "Secondary_Localization", | |
104 "PSortb_Version", | |
105 ] | |
78 elif org_type == "-p": | 106 elif org_type == "-p": |
79 # Gram positive bacteria | 107 # Gram positive bacteria |
80 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', | 108 header = [ |
81 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', | 109 "SeqID", |
82 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', | 110 "CMSVM+_Localization", |
83 'Profile+_Localization', 'Profile+_Details', | 111 "CMSVM+_Details", |
84 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', | 112 "CWSVM+_Localization", |
85 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', | 113 "CWSVM+_Details", |
86 'Signal+_Localization', 'Signal+_Details', | 114 "CytoSVM+_Localization", |
87 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | 115 "CytoSVM+_Details", |
88 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 116 "ECSVM+_Localization", |
89 'Secondary_Localization', 'PSortb_Version'] | 117 "ECSVM+_Details", |
118 "ModHMM+_Localization", | |
119 "ModHMM+_Details", | |
120 "Motif+_Localization", | |
121 "Motif+_Details", | |
122 "Profile+_Localization", | |
123 "Profile+_Details", | |
124 "SCL-BLAST+_Localization", | |
125 "SCL-BLAST+_Details", | |
126 "SCL-BLASTe+_Localization", | |
127 "SCL-BLASTe+_Details", | |
128 "Signal+_Localization", | |
129 "Signal+_Details", | |
130 "Cytoplasmic_Score", | |
131 "CytoplasmicMembrane_Score", | |
132 "Cellwall_Score", | |
133 "Extracellular_Score", | |
134 "Final_Localization", | |
135 "Final_Localization_Details", | |
136 "Final_Score", | |
137 "Secondary_Localization", | |
138 "PSortb_Version", | |
139 ] | |
90 elif org_type == "-a": | 140 elif org_type == "-a": |
91 # Archaea | 141 # Archaea |
92 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', | 142 header = [ |
93 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', | 143 "SeqID", |
94 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', | 144 "CMSVM_a_Localization", |
95 'Profile_a_Localization', 'Profile_a_Details', | 145 "CMSVM_a_Details", |
96 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', | 146 "CWSVM_a_Localization", |
97 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', | 147 "CWSVM_a_Details", |
98 'Signal_a_Localization', 'Signal_a_Details', | 148 "CytoSVM_a_Localization", |
99 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | 149 "CytoSVM_a_Details", |
100 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 150 "ECSVM_a_Localization", |
101 'Secondary_Localization', 'PSortb_Version'] | 151 "ECSVM_a_Details", |
152 "ModHMM_a_Localization", | |
153 "ModHMM_a_Details", | |
154 "Motif_a_Localization", | |
155 "Motif_a_Details", | |
156 "Profile_a_Localization", | |
157 "Profile_a_Details", | |
158 "SCL-BLAST_a_Localization", | |
159 "SCL-BLAST_a_Details", | |
160 "SCL-BLASTe_a_Localization", | |
161 "SCL-BLASTe_a_Details", | |
162 "Signal_a_Localization", | |
163 "Signal_a_Details", | |
164 "Cytoplasmic_Score", | |
165 "CytoplasmicMembrane_Score", | |
166 "Cellwall_Score", | |
167 "Extracellular_Score", | |
168 "Final_Localization", | |
169 "Final_Localization_Details", | |
170 "Final_Score", | |
171 "Secondary_Localization", | |
172 "PSortb_Version", | |
173 ] | |
102 else: | 174 else: |
103 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) | 175 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) |
104 else: | 176 else: |
105 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) | 177 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) |
106 | 178 |
121 continue | 193 continue |
122 if not parts[-1] and len(parts) == len(header) + 1: | 194 if not parts[-1] and len(parts) == len(header) + 1: |
123 # Ignore dummy blank extra column, e.g. | 195 # Ignore dummy blank extra column, e.g. |
124 # "...2.0\t\tPSORTb version 3.0\t\n" | 196 # "...2.0\t\tPSORTb version 3.0\t\n" |
125 parts = parts[:-1] | 197 parts = parts[:-1] |
126 assert len(parts) == len(header), \ | 198 assert len(parts) == len(header), "%i fields, not %i, in line:\n%r" % ( |
127 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) | 199 len(line), |
200 len(header), | |
201 line, | |
202 ) | |
128 out_handle.write(line) | 203 out_handle.write(line) |
129 count += 1 | 204 count += 1 |
130 return count | 205 return count |
131 | 206 |
132 | 207 |
133 # Note that if the input FASTA file contains no sequences, | 208 # Note that if the input FASTA file contains no sequences, |
134 # split_fasta returns an empty list (i.e. zero temp files). | 209 # split_fasta returns an empty list (i.e. zero temp files). |
135 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) | 210 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
136 temp_files = [f + ".out" for f in fasta_files] | 211 temp_files = [f + ".out" for f in fasta_files] |
137 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) | 212 jobs = [ |
138 for fasta, temp in zip(fasta_files, temp_files)] | 213 "psort %s %s %s -o %s %s > %s" |
214 % (org_type, cutoff, divergent, out_type, fasta, temp) | |
215 for fasta, temp in zip(fasta_files, temp_files) | |
216 ] | |
139 | 217 |
140 | 218 |
141 def clean_up(file_list): | 219 def clean_up(file_list): |
142 for f in file_list: | 220 for f in file_list: |
143 if os.path.isfile(f): | 221 if os.path.isfile(f): |
158 try: | 236 try: |
159 output = open(temp).readline() | 237 output = open(temp).readline() |
160 except IOError: | 238 except IOError: |
161 output = "" | 239 output = "" |
162 clean_up(fasta_files + temp_files) | 240 clean_up(fasta_files + temp_files) |
163 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 241 sys.exit( |
164 error_level) | 242 "One or more tasks failed, e.g. %i from %r gave:\n%s" |
243 % (error_level, cmd, output), | |
244 error_level, | |
245 ) | |
165 del results | 246 del results |
166 del jobs | 247 del jobs |
167 | 248 |
168 out_handle = open(tabular_file, "w") | 249 out_handle = open(tabular_file, "w") |
169 out_handle.write("#%s\n" % "\t".join(header)) | 250 out_handle.write("#%s\n" % "\t".join(header)) |