comparison tools/protein_analysis/psortb.py @ 21:238eae32483c draft

"Check this is up to date with all 2020 changes (black etc)"
author peterjc
date Thu, 17 Jun 2021 08:21:06 +0000
parents a19b3ded8f33
children
comparison
equal deleted inserted replaced
20:a19b3ded8f33 21:238eae32483c
35 if "-v" in sys.argv or "--version" in sys.argv: 35 if "-v" in sys.argv or "--version" in sys.argv:
36 """Return underlying PSORTb's version""" 36 """Return underlying PSORTb's version"""
37 sys.exit(os.system("psort --version")) 37 sys.exit(os.system("psort --version"))
38 38
39 if len(sys.argv) != 8: 39 if len(sys.argv) != 8:
40 sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " 40 sys.exit(
41 "output (e.g. terse/normal/long), cutoff, divergent, input protein " 41 "Require 7 arguments, number of threads (int), type (e.g. archaea), "
42 "FASTA file & output tabular file") 42 "output (e.g. terse/normal/long), cutoff, divergent, input protein "
43 "FASTA file & output tabular file"
44 )
43 45
44 num_threads = thread_count(sys.argv[1], default=4) 46 num_threads = thread_count(sys.argv[1], default=4)
45 org_type = sys.argv[2] 47 org_type = sys.argv[2]
46 out_type = sys.argv[3] 48 out_type = sys.argv[3]
47 cutoff = sys.argv[4] 49 cutoff = sys.argv[4]
56 divergent = "" 58 divergent = ""
57 fasta_file = sys.argv[6] 59 fasta_file = sys.argv[6]
58 tabular_file = sys.argv[7] 60 tabular_file = sys.argv[7]
59 61
60 if out_type == "terse": 62 if out_type == "terse":
61 header = ['SeqID', 'Localization', 'Score'] 63 header = ["SeqID", "Localization", "Score"]
62 elif out_type == "normal": 64 elif out_type == "normal":
63 sys.exit("Normal output not implemented yet, sorry.") 65 sys.exit("Normal output not implemented yet, sorry.")
64 elif out_type == "long": 66 elif out_type == "long":
65 if org_type == "-n": 67 if org_type == "-n":
66 # Gram negative bacteria 68 # Gram negative bacteria
67 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', 69 header = [
68 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', 70 "SeqID",
69 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', 71 "CMSVM-_Localization",
70 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', 72 "CMSVM-_Details",
71 'Profile-_Localization', 'Profile-_Details', 73 "CytoSVM-_Localization",
72 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 74 "CytoSVM-_Details",
73 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', 75 "ECSVM-_Localization",
74 'Signal-_Localization', 'Signal-_Details', 76 "ECSVM-_Details",
75 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', 77 "ModHMM-_Localization",
76 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 78 "ModHMM-_Details",
77 'Secondary_Localization', 'PSortb_Version'] 79 "Motif-_Localization",
80 "Motif-_Details",
81 "OMPMotif-_Localization",
82 "OMPMotif-_Details",
83 "OMSVM-_Localization",
84 "OMSVM-_Details",
85 "PPSVM-_Localization",
86 "PPSVM-_Details",
87 "Profile-_Localization",
88 "Profile-_Details",
89 "SCL-BLAST-_Localization",
90 "SCL-BLAST-_Details",
91 "SCL-BLASTe-_Localization",
92 "SCL-BLASTe-_Details",
93 "Signal-_Localization",
94 "Signal-_Details",
95 "Cytoplasmic_Score",
96 "CytoplasmicMembrane_Score",
97 "Periplasmic_Score",
98 "OuterMembrane_Score",
99 "Extracellular_Score",
100 "Final_Localization",
101 "Final_Localization_Details",
102 "Final_Score",
103 "Secondary_Localization",
104 "PSortb_Version",
105 ]
78 elif org_type == "-p": 106 elif org_type == "-p":
79 # Gram positive bacteria 107 # Gram positive bacteria
80 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', 108 header = [
81 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 109 "SeqID",
82 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', 110 "CMSVM+_Localization",
83 'Profile+_Localization', 'Profile+_Details', 111 "CMSVM+_Details",
84 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 112 "CWSVM+_Localization",
85 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', 113 "CWSVM+_Details",
86 'Signal+_Localization', 'Signal+_Details', 114 "CytoSVM+_Localization",
87 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 115 "CytoSVM+_Details",
88 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 116 "ECSVM+_Localization",
89 'Secondary_Localization', 'PSortb_Version'] 117 "ECSVM+_Details",
118 "ModHMM+_Localization",
119 "ModHMM+_Details",
120 "Motif+_Localization",
121 "Motif+_Details",
122 "Profile+_Localization",
123 "Profile+_Details",
124 "SCL-BLAST+_Localization",
125 "SCL-BLAST+_Details",
126 "SCL-BLASTe+_Localization",
127 "SCL-BLASTe+_Details",
128 "Signal+_Localization",
129 "Signal+_Details",
130 "Cytoplasmic_Score",
131 "CytoplasmicMembrane_Score",
132 "Cellwall_Score",
133 "Extracellular_Score",
134 "Final_Localization",
135 "Final_Localization_Details",
136 "Final_Score",
137 "Secondary_Localization",
138 "PSortb_Version",
139 ]
90 elif org_type == "-a": 140 elif org_type == "-a":
91 # Archaea 141 # Archaea
92 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', 142 header = [
93 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 143 "SeqID",
94 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', 144 "CMSVM_a_Localization",
95 'Profile_a_Localization', 'Profile_a_Details', 145 "CMSVM_a_Details",
96 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 146 "CWSVM_a_Localization",
97 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', 147 "CWSVM_a_Details",
98 'Signal_a_Localization', 'Signal_a_Details', 148 "CytoSVM_a_Localization",
99 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 149 "CytoSVM_a_Details",
100 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 150 "ECSVM_a_Localization",
101 'Secondary_Localization', 'PSortb_Version'] 151 "ECSVM_a_Details",
152 "ModHMM_a_Localization",
153 "ModHMM_a_Details",
154 "Motif_a_Localization",
155 "Motif_a_Details",
156 "Profile_a_Localization",
157 "Profile_a_Details",
158 "SCL-BLAST_a_Localization",
159 "SCL-BLAST_a_Details",
160 "SCL-BLASTe_a_Localization",
161 "SCL-BLASTe_a_Details",
162 "Signal_a_Localization",
163 "Signal_a_Details",
164 "Cytoplasmic_Score",
165 "CytoplasmicMembrane_Score",
166 "Cellwall_Score",
167 "Extracellular_Score",
168 "Final_Localization",
169 "Final_Localization_Details",
170 "Final_Score",
171 "Secondary_Localization",
172 "PSortb_Version",
173 ]
102 else: 174 else:
103 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) 175 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type)
104 else: 176 else:
105 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) 177 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type)
106 178
121 continue 193 continue
122 if not parts[-1] and len(parts) == len(header) + 1: 194 if not parts[-1] and len(parts) == len(header) + 1:
123 # Ignore dummy blank extra column, e.g. 195 # Ignore dummy blank extra column, e.g.
124 # "...2.0\t\tPSORTb version 3.0\t\n" 196 # "...2.0\t\tPSORTb version 3.0\t\n"
125 parts = parts[:-1] 197 parts = parts[:-1]
126 assert len(parts) == len(header), \ 198 assert len(parts) == len(header), "%i fields, not %i, in line:\n%r" % (
127 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) 199 len(line),
200 len(header),
201 line,
202 )
128 out_handle.write(line) 203 out_handle.write(line)
129 count += 1 204 count += 1
130 return count 205 return count
131 206
132 207
133 # Note that if the input FASTA file contains no sequences, 208 # Note that if the input FASTA file contains no sequences,
134 # split_fasta returns an empty list (i.e. zero temp files). 209 # split_fasta returns an empty list (i.e. zero temp files).
135 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) 210 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
136 temp_files = [f + ".out" for f in fasta_files] 211 temp_files = [f + ".out" for f in fasta_files]
137 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) 212 jobs = [
138 for fasta, temp in zip(fasta_files, temp_files)] 213 "psort %s %s %s -o %s %s > %s"
214 % (org_type, cutoff, divergent, out_type, fasta, temp)
215 for fasta, temp in zip(fasta_files, temp_files)
216 ]
139 217
140 218
141 def clean_up(file_list): 219 def clean_up(file_list):
142 for f in file_list: 220 for f in file_list:
143 if os.path.isfile(f): 221 if os.path.isfile(f):
158 try: 236 try:
159 output = open(temp).readline() 237 output = open(temp).readline()
160 except IOError: 238 except IOError:
161 output = "" 239 output = ""
162 clean_up(fasta_files + temp_files) 240 clean_up(fasta_files + temp_files)
163 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 241 sys.exit(
164 error_level) 242 "One or more tasks failed, e.g. %i from %r gave:\n%s"
243 % (error_level, cmd, output),
244 error_level,
245 )
165 del results 246 del results
166 del jobs 247 del jobs
167 248
168 out_handle = open(tabular_file, "w") 249 out_handle = open(tabular_file, "w")
169 out_handle.write("#%s\n" % "\t".join(header)) 250 out_handle.write("#%s\n" % "\t".join(header))