comparison tools/protein_analysis/psortb.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
comparison
equal deleted inserted replaced
18:eb6ac44d4b8e 19:f3ecd80850e2
22 with a # character as used elsewhere in Galaxy. 22 with a # character as used elsewhere in Galaxy.
23 """ 23 """
24 import sys 24 import sys
25 import os 25 import os
26 import tempfile 26 import tempfile
27 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count 27 from seq_analysis_utils import split_fasta, run_jobs, thread_count
28 28
29 FASTA_CHUNK = 500 29 FASTA_CHUNK = 500
30 30
31 if "-v" in sys.argv or "--version" in sys.argv: 31 if "-v" in sys.argv or "--version" in sys.argv:
32 """Return underlying PSORTb's version""" 32 """Return underlying PSORTb's version"""
33 sys.exit(os.system("psort --version")) 33 sys.exit(os.system("psort --version"))
34 34
35 if len(sys.argv) != 8: 35 if len(sys.argv) != 8:
36 sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " 36 sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), "
37 "output (e.g. terse/normal/long), cutoff, divergent, input protein " 37 "output (e.g. terse/normal/long), cutoff, divergent, input protein "
38 "FASTA file & output tabular file") 38 "FASTA file & output tabular file")
39 39
40 num_threads = thread_count(sys.argv[1], default=4) 40 num_threads = thread_count(sys.argv[1], default=4)
41 org_type = sys.argv[2] 41 org_type = sys.argv[2]
54 tabular_file = sys.argv[7] 54 tabular_file = sys.argv[7]
55 55
56 if out_type == "terse": 56 if out_type == "terse":
57 header = ['SeqID', 'Localization', 'Score'] 57 header = ['SeqID', 'Localization', 'Score']
58 elif out_type == "normal": 58 elif out_type == "normal":
59 sys_exit("Normal output not implemented yet, sorry.") 59 sys.exit("Normal output not implemented yet, sorry.")
60 elif out_type == "long": 60 elif out_type == "long":
61 if org_type == "-n": 61 if org_type == "-n":
62 #Gram negative bacteria 62 # Gram negative bacteria
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', 63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', 64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', 65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', 66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
67 'Profile-_Localization', 'Profile-_Details', 67 'Profile-_Localization', 'Profile-_Details',
69 'Signal-_Localization', 'Signal-_Details', 69 'Signal-_Localization', 'Signal-_Details',
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', 70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
72 'Secondary_Localization', 'PSortb_Version'] 72 'Secondary_Localization', 'PSortb_Version']
73 elif org_type == "-p": 73 elif org_type == "-p":
74 #Gram positive bacteria 74 # Gram positive bacteria
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', 75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', 77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
78 'Profile+_Localization', 'Profile+_Details', 78 'Profile+_Localization', 'Profile+_Details',
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', 79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
80 'Signal+_Localization', 'Signal+_Details', 80 'Signal+_Localization', 'Signal+_Details',
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
83 'Secondary_Localization', 'PSortb_Version'] 83 'Secondary_Localization', 'PSortb_Version']
84 elif org_type == "-a": 84 elif org_type == "-a":
85 #Archaea 85 # Archaea
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', 86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', 88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
89 'Profile_a_Localization', 'Profile_a_Details', 89 'Profile_a_Localization', 'Profile_a_Details',
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', 90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
91 'Signal_a_Localization', 'Signal_a_Details', 91 'Signal_a_Localization', 'Signal_a_Details',
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
94 'Secondary_Localization', 'PSortb_Version'] 94 'Secondary_Localization', 'PSortb_Version']
95 else: 95 else:
96 sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) 96 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type)
97 else: 97 else:
98 sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) 98 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type)
99 99
100 tmp_dir = tempfile.mkdtemp() 100 tmp_dir = tempfile.mkdtemp()
101
101 102
102 def clean_tabular(raw_handle, out_handle): 103 def clean_tabular(raw_handle, out_handle):
103 """Clean up tabular TMHMM output, returns output line count.""" 104 """Clean up tabular TMHMM output, returns output line count."""
104 global header 105 global header
105 count = 0 106 count = 0
106 for line in raw_handle: 107 for line in raw_handle:
107 if not line.strip() or line.startswith("#"): 108 if not line.strip() or line.startswith("#"):
108 #Ignore any blank lines or comment lines 109 # Ignore any blank lines or comment lines
109 continue 110 continue
110 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] 111 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")]
111 if parts == header: 112 if parts == header:
112 #Ignore the header line 113 # Ignore the header line
113 continue 114 continue
114 if not parts[-1] and len(parts) == len(header) + 1: 115 if not parts[-1] and len(parts) == len(header) + 1:
115 #Ignore dummy blank extra column, e.g. 116 # Ignore dummy blank extra column, e.g.
116 #"...2.0\t\tPSORTb version 3.0\t\n" 117 # "...2.0\t\tPSORTb version 3.0\t\n"
117 parts = parts[:-1] 118 parts = parts[:-1]
118 assert len(parts) == len(header), \ 119 assert len(parts) == len(header), \
119 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) 120 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
120 out_handle.write(line) 121 out_handle.write(line)
121 count += 1 122 count += 1
122 return count 123 return count
123 124
124 #Note that if the input FASTA file contains no sequences, 125 # Note that if the input FASTA file contains no sequences,
125 #split_fasta returns an empty list (i.e. zero temp files). 126 # split_fasta returns an empty list (i.e. zero temp files).
126 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) 127 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
127 temp_files = [f+".out" for f in fasta_files] 128 temp_files = [f + ".out" for f in fasta_files]
128 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) 129 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
129 for fasta, temp in zip(fasta_files, temp_files)] 130 for fasta, temp in zip(fasta_files, temp_files)]
131
130 132
131 def clean_up(file_list): 133 def clean_up(file_list):
132 for f in file_list: 134 for f in file_list:
133 if os.path.isfile(f): 135 if os.path.isfile(f):
134 os.remove(f) 136 os.remove(f)
135 try: 137 try:
136 os.rmdir(tmp_dir) 138 os.rmdir(tmp_dir)
137 except: 139 except Exception:
138 pass 140 pass
139 141
140 if len(jobs) > 1 and num_threads > 1: 142 if len(jobs) > 1 and num_threads > 1:
141 #A small "info" message for Galaxy to show the user. 143 # A small "info" message for Galaxy to show the user.
142 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 144 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
143 results = run_jobs(jobs, num_threads) 145 results = run_jobs(jobs, num_threads)
144 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 146 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
145 error_level = results[cmd] 147 error_level = results[cmd]
146 if error_level: 148 if error_level:
147 try: 149 try:
148 output = open(temp).readline() 150 output = open(temp).readline()
149 except IOError: 151 except IOError:
150 output = "" 152 output = ""
151 clean_up(fasta_files + temp_files) 153 clean_up(fasta_files + temp_files)
152 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 154 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
153 error_level) 155 error_level)
154 del results 156 del results
155 del jobs 157 del jobs
156 158
157 out_handle = open(tabular_file, "w") 159 out_handle = open(tabular_file, "w")
161 data_handle = open(temp) 163 data_handle = open(temp)
162 count += clean_tabular(data_handle, out_handle) 164 count += clean_tabular(data_handle, out_handle)
163 data_handle.close() 165 data_handle.close()
164 if not count: 166 if not count:
165 clean_up(fasta_files + temp_files) 167 clean_up(fasta_files + temp_files)
166 sys_exit("No output from psortb") 168 sys.exit("No output from psortb")
167 out_handle.close() 169 out_handle.close()
168 print "%i records" % count 170 print "%i records" % count
169 171
170 clean_up(fasta_files + temp_files) 172 clean_up(fasta_files + temp_files)