Mercurial > repos > peterjc > tmhmm_and_signalp
comparison tools/protein_analysis/psortb.py @ 19:f3ecd80850e2 draft
v0.2.9 Python style improvements
author | peterjc |
---|---|
date | Wed, 01 Feb 2017 09:46:42 -0500 |
parents | eb6ac44d4b8e |
children | a19b3ded8f33 |
comparison
equal
deleted
inserted
replaced
18:eb6ac44d4b8e | 19:f3ecd80850e2 |
---|---|
22 with a # character as used elsewhere in Galaxy. | 22 with a # character as used elsewhere in Galaxy. |
23 """ | 23 """ |
24 import sys | 24 import sys |
25 import os | 25 import os |
26 import tempfile | 26 import tempfile |
27 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count | 27 from seq_analysis_utils import split_fasta, run_jobs, thread_count |
28 | 28 |
29 FASTA_CHUNK = 500 | 29 FASTA_CHUNK = 500 |
30 | 30 |
31 if "-v" in sys.argv or "--version" in sys.argv: | 31 if "-v" in sys.argv or "--version" in sys.argv: |
32 """Return underlying PSORTb's version""" | 32 """Return underlying PSORTb's version""" |
33 sys.exit(os.system("psort --version")) | 33 sys.exit(os.system("psort --version")) |
34 | 34 |
35 if len(sys.argv) != 8: | 35 if len(sys.argv) != 8: |
36 sys_exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " | 36 sys.exit("Require 7 arguments, number of threads (int), type (e.g. archaea), " |
37 "output (e.g. terse/normal/long), cutoff, divergent, input protein " | 37 "output (e.g. terse/normal/long), cutoff, divergent, input protein " |
38 "FASTA file & output tabular file") | 38 "FASTA file & output tabular file") |
39 | 39 |
40 num_threads = thread_count(sys.argv[1], default=4) | 40 num_threads = thread_count(sys.argv[1], default=4) |
41 org_type = sys.argv[2] | 41 org_type = sys.argv[2] |
54 tabular_file = sys.argv[7] | 54 tabular_file = sys.argv[7] |
55 | 55 |
56 if out_type == "terse": | 56 if out_type == "terse": |
57 header = ['SeqID', 'Localization', 'Score'] | 57 header = ['SeqID', 'Localization', 'Score'] |
58 elif out_type == "normal": | 58 elif out_type == "normal": |
59 sys_exit("Normal output not implemented yet, sorry.") | 59 sys.exit("Normal output not implemented yet, sorry.") |
60 elif out_type == "long": | 60 elif out_type == "long": |
61 if org_type == "-n": | 61 if org_type == "-n": |
62 #Gram negative bacteria | 62 # Gram negative bacteria |
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', | 63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', |
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', | 64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', |
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', | 65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', |
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', | 66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', |
67 'Profile-_Localization', 'Profile-_Details', | 67 'Profile-_Localization', 'Profile-_Details', |
69 'Signal-_Localization', 'Signal-_Details', | 69 'Signal-_Localization', 'Signal-_Details', |
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', | 70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', |
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', |
72 'Secondary_Localization', 'PSortb_Version'] | 72 'Secondary_Localization', 'PSortb_Version'] |
73 elif org_type == "-p": | 73 elif org_type == "-p": |
74 #Gram positive bacteria | 74 # Gram positive bacteria |
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', | 75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', |
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', | 76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', |
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', | 77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', |
78 'Profile+_Localization', 'Profile+_Details', | 78 'Profile+_Localization', 'Profile+_Details', |
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', | 79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', |
80 'Signal+_Localization', 'Signal+_Details', | 80 'Signal+_Localization', 'Signal+_Details', |
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | 81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', |
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', |
83 'Secondary_Localization', 'PSortb_Version'] | 83 'Secondary_Localization', 'PSortb_Version'] |
84 elif org_type == "-a": | 84 elif org_type == "-a": |
85 #Archaea | 85 # Archaea |
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', | 86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', |
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', | 87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', |
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', | 88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', |
89 'Profile_a_Localization', 'Profile_a_Details', | 89 'Profile_a_Localization', 'Profile_a_Details', |
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', | 90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', |
91 'Signal_a_Localization', 'Signal_a_Details', | 91 'Signal_a_Localization', 'Signal_a_Details', |
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', | 92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', |
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', | 93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', |
94 'Secondary_Localization', 'PSortb_Version'] | 94 'Secondary_Localization', 'PSortb_Version'] |
95 else: | 95 else: |
96 sys_exit("Expected -n, -p or -a for the organism type, not %r" % org_type) | 96 sys.exit("Expected -n, -p or -a for the organism type, not %r" % org_type) |
97 else: | 97 else: |
98 sys_exit("Expected terse, normal or long for the output type, not %r" % out_type) | 98 sys.exit("Expected terse, normal or long for the output type, not %r" % out_type) |
99 | 99 |
100 tmp_dir = tempfile.mkdtemp() | 100 tmp_dir = tempfile.mkdtemp() |
101 | |
101 | 102 |
102 def clean_tabular(raw_handle, out_handle): | 103 def clean_tabular(raw_handle, out_handle): |
103 """Clean up tabular TMHMM output, returns output line count.""" | 104 """Clean up tabular TMHMM output, returns output line count.""" |
104 global header | 105 global header |
105 count = 0 | 106 count = 0 |
106 for line in raw_handle: | 107 for line in raw_handle: |
107 if not line.strip() or line.startswith("#"): | 108 if not line.strip() or line.startswith("#"): |
108 #Ignore any blank lines or comment lines | 109 # Ignore any blank lines or comment lines |
109 continue | 110 continue |
110 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] | 111 parts = [x.strip() for x in line.rstrip("\r\n").split("\t")] |
111 if parts == header: | 112 if parts == header: |
112 #Ignore the header line | 113 # Ignore the header line |
113 continue | 114 continue |
114 if not parts[-1] and len(parts) == len(header) + 1: | 115 if not parts[-1] and len(parts) == len(header) + 1: |
115 #Ignore dummy blank extra column, e.g. | 116 # Ignore dummy blank extra column, e.g. |
116 #"...2.0\t\tPSORTb version 3.0\t\n" | 117 # "...2.0\t\tPSORTb version 3.0\t\n" |
117 parts = parts[:-1] | 118 parts = parts[:-1] |
118 assert len(parts) == len(header), \ | 119 assert len(parts) == len(header), \ |
119 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) | 120 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) |
120 out_handle.write(line) | 121 out_handle.write(line) |
121 count += 1 | 122 count += 1 |
122 return count | 123 return count |
123 | 124 |
124 #Note that if the input FASTA file contains no sequences, | 125 # Note that if the input FASTA file contains no sequences, |
125 #split_fasta returns an empty list (i.e. zero temp files). | 126 # split_fasta returns an empty list (i.e. zero temp files). |
126 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) | 127 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) |
127 temp_files = [f+".out" for f in fasta_files] | 128 temp_files = [f + ".out" for f in fasta_files] |
128 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) | 129 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) |
129 for fasta, temp in zip(fasta_files, temp_files)] | 130 for fasta, temp in zip(fasta_files, temp_files)] |
131 | |
130 | 132 |
131 def clean_up(file_list): | 133 def clean_up(file_list): |
132 for f in file_list: | 134 for f in file_list: |
133 if os.path.isfile(f): | 135 if os.path.isfile(f): |
134 os.remove(f) | 136 os.remove(f) |
135 try: | 137 try: |
136 os.rmdir(tmp_dir) | 138 os.rmdir(tmp_dir) |
137 except: | 139 except Exception: |
138 pass | 140 pass |
139 | 141 |
140 if len(jobs) > 1 and num_threads > 1: | 142 if len(jobs) > 1 and num_threads > 1: |
141 #A small "info" message for Galaxy to show the user. | 143 # A small "info" message for Galaxy to show the user. |
142 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) | 144 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) |
143 results = run_jobs(jobs, num_threads) | 145 results = run_jobs(jobs, num_threads) |
144 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): | 146 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): |
145 error_level = results[cmd] | 147 error_level = results[cmd] |
146 if error_level: | 148 if error_level: |
147 try: | 149 try: |
148 output = open(temp).readline() | 150 output = open(temp).readline() |
149 except IOError: | 151 except IOError: |
150 output = "" | 152 output = "" |
151 clean_up(fasta_files + temp_files) | 153 clean_up(fasta_files + temp_files) |
152 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), | 154 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), |
153 error_level) | 155 error_level) |
154 del results | 156 del results |
155 del jobs | 157 del jobs |
156 | 158 |
157 out_handle = open(tabular_file, "w") | 159 out_handle = open(tabular_file, "w") |
161 data_handle = open(temp) | 163 data_handle = open(temp) |
162 count += clean_tabular(data_handle, out_handle) | 164 count += clean_tabular(data_handle, out_handle) |
163 data_handle.close() | 165 data_handle.close() |
164 if not count: | 166 if not count: |
165 clean_up(fasta_files + temp_files) | 167 clean_up(fasta_files + temp_files) |
166 sys_exit("No output from psortb") | 168 sys.exit("No output from psortb") |
167 out_handle.close() | 169 out_handle.close() |
168 print "%i records" % count | 170 print "%i records" % count |
169 | 171 |
170 clean_up(fasta_files + temp_files) | 172 clean_up(fasta_files + temp_files) |