comparison tools/protein_analysis/psortb.py @ 20:a19b3ded8f33 draft

v0.2.11 Job splitting fast-fail; RXLR tool supports HMMER2 from BioConda; captures more version information; misc internal changes
author peterjc
date Thu, 21 Sep 2017 11:35:20 -0400
parents f3ecd80850e2
children 238eae32483c
comparison
equal deleted inserted replaced
19:f3ecd80850e2 20:a19b3ded8f33
19 itself (see the SignalP XML file for settings), but both can be applied. 19 itself (see the SignalP XML file for settings), but both can be applied.
20 20
21 Additionally it ensures the header line (with the column names) starts 21 Additionally it ensures the header line (with the column names) starts
22 with a # character as used elsewhere in Galaxy. 22 with a # character as used elsewhere in Galaxy.
23 """ 23 """
24
25 from __future__ import print_function
26
27 import os
24 import sys 28 import sys
25 import os
26 import tempfile 29 import tempfile
27 from seq_analysis_utils import split_fasta, run_jobs, thread_count 30
31 from seq_analysis_utils import run_jobs, split_fasta, thread_count
28 32
29 FASTA_CHUNK = 500 33 FASTA_CHUNK = 500
30 34
31 if "-v" in sys.argv or "--version" in sys.argv: 35 if "-v" in sys.argv or "--version" in sys.argv:
32 """Return underlying PSORTb's version""" 36 """Return underlying PSORTb's version"""
63 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details', 67 header = ['SeqID', 'CMSVM-_Localization', 'CMSVM-_Details', 'CytoSVM-_Localization', 'CytoSVM-_Details',
64 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details', 68 'ECSVM-_Localization', 'ECSVM-_Details', 'ModHMM-_Localization', 'ModHMM-_Details',
65 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details', 69 'Motif-_Localization', 'Motif-_Details', 'OMPMotif-_Localization', 'OMPMotif-_Details',
66 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details', 70 'OMSVM-_Localization', 'OMSVM-_Details', 'PPSVM-_Localization', 'PPSVM-_Details',
67 'Profile-_Localization', 'Profile-_Details', 71 'Profile-_Localization', 'Profile-_Details',
68 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details', 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details', 72 'SCL-BLAST-_Localization', 'SCL-BLAST-_Details',
73 'SCL-BLASTe-_Localization', 'SCL-BLASTe-_Details',
69 'Signal-_Localization', 'Signal-_Details', 74 'Signal-_Localization', 'Signal-_Details',
70 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score', 75 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Periplasmic_Score', 'OuterMembrane_Score',
71 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 76 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
72 'Secondary_Localization', 'PSortb_Version'] 77 'Secondary_Localization', 'PSortb_Version']
73 elif org_type == "-p": 78 elif org_type == "-p":
74 # Gram positive bacteria 79 # Gram positive bacteria
75 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details', 80 header = ['SeqID', 'CMSVM+_Localization', 'CMSVM+_Details', 'CWSVM+_Localization', 'CWSVM+_Details',
76 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details', 81 'CytoSVM+_Localization', 'CytoSVM+_Details', 'ECSVM+_Localization', 'ECSVM+_Details',
77 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details', 82 'ModHMM+_Localization', 'ModHMM+_Details', 'Motif+_Localization', 'Motif+_Details',
78 'Profile+_Localization', 'Profile+_Details', 83 'Profile+_Localization', 'Profile+_Details',
79 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details', 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details', 84 'SCL-BLAST+_Localization', 'SCL-BLAST+_Details',
85 'SCL-BLASTe+_Localization', 'SCL-BLASTe+_Details',
80 'Signal+_Localization', 'Signal+_Details', 86 'Signal+_Localization', 'Signal+_Details',
81 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 87 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
82 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 88 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
83 'Secondary_Localization', 'PSortb_Version'] 89 'Secondary_Localization', 'PSortb_Version']
84 elif org_type == "-a": 90 elif org_type == "-a":
85 # Archaea 91 # Archaea
86 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details', 92 header = ['SeqID', 'CMSVM_a_Localization', 'CMSVM_a_Details', 'CWSVM_a_Localization', 'CWSVM_a_Details',
87 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details', 93 'CytoSVM_a_Localization', 'CytoSVM_a_Details', 'ECSVM_a_Localization', 'ECSVM_a_Details',
88 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details', 94 'ModHMM_a_Localization', 'ModHMM_a_Details', 'Motif_a_Localization', 'Motif_a_Details',
89 'Profile_a_Localization', 'Profile_a_Details', 95 'Profile_a_Localization', 'Profile_a_Details',
90 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details', 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details', 96 'SCL-BLAST_a_Localization', 'SCL-BLAST_a_Details',
97 'SCL-BLASTe_a_Localization', 'SCL-BLASTe_a_Details',
91 'Signal_a_Localization', 'Signal_a_Details', 98 'Signal_a_Localization', 'Signal_a_Details',
92 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score', 99 'Cytoplasmic_Score', 'CytoplasmicMembrane_Score', 'Cellwall_Score',
93 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score', 100 'Extracellular_Score', 'Final_Localization', 'Final_Localization_Details', 'Final_Score',
94 'Secondary_Localization', 'PSortb_Version'] 101 'Secondary_Localization', 'PSortb_Version']
95 else: 102 else:
120 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line) 127 "%i fields, not %i, in line:\n%r" % (len(line), len(header), line)
121 out_handle.write(line) 128 out_handle.write(line)
122 count += 1 129 count += 1
123 return count 130 return count
124 131
132
125 # Note that if the input FASTA file contains no sequences, 133 # Note that if the input FASTA file contains no sequences,
126 # split_fasta returns an empty list (i.e. zero temp files). 134 # split_fasta returns an empty list (i.e. zero temp files).
127 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK) 135 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "tmhmm"), FASTA_CHUNK)
128 temp_files = [f + ".out" for f in fasta_files] 136 temp_files = [f + ".out" for f in fasta_files]
129 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp) 137 jobs = ["psort %s %s %s -o %s %s > %s" % (org_type, cutoff, divergent, out_type, fasta, temp)
137 try: 145 try:
138 os.rmdir(tmp_dir) 146 os.rmdir(tmp_dir)
139 except Exception: 147 except Exception:
140 pass 148 pass
141 149
150
142 if len(jobs) > 1 and num_threads > 1: 151 if len(jobs) > 1 and num_threads > 1:
143 # A small "info" message for Galaxy to show the user. 152 # A small "info" message for Galaxy to show the user.
144 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 153 print("Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)))
145 results = run_jobs(jobs, num_threads) 154 results = run_jobs(jobs, num_threads)
146 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs): 155 for fasta, temp, cmd in zip(fasta_files, temp_files, jobs):
147 error_level = results[cmd] 156 error_level = results[cmd]
148 if error_level: 157 if error_level:
149 try: 158 try:
165 data_handle.close() 174 data_handle.close()
166 if not count: 175 if not count:
167 clean_up(fasta_files + temp_files) 176 clean_up(fasta_files + temp_files)
168 sys.exit("No output from psortb") 177 sys.exit("No output from psortb")
169 out_handle.close() 178 out_handle.close()
170 print "%i records" % count 179 print("%i records" % count)
171 180
172 clean_up(fasta_files + temp_files) 181 clean_up(fasta_files + temp_files)