comparison tools/protein_analysis/promoter2.py @ 19:f3ecd80850e2 draft

v0.2.9 Python style improvements
author peterjc
date Wed, 01 Feb 2017 09:46:42 -0500
parents eb6ac44d4b8e
children a19b3ded8f33
comparison
equal deleted inserted replaced
18:eb6ac44d4b8e 19:f3ecd80850e2
28 """ 28 """
29 import sys 29 import sys
30 import os 30 import os
31 import commands 31 import commands
32 import tempfile 32 import tempfile
33 from seq_analysis_utils import sys_exit, split_fasta, run_jobs, thread_count 33 from seq_analysis_utils import split_fasta, run_jobs, thread_count
34 34
35 FASTA_CHUNK = 500 35 FASTA_CHUNK = 500
36 36
37 if "-v" in sys.argv or "--version" in sys.argv: 37 if "-v" in sys.argv or "--version" in sys.argv:
38 sys.exit(os.system("promoter -V")) 38 sys.exit(os.system("promoter -V"))
39 39
40 if len(sys.argv) != 4: 40 if len(sys.argv) != 4:
41 sys_exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. " 41 sys.exit("Require three arguments, number of threads (int), input DNA FASTA file & output tabular file. "
42 "Got %i arguments." % (len(sys.argv)-1)) 42 "Got %i arguments." % (len(sys.argv) - 1))
43 43
44 num_threads = thread_count(sys.argv[3],default=4) 44 num_threads = thread_count(sys.argv[3], default=4)
45 fasta_file = os.path.abspath(sys.argv[2]) 45 fasta_file = os.path.abspath(sys.argv[2])
46 tabular_file = os.path.abspath(sys.argv[3]) 46 tabular_file = os.path.abspath(sys.argv[3])
47 47
48 tmp_dir = tempfile.mkdtemp() 48 tmp_dir = tempfile.mkdtemp()
49 49
50
50 def get_path_and_binary(): 51 def get_path_and_binary():
51 platform = commands.getoutput("uname") #e.g. Linux 52 platform = commands.getoutput("uname") # e.g. Linux
52 shell_script = commands.getoutput("which promoter") 53 shell_script = commands.getoutput("which promoter")
53 if not os.path.isfile(shell_script): 54 if not os.path.isfile(shell_script):
54 sys_exit("ERROR: Missing promoter executable shell script") 55 sys.exit("ERROR: Missing promoter executable shell script")
55 path = None 56 path = None
56 for line in open(shell_script): 57 for line in open(shell_script):
57 if line.startswith("setenv"): #could then be tab or space! 58 if line.startswith("setenv"): # could then be tab or space!
58 parts = line.rstrip().split(None, 2) 59 parts = line.rstrip().split(None, 2)
59 if parts[0] == "setenv" and parts[1] == "PROM": 60 if parts[0] == "setenv" and parts[1] == "PROM":
60 path = parts[2] 61 path = parts[2]
61 if not path: 62 if not path:
62 sys_exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script) 63 sys.exit("ERROR: Could not find promoter path (PROM) in %r" % shell_script)
63 if not os.path.isdir(path): 64 if not os.path.isdir(path):
64 sys_exit("ERROR: %r is not a directory" % path) 65 sys.exit("ERROR: %r is not a directory" % path)
65 bin = "%s/bin/promoter_%s" % (path, platform) 66 bin = "%s/bin/promoter_%s" % (path, platform)
66 if not os.path.isfile(bin): 67 if not os.path.isfile(bin):
67 sys_exit("ERROR: Missing promoter binary %r" % bin) 68 sys.exit("ERROR: Missing promoter binary %r" % bin)
68 return path, bin 69 return path, bin
70
69 71
70 def make_tabular(raw_handle, out_handle): 72 def make_tabular(raw_handle, out_handle):
71 """Parse text output into tabular, return query count.""" 73 """Parse text output into tabular, return query count."""
72 identifier = None 74 identifier = None
73 queries = 0 75 queries = 0
74 for line in raw_handle: 76 for line in raw_handle:
75 #print repr(line) 77 # print repr(line)
76 if not line.strip() or line == "Promoter prediction:\n": 78 if not line.strip() or line == "Promoter prediction:\n":
77 pass 79 pass
78 elif line[0] != " ": 80 elif line[0] != " ":
79 identifier = line.strip().replace("\t", " ").split(None,1)[0] 81 identifier = line.strip().replace("\t", " ").split(None, 1)[0]
80 queries += 1 82 queries += 1
81 elif line == " No promoter predicted\n": 83 elif line == " No promoter predicted\n":
82 #End of a record 84 # End of a record
83 identifier = None 85 identifier = None
84 elif line == " Position Score Likelihood\n": 86 elif line == " Position Score Likelihood\n":
85 assert identifier 87 assert identifier
86 else: 88 else:
87 try: 89 try:
88 position, score, likelihood = line.strip().split(None,2) 90 position, score, likelihood = line.strip().split(None, 2)
89 except ValueError: 91 except ValueError:
90 print "WARNING: Problem with line: %r" % line 92 print "WARNING: Problem with line: %r" % line
91 continue 93 continue
92 #sys_exit("ERROR: Problem with line: %r" % line) 94 # sys.exit("ERROR: Problem with line: %r" % line)
93 if likelihood not in ["ignored", 95 if likelihood not in ["ignored",
94 "Marginal prediction", 96 "Marginal prediction",
95 "Medium likely prediction", 97 "Medium likely prediction",
96 "Highly likely prediction"]: 98 "Highly likely prediction"]:
97 sys_exit("ERROR: Problem with line: %r" % line) 99 sys.exit("ERROR: Problem with line: %r" % line)
98 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood)) 100 out_handle.write("%s\t%s\t%s\t%s\n" % (identifier, position, score, likelihood))
99 return queries 101 return queries
100 102
101 working_dir, bin = get_path_and_binary() 103 working_dir, bin = get_path_and_binary()
102 104
103 if not os.path.isfile(fasta_file): 105 if not os.path.isfile(fasta_file):
104 sys_exit("ERROR: Missing input FASTA file %r" % fasta_file) 106 sys.exit("ERROR: Missing input FASTA file %r" % fasta_file)
105 107
106 #Note that if the input FASTA file contains no sequences, 108 # Note that if the input FASTA file contains no sequences,
107 #split_fasta returns an empty list (i.e. zero temp files). 109 # split_fasta returns an empty list (i.e. zero temp files).
108 #We deliberately omit the FASTA descriptions to avoid a 110 # We deliberately omit the FASTA descriptions to avoid a
109 #bug in promoter2 with descriptions over 200 characters. 111 # bug in promoter2 with descriptions over 200 characters.
110 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False) 112 fasta_files = split_fasta(fasta_file, os.path.join(tmp_dir, "promoter"), FASTA_CHUNK, keep_descr=False)
111 temp_files = [f+".out" for f in fasta_files] 113 temp_files = [f + ".out" for f in fasta_files]
112 jobs = ["%s %s > %s" % (bin, fasta, temp) 114 jobs = ["%s %s > %s" % (bin, fasta, temp)
113 for fasta, temp in zip(fasta_files, temp_files)] 115 for fasta, temp in zip(fasta_files, temp_files)]
116
114 117
115 def clean_up(file_list): 118 def clean_up(file_list):
116 for f in file_list: 119 for f in file_list:
117 if os.path.isfile(f): 120 if os.path.isfile(f):
118 os.remove(f) 121 os.remove(f)
119 try: 122 try:
120 os.rmdir(tmp_dir) 123 os.rmdir(tmp_dir)
121 except: 124 except Exception:
122 pass 125 pass
123 126
124 if len(jobs) > 1 and num_threads > 1: 127 if len(jobs) > 1 and num_threads > 1:
125 #A small "info" message for Galaxy to show the user. 128 # A small "info" message for Galaxy to show the user.
126 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs)) 129 print "Using %i threads for %i tasks" % (min(num_threads, len(jobs)), len(jobs))
127 cur_dir = os.path.abspath(os.curdir) 130 cur_dir = os.path.abspath(os.curdir)
128 os.chdir(working_dir) 131 os.chdir(working_dir)
129 results = run_jobs(jobs, num_threads) 132 results = run_jobs(jobs, num_threads)
130 os.chdir(cur_dir) 133 os.chdir(cur_dir)
134 try: 137 try:
135 output = open(temp).readline() 138 output = open(temp).readline()
136 except IOError: 139 except IOError:
137 output = "" 140 output = ""
138 clean_up(fasta_files + temp_files) 141 clean_up(fasta_files + temp_files)
139 sys_exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output), 142 sys.exit("One or more tasks failed, e.g. %i from %r gave:\n%s" % (error_level, cmd, output),
140 error_level) 143 error_level)
141 144
142 del results 145 del results
143 del jobs 146 del jobs
144 147
149 data_handle = open(temp) 152 data_handle = open(temp)
150 count = make_tabular(data_handle, out_handle) 153 count = make_tabular(data_handle, out_handle)
151 data_handle.close() 154 data_handle.close()
152 if not count: 155 if not count:
153 clean_up(fasta_files + temp_files) 156 clean_up(fasta_files + temp_files)
154 sys_exit("No output from promoter2") 157 sys.exit("No output from promoter2")
155 queries += count 158 queries += count
156 out_handle.close() 159 out_handle.close()
157 160
158 clean_up(fasta_files + temp_files) 161 clean_up(fasta_files + temp_files)
159 print "Results for %i queries" % queries 162 print "Results for %i queries" % queries