comparison msconvert_wrapper.py @ 7:37e5502441cc draft

Uploaded
author galaxyp
date Mon, 17 Feb 2014 17:00:29 -0500
parents
children d2c61663e33c
comparison
equal deleted inserted replaced
6:ffe1dca94595 7:37e5502441cc
1 #!/usr/bin/env python
2 import optparse
3 import os
4 import sys
5 import tempfile
6 import shutil
7 import subprocess
8 import re
9 import logging
10
# Refuse to run on interpreters older than Python 2.6.
assert sys.version_info[:2] >= (2, 6)

log = logging.getLogger(__name__)

# Everything the wrapper creates lives in the directory it was started from.
working_directory = os.getcwd()

# Paths used to capture the child process's stderr and stdout.  Only the
# generated *name* is kept: each NamedTemporaryFile object is discarded right
# away, and with the default delete-on-close behaviour its file goes with it.
tmp_stderr_name, tmp_stdout_name = (
    tempfile.NamedTemporaryFile(dir=working_directory, suffix=extension).name
    for extension in ('.stderr', '.stdout')
)
17
18
def stop_err(msg):
    """Report a fatal error and terminate the process.

    Args:
        msg: Text to write to standard error; a newline is appended.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    sys.stderr.write("%s\n" % msg)
    # Exit non-zero so that callers can detect the failure from the exit
    # status and not only from the presence of stderr output.
    sys.exit(1)
22
23
def read_stderr():
    """Return what the last executed command wrote to its stderr capture file.

    Returns:
        The contents of ``tmp_stderr_name`` as a ``str``; the empty string
        when that file does not exist.
    """
    if not os.path.exists(tmp_stderr_name):
        return ''
    buffsize = 1048576
    chunks = []
    with open(tmp_stderr_name, 'rb') as tmp_stderr:
        try:
            while True:
                chunk = tmp_stderr.read(buffsize)
                # End-of-file is an empty read.  Testing the accumulated
                # length against the buffer size instead would spin forever
                # on a file whose size is an exact multiple of ``buffsize``.
                if not chunk:
                    break
                chunks.append(chunk)
        except OverflowError:
            # Best effort: hand back whatever was read before the failure.
            pass
    stderr = b''.join(chunks)
    if not isinstance(stderr, str):
        # Python 3: the file was opened in binary mode, so decode for the
        # callers that interpolate the result into a message.
        stderr = stderr.decode('utf-8', 'replace')
    return stderr
37
38
def execute(command, stdin=None):
    """Run ``command`` through the shell, capturing its stderr and stdout.

    The child's stderr and stdout are redirected to ``tmp_stderr_name`` and
    ``tmp_stdout_name``.  Whether or not the command succeeds, the captured
    stderr and then the captured stdout are echoed to this process's stdout.

    Args:
        command: Shell command line (executed with ``shell=True``).
        stdin: Optional value passed through as the child's ``stdin``.

    Raises:
        Exception: If the command exits with a non-zero status; the message
            includes the status and the captured stderr.
    """
    try:
        with open(tmp_stderr_name, 'wb') as tmp_stderr:
            with open(tmp_stdout_name, 'wb') as tmp_stdout:
                proc = subprocess.Popen(args=command, shell=True, stderr=tmp_stderr.fileno(), stdout=tmp_stdout.fileno(), stdin=stdin, env=os.environ)
                returncode = proc.wait()
        if returncode != 0:
            raise Exception("Program returned with non-zero exit code %d. stderr: %s" % (returncode, read_stderr()))
    finally:
        for captured_path in (tmp_stderr_name, tmp_stdout_name):
            # If opening a capture file failed above there is nothing to
            # echo; skipping keeps the original error from being masked.
            if not os.path.exists(captured_path):
                continue
            with open(captured_path, "r") as captured:
                sys.stdout.write(captured.read() + "\n")
50
51
def delete_file(path):
    """Remove the file at ``path`` if it exists, ignoring removal failures.

    Args:
        path: Path of the file to remove.
    """
    if os.path.exists(path):
        try:
            os.remove(path)
        except OSError:
            # Deliberately best effort.  Only filesystem errors are ignored,
            # so KeyboardInterrupt and SystemExit still propagate.
            pass
58
59
def delete_directory(directory):
    """Recursively remove ``directory`` if it exists, ignoring failures.

    Args:
        directory: Path of the directory tree to remove.
    """
    if os.path.exists(directory):
        try:
            shutil.rmtree(directory)
        except EnvironmentError:
            # Deliberately best effort.  Only I/O and OS errors are ignored,
            # so KeyboardInterrupt and SystemExit still propagate.
            pass
66
67
def symlink(source, link_name):
    """Make ``link_name`` refer to ``source``, copying when linking fails.

    On Windows a symbolic link is attempted through pywin32's ``win32file``
    and, if that raises (including when pywin32 is not installed), the file
    is copied instead.  On every other platform ``os.symlink`` is used.

    Args:
        source: Path of the existing file.
        link_name: Path at which the link (or copy) is created.
    """
    import platform
    if platform.system() != 'Windows':
        os.symlink(source, link_name)
        return
    try:
        import win32file
        # NOTE(review): pywin32 documents CreateSymbolicLink as
        # (SymlinkFileName, TargetFileName, Flags) with flag 1 meaning
        # "directory"; the argument order and flag used here look suspect --
        # confirm against the pywin32 reference before relying on the link
        # path rather than the copy fallback.
        win32file.CreateSymbolicLink(source, link_name, 1)
    except Exception:
        # Any linking failure falls back to a plain copy; interpreter-exit
        # signals (KeyboardInterrupt, SystemExit) are no longer swallowed.
        shutil.copy(source, link_name)
78
79
def copy_to_working_directory(data_file, relative_path):
    """Expose ``data_file`` under ``relative_path`` and return that path.

    Nothing is created when both arguments already resolve to the same
    absolute path; otherwise ``symlink`` is asked to link (or copy) the file.
    """
    already_in_place = os.path.abspath(relative_path) == os.path.abspath(data_file)
    if not already_in_place:
        symlink(data_file, relative_path)
    return relative_path
84
85
def __main__():
    """Script entry point; all of the work is done by ``run_script``."""
    run_script()
88
89 #ENDTEMPLATE
90
# Values accepted by the --toextension option.
to_extensions = [
    'mzML',
    'mzXML',
    'unindexed_mzML',
    'unindexed_mzXML',
    'mgf',
    'txt',
    'ms2',
    'cms2',
]
92
93
def str_to_bool(v):
    """Interpret a string as a boolean, ignoring case.

    "yes", "true", "t" and "1" are true; every other string is false.
    From http://stackoverflow.com/questions/715417/converting-from-a-string-to-boolean-in-python
    """
    truthy_spellings = ("yes", "true", "t", "1")
    lowered = v.lower()
    return lowered in truthy_spellings
97
98
99 def _add_filter(filters_file, contents):
100 filters_file.write("filter=%s\n" % contents)
101
102
103 def _skip_line(options, file_num, line_parts):
104 file_num_column = options.filter_table_file_column
105 if not file_num_column:
106 return False
107 else:
108 target_file_num_val = str(file_num).strip()
109 query_file_num_val = line_parts[int(file_num_column) - 1].strip()
110 #print "target %s, query %s" % (target_file_num_val, query_file_num_val)
111 return target_file_num_val != query_file_num_val
112
113
114 def _read_table_numbers(path, options, file_num=None):
115 unique_numbers = set([])
116 column_num = options.filter_table_column
117 input = open(path, "r")
118 first_line = True
119 for line in input:
120 if not line:
121 continue
122 line = line.strip()
123 if line.startswith("#"):
124 first_line = False
125 continue
126 if column_num == None:
127 column = line
128 else:
129 line_parts = line.split("\t")
130 if _skip_line(options, file_num, line_parts):
131 continue
132 column = line_parts[int(column_num) - 1]
133 match = re.match("\d+", column)
134 if match:
135 unique_numbers.add(int(match.group()))
136 first_line = False
137 return unique_numbers
138
139
140 def shellquote(s):
141 return '"' + s.replace('"', '\\"') + '"'
142
143
144 def _add_filter_line_from_file(filter_file, options, file_num=None):
145 file = options.filter_table
146 if not file:
147 return
148 numbers = _read_table_numbers(file, options, file_num)
149 msconvert_int_set = " ".join([str(number) for number in numbers])
150 filter_type = options.filter_table_type
151 if filter_type == 'number':
152 filter_prefix = 'scanNumber'
153 else:
154 filter_prefix = 'index'
155 _add_filter(filter_file, "%s %s" % (filter_prefix, msconvert_int_set))
156
157
158 def _create_filters_file(options, file_num=None, debug=False):
159 suffix = "" if not file_num else str(file_num)
160 filters_file_path = "filters%s" % suffix
161 filters_file = open(filters_file_path, "w")
162 if options.filters_file:
163 filters_file.write(open(options.filters_file, "r").read())
164 for filter in options.filter:
165 _add_filter(filters_file, filter)
166 _add_filter_line_from_file(filters_file, options, file_num=file_num)
167
168 filters_file.close()
169 if debug:
170 print open(filters_file_path, "r").read()
171 return filters_file_path
172
173
174 def _build_base_cmd(options):
175 to_extension = options.toextension
176 if to_extension.startswith("unindexed_"):
177 to_extension = to_extension[len("unindexed_"):]
178 to_params = "--noindex"
179 else:
180 to_params = ""
181 cmd = "msconvert --%s %s" % (to_extension, to_params)
182 if str_to_bool(options.zlib):
183 cmd = "%s %s" % (cmd, "--zlib")
184 if options.binaryencoding:
185 cmd = "%s --%s" % (cmd, options.binaryencoding)
186 if options.mzencoding:
187 cmd = "%s --mz%s" % (cmd, options.mzencoding)
188 if options.intensityencoding:
189 cmd = "%s --inten%s" % (cmd, options.intensityencoding)
190 return cmd
191
192
193 def _run(base_cmd, output_dir='output', inputs=[], debug=False):
194 inputs_as_str = " ".join(['%s' % shellquote(input) for input in inputs])
195 os.mkdir(output_dir)
196 cmd = "%s -o %s %s" % (base_cmd, shellquote(output_dir), inputs_as_str)
197 if debug:
198 print cmd
199 execute(cmd)
200 output_files = os.listdir(output_dir)
201 assert len(output_files) == 1
202 output_file = output_files[0]
203 return os.path.join(output_dir, output_file)
204
205
206 def run_script():
207 parser = optparse.OptionParser()
208 parser.add_option('--input', dest='inputs', action='append', default=[])
209 parser.add_option('--input_name', dest='input_names', action='append', default=[])
210 parser.add_option('--output', dest='output')
211 parser.add_option('--fromextension', dest='fromextension')
212 parser.add_option('--toextension', dest='toextension', default='mzML', choices=to_extensions)
213 parser.add_option('--binaryencoding', dest='binaryencoding', choices=['32', '64'])
214 parser.add_option('--mzencoding', dest='mzencoding', choices=['32', '64'])
215 parser.add_option('--intensityencoding', dest='intensityencoding', choices=['32', '64'])
216 parser.add_option('--zlib', dest='zlib', default="false")
217 parser.add_option('--filter', dest='filter', action='append', default=[])
218 parser.add_option('--filters_file', dest='filters_file', default=None)
219 parser.add_option('--filter_table', default=None)
220 parser.add_option('--filter_table_type', default='index', choices=['index', 'number'])
221 parser.add_option('--filter_table_column', default=None)
222 parser.add_option('--filter_table_file_column', default=None)
223 parser.add_option('--debug', dest='debug', action='store_true', default=False)
224
225 (options, args) = parser.parse_args()
226 if len(options.inputs) < 1:
227 stop_err("No input files to msconvert specified")
228 if len(options.input_names) > 0 and len(options.input_names) != len(options.inputs):
229 stop_err("Number(s) of supplied input names and input files do not match")
230 if not options.output:
231 stop_err("Must specify output location")
232 input_files = []
233 for i, input in enumerate(options.inputs):
234 input_base = None
235 if len(options.input_names) > i:
236 input_base = options.input_names[i]
237 if not input_base:
238 input_base = 'input%s' % i
239 if not input_base.lower().endswith(options.fromextension.lower()):
240 input_file = '%s.%s' % (input_base, options.fromextension)
241 else:
242 input_file = input_base
243 input_file = input_file
244 copy_to_working_directory(input, input_file)
245 input_files.append(input_file)
246
247 cmd = _build_base_cmd(options)
248 file_column = options.filter_table_file_column
249 if not file_column:
250 # Apply same filters to all files, just create a unviersal filter files
251 # and run msconvert once.
252 filters_file_path = _create_filters_file(options, debug=options.debug)
253 cmd = "%s -c %s" % (cmd, filters_file_path)
254 else:
255 # Dispatching on a column to filter different files differently, need to filter
256 # each input once with msconvert and then merge once.
257 filtered_files = []
258 for index, input_file in enumerate(input_files):
259 filters_file_path = _create_filters_file(options, index + 1, debug=options.debug)
260 filter_cmd = "%s -c %s" % (cmd, filters_file_path)
261 filtered_output_file = _run(filter_cmd, output_dir='output%d' % index, inputs=[input_file], debug=options.debug)
262 filtered_files.append(filtered_output_file)
263 input_files = filtered_files
264 if len(input_files) > 1:
265 cmd = "%s --merge" % cmd
266 output_file = _run(cmd, output_dir='output', inputs=input_files, debug=options.debug)
267 shutil.copy(output_file, options.output)
268
269
# Run the wrapper only when this file is executed as a script.
if __name__ == '__main__':
    __main__()