Mercurial > repos > fubar > tool_factory_2
comparison toolfactory/rgToolFactory2.py @ 28:ad564ab3cf7b draft
Uploaded
author | fubar |
---|---|
date | Fri, 31 Jul 2020 23:00:31 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
27:c4e3cf28545f | 28:ad564ab3cf7b |
---|---|
1 # rgToolFactory.py | |
2 # see https://github.com/fubar2/toolfactory | |
3 # | |
4 # copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012 | |
5 # | |
6 # all rights reserved | |
7 # Licensed under the LGPL | |
8 # suggestions for improvement and bug fixes welcome at https://github.com/fubar2/toolfactory | |
9 # | |
10 # July 2020: BCC was fun and I feel like rip van winkle after 5 years. | |
11 # Decided to | |
12 # 1. Fix the toolfactory so it works - done for simplest case | |
13 # 2. Fix planemo so the toolfactory function works | |
14 # 3. Rewrite bits using galaxyxml functions where that makes sense - done | |
15 # | |
16 # removed all the old complications including making the new tool use this same script | |
17 # galaxyxml now generates the tool xml https://github.com/hexylena/galaxyxml | |
18 # No support for automatic HTML file creation from arbitrary outputs | |
19 # TODO: add option to run that code as a post execution hook | |
20 # TODO: add additional history input parameters - currently only one | |
21 | |
22 import sys | |
23 import subprocess | |
24 import shutil | |
25 import os | |
26 import time | |
27 import tempfile | |
28 import argparse | |
29 import tarfile | |
30 import re | |
31 import galaxyxml.tool as gxt | |
32 import galaxyxml.tool.parameters as gxtp | |
33 import logging | |
34 | |
35 | |
# File name of the running script, without its directory part.
progname = os.path.basename(sys.argv[0])
# Version label of this tool factory script.
myversion = "V2.1 July 2020"
# Module-wide verbosity / debugging switches.
verbose, debug = True, True
# Project home page, written into a comment of every generated tool XML.
toolFactoryURL = "https://github.com/fubar2/toolfactory"
# Separator between the fields packed into each --input_files and
# --additional_parameters command line value.
ourdelim = "~~~"
42 | |
43 | |
def timenow():
    """Return the current local time formatted as ``DD/MM/YYYY HH:MM:SS``."""
    right_now = time.localtime(time.time())
    stamp_format = '%d/%m/%Y %H:%M:%S'
    return time.strftime(stamp_format, right_now)
48 | |
49 | |
def quote_non_numeric(s):
    """Return *s* unchanged when ``float(s)`` accepts it, else double-quoted.

    Intended for building parameter strings for interpreters such as perl
    or Rscript, where non-numeric values need quoting.
    """
    try:
        float(s)
    except ValueError:
        # Not parseable as a number: wrap it in double quotes.
        return '"%s"' % s
    return s
59 | |
60 | |
# Single characters that html_escape() replaces, mapped to their replacement.
# The three XML-special characters become entities; "$" is backslash-escaped
# (presumably because "$" is special in Galaxy's Cheetah templates - confirm).
html_escape_table = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    "$": r"\$",
}


def html_escape(text):
    """Return *text* with ``&``, ``<``, ``>`` and ``$`` escaped.

    Each character is looked up in ``html_escape_table``; characters that
    are not in the table are copied through unchanged.
    """
    return "".join(html_escape_table.get(c, c) for c in text)
72 | |
73 | |
def html_unescape(text):
    """Reverse ``html_escape``: turn entities back into plain characters.

    The targets are multi-character sequences, so ``str.replace`` is used.
    ``&amp;`` is handled last: replacing it first would turn an escaped
    literal such as ``&amp;lt;`` into ``&lt;`` and then wrongly into ``<``.
    """
    t = text.replace('&gt;', '>')
    t = t.replace('&lt;', '<')
    t = t.replace('\\$', '$')
    return t.replace('&amp;', '&')
81 | |
82 | |
def parse_citations(citations_text):
    """Split a citation blob into ``(kind, text)`` tuples.

    *citations_text* holds entries separated by the marker ``**ENTRY**``.
    An entry starting with ``doi`` yields ``("doi", <rest>)``; any other
    entry is treated as bibtex, with a leading ``bibtex`` tag removed when
    present. Blank entries are skipped.

    Returns a list of ``(kind, text)`` tuples in input order.
    """
    citation_tuples = []
    for entry in citations_text.split("**ENTRY**"):
        # Strip first so whitespace after the marker cannot hide the tag.
        entry = entry.strip()
        if not entry:
            continue
        if entry.startswith("doi"):
            citation_tuples.append(("doi", entry[len("doi"):].strip()))
        elif entry.startswith("bibtex"):
            citation_tuples.append(
                ("bibtex", entry[len("bibtex"):].strip()))
        else:
            # Untagged entry: keep all of it rather than chopping 6 chars.
            citation_tuples.append(("bibtex", entry))
    return citation_tuples
95 | |
96 | |
class ScriptRunner:
    """Wrap an arbitrary script or executable as a Galaxy tool.

    The constructor turns the parsed options into the argument list
    ``self.cl`` used to run the script once. ``run`` executes that command,
    ``makeXML`` writes the tool XML using galaxyxml, and ``makeTooltar``
    bundles the XML and test data into a gzipped tarball.
    """

    @staticmethod
    def _num_dashes(name):
        """Return 2 for a multi-character option name, otherwise 1."""
        return 2 if name and len(name) > 1 else 1

    def __init__(self, args=None):
        """Prepare the command line and the data needed for XML generation.

        args: the ``argparse`` namespace built by ``main``.

        Side effects: changes the working directory to ``args.output_dir``
        (when set), writes the script to a temporary file and, when an
        output directory is given, writes a copy named
        ``<tool_name>.<interpreter_name>`` there.

        Raises ValueError when ``parampass`` is ``'0'`` and no input file
        was supplied.
        """
        self.args = args
        self.cl = []
        # ['>', path] when an additional parameter asks for STDOUT capture;
        # run() then sends the child's stdout to that path.
        self.lastclredirect = None
        # Script text and temp file path; both stay None in executable mode.
        self.script = None
        self.sfile = None
        aCL = self.cl.append
        if args.output_dir:  # simplify for the tool tarball
            os.chdir(args.output_dir)
        # a sanitizer now does this but..
        self.tool_name = re.sub('[^a-zA-Z0-9_]+', '', args.tool_name)
        self.tool_id = self.tool_name
        self.xmlfile = '%s.xml' % self.tool_name
        interp = args.interpreter_name
        # "Executable" (or no interpreter at all) means: run exe_package.
        self.uses_interpreter = bool(interp) and interp != "Executable"
        if not self.uses_interpreter:
            aCL(args.exe_package)  # this little CL will just run
        else:  # a script has been provided
            with open(args.script_path, 'r') as source:
                # rstrip also removes pesky dos line endings
                rx = [x.rstrip() for x in source]
            self.script = '\n'.join(rx)
            fhandle, self.sfile = tempfile.mkstemp(
                prefix=self.tool_name, suffix=".%s" % interp)
            # Write through the descriptor mkstemp returned so it is closed.
            with os.fdopen(fhandle, 'w') as tscript:
                tscript.write(self.script)
            escaped = [' %s' % html_escape(x) for x in rx]
            # for restructured text in help
            self.indentedScript = " %s" % '\n'.join(escaped)
            self.escapedScript = "%s" % '\n'.join(escaped)
            aCL(interp)
            aCL(self.sfile)
        # NOTE(review): these paths are joined with output_dir after the
        # chdir above, so a relative output_dir other than './' would be
        # applied twice - confirm callers pass './' or an absolute path.
        log_dir = args.output_dir or ''
        self.elog = os.path.join(log_dir, "%s_error.log" % self.tool_name)
        self.tlog = os.path.join(log_dir, "%s_runner.log" % self.tool_name)
        if args.output_dir and self.script is not None:
            # keep a copy of the script next to the outputs
            artpath = os.path.join(
                args.output_dir, '%s.%s' % (self.tool_name, interp))
            with open(artpath, 'w') as artifact:
                artifact.write(self.script)
        self.infile_paths = []
        self.infile_format = []
        self.infile_cl = []
        self.infile_label = []
        self.infile_help = []
        self.infile_name = []
        if args.input_files:
            aif = [x.split(ourdelim) for x in args.input_files]
            # transpose the input_files array passed as
            # --input_files="$input_files~~~$CL~~~$input_formats~~~$input_label~~~$input_help"
            (self.infile_paths, self.infile_cl, self.infile_format,
             self.infile_label, self.infile_help) = [
                list(column) for column in zip(*aif)]
            # positionals have integers indicating order - need valid
            # internal names
            for i, scl in enumerate(self.infile_cl):
                if scl.isdigit():
                    scl = 'input%s' % scl
                if scl.upper() in ['STDOUT', 'STDIN']:
                    scl = 'input%d' % (i + 1)
                self.infile_name.append(scl)
        if args.parampass == '0':
            # stdin/stdout are wired up by run(); '<' and '>' tokens would
            # reach the script as literal arguments because no shell is used.
            if not self.infile_paths:
                raise ValueError(
                    'parampass "0" needs one input file to feed to stdin')
        else:
            # list all (cl, value) pairs - positional needs sorting by cl
            clsuffix = [[args.output_cl, args.output_tab]]
            clsuffix.extend(
                [cl, path]
                for cl, path in zip(self.infile_cl, self.infile_paths))
            for p in args.additional_parameters:
                psplit = p.split(ourdelim)
                pform = psplit[5]
                if pform == 'STDOUT':
                    self.lastclredirect = ['>', psplit[1]]
                else:
                    clsuffix.append([pform, psplit[1]])  # cl,value
            clsuffix.sort()
            if args.parampass == "positional":
                # inputs in order then params in order TODO fix ordering
                # using self.infile_cl
                for (k, v) in clsuffix:
                    # argv list + shell=False: values with spaces need no
                    # quoting and must be passed as they are.
                    aCL(v)
            elif args.parampass == "argparse":
                # inputs then params in argparse named form
                for (k, v) in clsuffix:
                    aCL('--%s' % k)
                    aCL('%s' % v)
        self.test1Output = '%s_test1_output.xls' % self.tool_name
        self.test1HTML = '%s_test1_output.html' % self.tool_name

    def makeXML(self):
        """Write ``self.xmlfile``: a Galaxy tool wrapper built with galaxyxml.

        Raises ValueError for an additional parameter whose type is not
        ``text``, ``integer`` or ``float``.
        """
        # need interp and executable (?script) or else executable only
        if self.uses_interpreter:
            exe = "$runMe"  # our dynamic script from the tool builder
            interp = self.args.interpreter_name
        else:
            interp = None
            exe = self.args.exe_package
        assert exe is not None, 'No interpeter or executable passed in to makeXML'
        tool = gxt.Tool(self.args.tool_name, self.tool_id,
                        self.args.tool_version, self.args.tool_desc, exe)
        if interp:
            tool.interpreter = interp
        if self.args.help_text:
            with open(self.args.help_text, 'r') as helpfile:
                # must html escape here too - thanks to Marius van den Beek
                tool.help = ''.join(html_escape(x) for x in helpfile)
        else:
            tool.help = (
                'Please ask the tool author (%s) for help '
                'as none was supplied at tool generation\n'
                % (self.args.user_email))
        tool.version_command = None  # do not want
        inputs = gxtp.Inputs()
        outputs = gxtp.Outputs()
        requirements = gxtp.Requirements()
        testparam = []
        is_positional = (self.args.parampass == 'positional')
        if self.args.include_dependencies == "yes":
            requirements.append(gxtp.Requirement('package', 'ghostscript'))
            requirements.append(gxtp.Requirement('package', 'graphicsmagick'))
        if self.uses_interpreter:
            if interp == 'python':
                requirements.append(gxtp.Requirement(
                    'package', 'python', self.args.interpreter_version))
            elif interp not in ['bash', 'sh']:
                requirements.append(gxtp.Requirement(
                    'package', interp, self.args.interpreter_version))
        elif self.args.exe_package:  # uses exe not interpreter
            requirements.append(gxtp.Requirement(
                'package', self.args.exe_package,
                self.args.exe_package_version))
        tool.requirements = requirements
        for i, infpath in enumerate(self.infile_paths):
            if self.args.parampass == '0':
                assert len(
                    self.infile_name) == 1, 'Maximum one "<" if parampass is 0 - more than one input files supplied'
            newname = self.infile_name[i]
            # fall back to the internal name when no label was given
            alab = self.infile_label[i] or newname
            aninput = gxtp.DataParam(
                newname, optional=False, label=alab,
                help=self.infile_help[i], format=self.infile_format[i],
                multiple=False, num_dashes=self._num_dashes(newname))
            if self.args.parampass == '0':
                aninput.command_line_override = '< $%s' % newname
            aninput.positional = is_positional
            inputs.append(aninput)
        param_classes = {
            "text": gxtp.TextParam,
            "integer": gxtp.IntegerParam,
            "float": gxtp.FloatParam,
        }
        for parm in self.args.additional_parameters:
            newname, newval, newlabel, newhelp, newtype, newcl = parm.split(
                ourdelim)
            if newtype not in param_classes:
                raise ValueError(
                    'Unrecognised parameter type "%s" for '
                    'additional parameter %s in makeXML'
                    % (newtype, newname))
            aparm = param_classes[newtype](
                newname, label=newlabel or newname, help=newhelp,
                value=newval, num_dashes=self._num_dashes(newname))
            aparm.positional = is_positional
            inputs.append(aparm)
            testparam.append(gxtp.TestParam(newname, value=newval))
        tool.inputs = inputs
        if self.script is not None:
            configfiles = gxtp.Configfiles()
            configfiles.append(
                gxtp.Configfile(name="runMe", text=self.script))
            tool.configfiles = configfiles
        if self.args.output_tab:
            aparm = gxtp.OutputData(
                self.args.output_cl, format=self.args.output_format,
                num_dashes=self._num_dashes(self.args.output_cl))
            if is_positional:
                # NOTE(review): the redirect is applied for positional mode
                # and names $output1 rather than output_cl - confirm intent.
                aparm.command_line_override = '> $output1'
            aparm.positional = is_positional
            outputs.append(aparm)
        tool.outputs = outputs
        tests = gxtp.Tests()
        test_a = gxtp.Test()
        if self.infile_format:
            # the first listed format gives the test file its extension
            ext = self.infile_format[0].split(',')[0]
            if is_positional:
                param = gxtp.TestParam(
                    'input1', value='input1.%s' % ext, ftype=ext)
            else:
                param = gxtp.TestParam(
                    self.infile_name[0],
                    value='%s.%s' % (self.infile_name[0], ext), ftype=ext)
            test_a.append(param)
        test_a.append(gxtp.TestParam('job_name', value='test_a'))
        if self.script is not None:
            test_a.append(gxtp.TestParam('runMe', value="$runMe"))
        for aparam in testparam:
            test_a.append(aparam)
        test_a.append(gxtp.TestOutput(
            name=self.args.output_cl, value=self.test1Output))
        tests.append(test_a)
        tool.tests = tests
        tool.add_comment('Created by %s at %s using the Galaxy Tool Factory.' % (
            self.args.user_email, timenow()))
        tool.add_comment('Source in git at: %s' % (toolFactoryURL))
        tool.add_comment(
            'Cite: Creating re-usable tools from scripts doi: 10.1093/bioinformatics/bts573')
        with open(self.xmlfile, 'w') as xf:
            xf.write(tool.export())
            xf.write('\n')
        # ready for the tarball

    def makeTooltar(self):
        """Run the tool once, then package it as ``<tool_name>.tar.gz``.

        a tool is a gz tarball with eg
        /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ...

        Exits the process with status 1 when the trial run fails. Returns
        the exit status of the trial run otherwise.
        """
        retval = self.run()
        if retval:
            sys.stderr.write(
                '## Run failed. Cannot build yet. Please fix and retry')
            sys.exit(1)
        tdir = 'tdir_%s' % self.tool_name
        os.makedirs(tdir, exist_ok=True)
        self.makeXML()
        testdir = os.path.join(tdir, 'test-data')
        os.makedirs(testdir, exist_ok=True)  # make tests directory
        for i, infile in enumerate(self.infile_paths):
            # same extension rule as the test parameter written by makeXML
            ext = self.infile_format[i].split(',')[0]
            dest = os.path.join(
                testdir, '%s.%s' % (self.infile_name[i], ext))
            if infile != dest:
                shutil.copyfile(infile, dest)
        if self.args.output_tab and os.path.exists(self.args.output_tab):
            shutil.copyfile(self.args.output_tab,
                            os.path.join(testdir, self.test1Output))
        else:
            print('#### no output_tab %s exists' % self.args.output_tab)
        if self.args.output_dir and os.path.exists(self.tlog):
            shutil.copyfile(self.tlog, os.path.join(
                testdir, 'test1_out.log'))
        if self.sfile:
            # NOTE(review): self.sfile is the absolute mkstemp path, so this
            # join yields that same path and the copy is normally skipped;
            # the script still ships inside the XML configfile - confirm.
            stname = os.path.join(tdir, self.sfile)
            if not os.path.exists(stname):
                shutil.copyfile(self.sfile, stname)
        xtname = os.path.join(tdir, self.xmlfile)
        if not os.path.exists(xtname):
            shutil.copyfile(self.xmlfile, xtname)
        tarpath = "%s.tar.gz" % self.tool_name
        with tarfile.open(tarpath, "w:gz") as tar:
            tar.add(tdir, recursive=True, arcname='%s' % self.tool_name)
        if self.args.new_tool:
            shutil.copyfile(tarpath, self.args.new_tool)
        shutil.rmtree(tdir)
        # TODO: replace with optional direct upload to local toolshed?
        return retval

    def run(self):
        """Execute ``self.cl`` once and return the child's exit status.

        With ``parampass == '0'`` the first input file is fed to stdin and
        stdout is written to ``output_tab``. Otherwise stderr goes to
        ``self.elog`` and stdout goes to the STDOUT parameter's file when
        one was given, else to ``self.tlog``. On a non-zero exit status the
        captured stderr text is echoed to this process's stderr.
        """
        logging.debug('run cl=%s', self.cl)
        scl = ' '.join(self.cl)
        err = None
        if self.args.parampass != '0':
            with open(self.elog, 'wb') as ste, open(self.tlog, 'wb') as log:
                log.write(
                    bytes('## Executing Toolfactory generated command line = %s\n' % scl, "utf8"))
                log.flush()
                if self.lastclredirect:
                    with open(self.lastclredirect[1], 'wb') as sto:
                        p = subprocess.run(
                            self.cl, shell=False, stdout=sto, stderr=ste,
                            cwd=self.args.output_dir)
                else:
                    p = subprocess.run(
                        self.cl, shell=False, stdout=log, stderr=ste,
                        cwd=self.args.output_dir)
            with open(self.elog, 'rb') as tmp_stderr:
                # decode instead of str(bytes), which yields a b'...' repr
                err = tmp_stderr.read().decode('utf8', errors='replace')
            retval = p.returncode
        else:
            # special case of simple scripts that take stdin and write to
            # stdout: the redirection is done with file handles, no shell
            with open(self.infile_paths[0], 'rb') as sti, \
                    open(self.args.output_tab, 'wb') as sto:
                p = subprocess.run(
                    self.cl, shell=False, stdout=sto, stdin=sti)
            retval = p.returncode
        if self.args.output_dir:
            if retval != 0 and err:  # problem
                sys.stderr.write(err)
        logging.debug('run done')
        return retval
439 | |
440 | |
def main():
    """Parse the command line, then run the script or build the tool.

    This is a Galaxy wrapper. It expects to be called by a special purpose tool.xml as:
    <command interpreter="python">rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript"
    </command>

    Raises AssertionError when the caller is flagged as unauthorised or a
    required option is missing. Exits with the script's status when the
    run fails.
    """
    parser = argparse.ArgumentParser()
    a = parser.add_argument
    a('--script_path', default='')
    a('--tool_name', default=None)
    a('--interpreter_name', default=None)
    a('--interpreter_version', default=None)
    a('--exe_package', default=None)
    a('--exe_package_version', default=None)
    a('--output_dir', default='./')
    a('--input_files', default=[], action="append")
    a("--input_formats", default="tabular")
    a('--output_tab', default=None)
    a('--output_format', default='tabular')
    a('--output_cl', default=None)
    a('--user_email', default='Unknown')
    a('--bad_user', default=None)
    a('--make_Tool', default=None)
    a('--help_text', default=None)
    a('--tool_desc', default=None)
    a('--new_tool', default=None)
    a('--tool_version', default=None)
    a('--include_dependencies', default=None)
    a('--citations', default=None)
    a('--additional_parameters', dest='additional_parameters',
      action='append', default=[])
    a('--edit_additional_parameters', action="store_true", default=False)
    a('--parampass', default="positional")
    args = parser.parse_args()

    def require(condition, message):
        # Explicit raise: a plain assert is stripped under "python -O",
        # which would silently skip the authorisation check below.
        if not condition:
            raise AssertionError(message)

    require(
        not args.bad_user,
        'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to "admin_users" in the Galaxy configuration file' % (
            args.bad_user, args.bad_user))
    require(
        args.tool_name,
        '## Tool Factory expects a tool name - eg --tool_name=DESeq')
    require(
        args.interpreter_name or args.exe_package,
        '## Tool Factory wrapper expects an interpreter - eg --interpreter_name=Rscript or an executable package findable by the dependency management package')
    require(
        args.exe_package or (
            len(args.script_path) > 0 and os.path.isfile(args.script_path)),
        '## Tool Factory wrapper expects a script path - eg --script_path=foo.R if no executable')
    if args.output_dir:
        try:
            os.makedirs(args.output_dir, exist_ok=True)
        except OSError as exc:
            # still best effort, but no longer silent
            sys.stderr.write(
                '## Could not create output directory %s: %s\n'
                % (args.output_dir, exc))
    args.input_files = [x.replace('"', '').replace("'", '')
                        for x in args.input_files]
    # remove quotes we need to deal with spaces in CL params
    args.additional_parameters = [
        x.replace('"', '') for x in args.additional_parameters]
    r = ScriptRunner(args)
    if args.make_Tool:
        retcode = r.makeTooltar()
    else:
        retcode = r.run()
    if retcode:
        sys.exit(retcode)  # indicate failure to job runner


if __name__ == "__main__":
    main()