comparison toolfactory/rgToolFactory2.py @ 28:ad564ab3cf7b draft

Uploaded
author fubar
date Fri, 31 Jul 2020 23:00:31 -0400
parents
children
comparison
equal deleted inserted replaced
27:c4e3cf28545f 28:ad564ab3cf7b
1 # rgToolFactory.py
2 # see https://github.com/fubar2/toolfactory
3 #
4 # copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012
5 #
6 # all rights reserved
7 # Licensed under the LGPL
8 # suggestions for improvement and bug fixes welcome at https://github.com/fubar2/toolfactory
9 #
10 # July 2020: BCC was fun and I feel like rip van winkle after 5 years.
11 # Decided to
12 # 1. Fix the toolfactory so it works - done for simplest case
13 # 2. Fix planemo so the toolfactory function works
14 # 3. Rewrite bits using galaxyxml functions where that makes sense - done
15 #
16 # removed all the old complications including making the new tool use this same script
17 # galaxyxml now generates the tool xml https://github.com/hexylena/galaxyxml
18 # No support for automatic HTML file creation from arbitrary outputs
19 # TODO: add option to run that code as a post execution hook
20 # TODO: add additional history input parameters - currently only one
21
22 import sys
23 import subprocess
24 import shutil
25 import os
26 import time
27 import tempfile
28 import argparse
29 import tarfile
30 import re
31 import galaxyxml.tool as gxt
32 import galaxyxml.tool.parameters as gxtp
33 import logging
34
35
# Module-level settings shared by the functions and classes below.

# Name of the running script (final path component of argv[0]).
progname = os.path.basename(sys.argv[0])
# Human-readable version label of this runner.
myversion = 'V2.1 July 2020'
# Diagnostic switches.
verbose = True
debug = True
# Project home, written into generated tool XML comments.
toolFactoryURL = 'https://github.com/fubar2/toolfactory'
# Field separator used inside packed --input_files and
# --additional_parameters command line values.
ourdelim = '~~~'
42
43
def timenow():
    """Return the current local time formatted as 'dd/mm/YYYY HH:MM:SS'."""
    stamp_format = '%d/%m/%Y %H:%M:%S'
    # time.localtime() with no argument uses the current time.
    return time.strftime(stamp_format, time.localtime())
48
49
def quote_non_numeric(s):
    """Return *s* unchanged if float() accepts it, else wrapped in double quotes.

    Intended for passing parameters to interpreters such as perl or Rscript,
    where bare numbers and quoted strings are treated differently.
    """
    try:
        float(s)
    except ValueError:
        # Not parseable as a number, so hand back a pre-quoted string.
        return '"%s"' % s
    else:
        return s
59
60
# Per-character replacements applied by html_escape().
# "&" and ">" must map to their entities (not to themselves) so that the
# escaped text is valid inside XML and html_unescape() can reverse it.
# "$" is backslash-escaped rather than turned into an entity
# (presumably to protect template syntax downstream - confirm).
html_escape_table = {
    "&": "&amp;",
    ">": "&gt;",
    "<": "&lt;",
    "$": r"\$",
}
67
68
def html_escape(text):
    """Return *text* with each character in html_escape_table replaced.

    Characters without an entry in the table are copied through unchanged.
    """
    pieces = []
    for ch in text:
        pieces.append(html_escape_table.get(ch, ch))
    return "".join(pieces)
72
73
def html_unescape(text):
    """Revert the entities produced by html_escape() within *text*.

    The targets are multi-character sequences, so str.replace is used.
    '&amp;' is deliberately handled last: replacing it first would turn an
    escaped literal such as '&amp;lt;' into '&lt;' and then wrongly into '<'.
    """
    t = text.replace('&gt;', '>')
    t = t.replace('&lt;', '<')
    t = t.replace('\\$', '$')
    t = t.replace('&amp;', '&')
    return t
81
82
def parse_citations(citations_text):
    """Split packed citation text into (kind, body) tuples.

    Entries are separated by the marker '**ENTRY**'; blank entries are
    skipped. An entry starting with 'doi' yields ('doi', rest); every other
    entry is treated as bibtex and has its first len('bibtex') characters
    removed. The remaining text is stripped of surrounding whitespace.
    """
    parsed = []
    for entry in citations_text.split("**ENTRY**"):
        if not entry.strip():
            continue
        kind = "doi" if entry.startswith("doi") else "bibtex"
        parsed.append((kind, entry[len(kind):].strip()))
    return parsed
95
96
class ScriptRunner:
    """Wrapper for an arbitrary script or executable.

    Builds the command line used to run the script/executable once here,
    and generates a Galaxy tool XML wrapper (via galaxyxml) plus a tool
    tarball from the same settings.

    Attributes set by __init__ and used by the other methods:
        cl            list of command line tokens passed to subprocess.run
        tool_name     sanitised tool name (also used as tool_id)
        xmlfile       file name of the generated tool XML
        script        script text, or None when running an executable
        sfile         temporary file holding the script, or None
        elog, tlog    paths of the stderr log and the runner (stdout) log
        infile_*      parallel lists describing each history input file
    """

    def __init__(self, args=None):
        """Prepare the command line and the data needed for XML generation.

        args is the argparse namespace built by main(). The process working
        directory is changed to args.output_dir when that is set.
        """
        lastclredirect = None
        self.cl = []
        aCL = self.cl.append
        if args.output_dir:  # simplify for the tool tarball
            os.chdir(args.output_dir)
        self.args = args
        # a sanitizer now does this but..
        self.tool_name = re.sub('[^a-zA-Z0-9_]+', '', args.tool_name)
        self.tool_id = self.tool_name
        self.xmlfile = '%s.xml' % self.tool_name
        # Always defined so the "Executable" path cannot hit AttributeError.
        self.script = None
        self.sfile = None
        if self.args.interpreter_name == "Executable":  # binary - no script
            aCL(self.args.exe_package)  # this little CL will just run
        else:  # a script has been provided
            with open(self.args.script_path, 'r') as script_in:
                # rstrip removes pesky dos line endings if needed
                rx = [x.rstrip() for x in script_in.readlines()]
            self.script = '\n'.join(rx)
            fhandle, self.sfile = tempfile.mkstemp(
                prefix=self.tool_name, suffix=".%s" % (args.interpreter_name))
            # Write through the descriptor mkstemp returned so it is closed
            # rather than leaked; self.sfile is the script source for run().
            with os.fdopen(fhandle, 'w') as tscript:
                tscript.write(self.script)
            escaped_lines = [' %s' % html_escape(x) for x in rx]
            # for restructured text in help
            self.indentedScript = " %s" % '\n'.join(escaped_lines)
            self.escapedScript = "%s" % '\n'.join(escaped_lines)
            aCL(self.args.interpreter_name)
            aCL(self.sfile)
        outdir = self.args.output_dir or ''
        self.elog = os.path.join(outdir, "%s_error.log" % self.tool_name)
        # run() always needs tlog, so define it whether or not output_dir is set
        self.tlog = os.path.join(outdir, "%s_runner.log" % self.tool_name)
        if args.output_dir and self.script is not None:
            # keep a copy of the script beside the logs
            art = '%s.%s' % (self.tool_name, args.interpreter_name)
            artpath = os.path.join(self.args.output_dir, art)  # need full path
            with open(artpath, 'w') as artifact:
                artifact.write(self.script)
        self.infile_paths = []
        self.infile_format = []
        self.infile_cl = []
        self.infile_label = []
        self.infile_help = []
        if self.args.input_files:
            aif = [x.split(ourdelim) for x in self.args.input_files]
            # transpose the input_files array passed as
            # --input_files="$input_files~~~$CL~~~$input_formats~~~$input_label~~~$input_help"
            laif = list(map(list, zip(*aif)))
            (self.infile_paths, self.infile_cl, self.infile_format,
             self.infile_label, self.infile_help) = laif
        self.infile_name = []
        # positionals have integers indicating order - need valid internal
        # names
        for i, scl in enumerate(self.infile_cl):
            if scl.isdigit():
                scl = 'input%s' % scl
            if scl.upper() in ['STDOUT', 'STDIN']:
                scl = 'input%d' % (i + 1)
            # make a list of internal names for each input file
            self.infile_name.append(scl)
        # list all (cl param) pairs - positional needs sorting by cl index
        # so decorate
        clsuffix = []
        clsuffix.append([self.args.output_cl, self.args.output_tab])
        if self.args.parampass == '0':  # only need two
            # NOTE(review): run() uses shell=False and wires stdin/stdout
            # itself, so these redirection tokens reach the child as plain
            # arguments - confirm whether that is intended.
            aCL('<')
            aCL('%s' % self.infile_paths[0])
            aCL('>')
            aCL('%s' % self.args.output_tab)
        else:
            for i, p in enumerate(self.infile_paths):
                # decorator is cl - sort for positional
                clsuffix.append([self.infile_cl[i], p])
            for p in self.args.additional_parameters:
                psplit = p.split(ourdelim)
                pform = psplit[5]
                if pform == 'STDOUT':
                    lastclredirect = ['>', psplit[1]]
                else:
                    clsuffix.append([pform, psplit[1]])  # cl,value
            clsuffix.sort()
            if self.args.parampass == "positional":
                # inputs in order then params in order TODO fix ordering
                # using self.infile_cl
                for (k, v) in clsuffix:
                    # Pass the value itself. The command is run as a list
                    # with shell=False, so a value containing spaces needs
                    # no quoting to stay a single argument.
                    aCL(v)
            elif self.args.parampass == "argparse":
                # inputs then params in argparse named form
                for (k, v) in clsuffix:
                    aCL('--%s' % k)
                    if ' ' in v:
                        # NOTE(review): with shell=False these quotes are
                        # passed literally to the child - confirm intent.
                        aCL('"%s"' % v)
                    else:
                        aCL('%s' % v)
            if lastclredirect:
                for v in lastclredirect:
                    aCL(v)  # add the stdout parameter last
        self.test1Output = '%s_test1_output.xls' % self.tool_name
        self.test1HTML = '%s_test1_output.html' % self.tool_name

    @staticmethod
    def _num_dashes(name):
        """Return 2 for a multi-character option name, otherwise 1."""
        return 2 if len(name or '') > 1 else 1

    def makeXML(self):
        """Create a Galaxy xml tool wrapper for the new script.

        Uses galaxyxml to build the tool, then writes it to self.xmlfile.
        """
        # need interp and executable (?script) or else executable only.
        # "Executable" is the sentinel __init__ uses for "no interpreter".
        uses_script = bool(self.args.interpreter_name) and \
            self.args.interpreter_name != "Executable"
        if uses_script:
            exe = "$runMe"  # our dynamic script from the tool builder
            interp = self.args.interpreter_name
        else:
            interp = None
            exe = self.args.exe_package
            assert exe is not None, 'No interpeter or executable passed in to makeXML'
        tool = gxt.Tool(self.args.tool_name, self.tool_id,
                        self.args.tool_version, self.args.tool_desc, exe)
        if interp:
            tool.interpreter = interp
        if self.args.help_text:
            with open(self.args.help_text, 'r') as help_in:
                # must html escape here too - thanks to Marius van den Beek
                tool.help = ''.join(html_escape(x) for x in help_in.readlines())
        else:
            tool.help = ('Please ask the tool author (%s) for help '
                         'as none was supplied at tool generation\n'
                         % (self.args.user_email))
        tool.version_command = None  # do not want
        inputs = gxtp.Inputs()
        outputs = gxtp.Outputs()
        requirements = gxtp.Requirements()
        testparam = []
        is_positional = (self.args.parampass == 'positional')
        if self.args.include_dependencies == "yes":
            requirements.append(gxtp.Requirement('package', 'ghostscript'))
            requirements.append(gxtp.Requirement('package', 'graphicsmagick'))
        if uses_script:
            if self.args.interpreter_name == 'python':
                # always needed for this runner script
                requirements.append(gxtp.Requirement(
                    'package', 'python', self.args.interpreter_version))
            elif self.args.interpreter_name not in ['bash', 'sh']:
                requirements.append(gxtp.Requirement(
                    'package', self.args.interpreter_name,
                    self.args.interpreter_version))
        else:
            if self.args.exe_package:  # uses exe not interpreter
                requirements.append(gxtp.Requirement(
                    'package', self.args.exe_package,
                    self.args.exe_package_version))
        tool.requirements = requirements
        for i, infpath in enumerate(self.infile_paths):
            # parampass is a string option, so compare with '0' not 0
            if self.args.parampass == '0':
                assert len(
                    self.infile_name) == 1, 'Maximum one "<" if parampass is 0 - more than one input files supplied'
            newname = self.infile_name[i]
            ndash = self._num_dashes(newname)
            # fall back to the internal name when no label was supplied
            alab = self.infile_label[i] if len(self.infile_label[i]) > 0 \
                else newname
            aninput = gxtp.DataParam(
                newname, optional=False, label=alab,
                help=self.infile_help[i], format=self.infile_format[i],
                multiple=False, num_dashes=ndash)
            if self.args.parampass == '0':
                aninput.command_line_override = '< $%s' % newname
            aninput.positional = is_positional
            inputs.append(aninput)
        for parm in self.args.additional_parameters:
            newname, newval, newlabel, newhelp, newtype, newcl = parm.split(
                ourdelim)
            if not len(newlabel) > 0:
                newlabel = newname
            ndash = self._num_dashes(newname)
            # every parameter type honours the supplied label
            if newtype == "text":
                aparm = gxtp.TextParam(
                    newname, label=newlabel, help=newhelp, value=newval,
                    num_dashes=ndash)
            elif newtype == "integer":
                aparm = gxtp.IntegerParam(
                    newname, label=newlabel, help=newhelp, value=newval,
                    num_dashes=ndash)
            elif newtype == "float":
                aparm = gxtp.FloatParam(
                    newname, label=newlabel, help=newhelp, value=newval,
                    num_dashes=ndash)
            else:
                raise ValueError(
                    'Unrecognised parameter type "%s" for '
                    'additional parameter %s in makeXML' % (newtype, newname))
            aparm.positional = is_positional
            inputs.append(aparm)
            tparm = gxtp.TestParam(newname, value=newval)
            testparam.append(tparm)
        tool.inputs = inputs
        if self.script is not None:
            # the script travels inside the XML as the runMe configfile
            configfiles = gxtp.Configfiles()
            configfiles.append(gxtp.Configfile(name="runMe", text=self.script))
            tool.configfiles = configfiles
        if self.args.output_tab:
            ext = self.args.output_format
            # dash count comes from the output's own name, not from
            # whichever parameter happened to be processed last
            aparm = gxtp.OutputData(
                self.args.output_cl, format=ext,
                num_dashes=self._num_dashes(self.args.output_cl))
            if is_positional:
                aparm.command_line_override = '> $output1'
            aparm.positional = is_positional
            outputs.append(aparm)
        tool.outputs = outputs
        tests = gxtp.Tests()
        test_a = gxtp.Test()
        if self.infile_format:  # a tool may have no history inputs
            ext = self.infile_format[0].split(',')[0]
            if is_positional:
                param = gxtp.TestParam(
                    'input1', value='input1.%s' % ext, ftype=ext)
            else:
                param = gxtp.TestParam(self.infile_name[0], value='%s.%s' % (
                    self.infile_name[0], ext), ftype=ext)
            test_a.append(param)
        param = gxtp.TestParam('job_name', value='test_a')
        test_a.append(param)
        param = gxtp.TestParam('runMe', value="$runMe")
        test_a.append(param)
        for aparam in testparam:
            test_a.append(aparam)
        test_out = gxtp.TestOutput(
            name=self.args.output_cl, value=self.test1Output)
        test_a.append(test_out)
        tests.append(test_a)
        tool.tests = tests
        tool.add_comment('Created by %s at %s using the Galaxy Tool Factory.' % (
            self.args.user_email, timenow()))
        tool.add_comment('Source in git at: %s' % (toolFactoryURL))
        tool.add_comment(
            'Cite: Creating re-usable tools from scripts doi: 10.1093/bioinformatics/bts573')
        exml = tool.export()
        with open(self.xmlfile, 'w') as xf:
            xf.write(exml)
            xf.write('\n')
        # ready for the tarball

    def makeTooltar(self):
        """Run the tool once, then package it as a gzipped tarball.

        a tool is a gz tarball with eg
        /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ...

        Exits the process with status 1 if the run fails. Otherwise copies
        the tarball to args.new_tool and returns the run's return code.
        """
        retval = self.run()
        if retval:
            sys.stderr.write(
                '## Run failed. Cannot build yet. Please fix and retry')
            sys.exit(1)
        tdir = 'tdir_%s' % self.tool_name
        if not os.path.exists(tdir):
            os.mkdir(tdir)
        self.makeXML()
        testdir = os.path.join(tdir, 'test-data')
        if not os.path.exists(testdir):
            os.mkdir(testdir)  # make tests directory
        for i, infile in enumerate(self.infile_paths):
            # Use the first listed format as the extension so the file name
            # matches the one the generated test in makeXML refers to.
            ext = self.infile_format[i].split(',')[0]
            dest = os.path.join(testdir, '%s.%s' % (self.infile_name[i], ext))
            if infile != dest:
                shutil.copyfile(infile, dest)
        if self.args.output_tab and os.path.exists(self.args.output_tab):
            shutil.copyfile(self.args.output_tab,
                            os.path.join(testdir, self.test1Output))
        else:
            print('#### no output_tab %s exists' % self.args.output_tab)
        if self.args.output_dir:
            if os.path.exists(self.tlog):
                shutil.copyfile(self.tlog, os.path.join(
                    testdir, 'test1_out.log'))
        if self.sfile:
            # self.sfile is an absolute temp path; joining it directly would
            # discard tdir, so only its file name is used inside the tarball.
            stname = os.path.join(tdir, os.path.basename(self.sfile))
            if not os.path.exists(stname):
                shutil.copyfile(self.sfile, stname)
        xtname = os.path.join(tdir, self.xmlfile)
        if not os.path.exists(xtname):
            shutil.copyfile(self.xmlfile, xtname)
        tarpath = "%s.tar.gz" % self.tool_name
        with tarfile.open(tarpath, "w:gz") as tar:
            tar.add(tdir, recursive=True, arcname='%s' % self.tool_name)
        shutil.copyfile(tarpath, self.args.new_tool)
        shutil.rmtree(tdir)
        # TODO: replace with optional direct upload to local toolshed?
        return retval

    def run(self):
        """Execute self.cl and return the child's return code.

        Unless parampass is '0', stdout goes to self.tlog and stderr to
        self.elog. When parampass is '0', the first input file is fed to
        stdin and stdout is written to args.output_tab. On a non-zero
        return code any captured stderr text is echoed to sys.stderr.
        """
        logging.debug('run cl=%s' % str(self.cl))
        scl = ' '.join(self.cl)
        err = None
        if self.args.parampass != '0':
            with open(self.elog, 'wb') as ste, open(self.tlog, 'wb') as sto:
                sto.write(
                    bytes('## Executing Toolfactory generated command line = %s\n' % scl, "utf8"))
                # flush so the header precedes anything the child writes
                sto.flush()
                p = subprocess.run(self.cl, shell=False, stdout=sto,
                                   stderr=ste,
                                   cwd=self.args.output_dir or None)
            # decode the captured stderr as text instead of a bytes repr
            with open(self.elog, 'rb') as tmp_stderr:
                err = tmp_stderr.read().decode('utf8', errors='replace')
            retval = p.returncode
        else:
            # work around special case of simple scripts that take stdin and
            # write to stdout
            with open(self.infile_paths[0], 'rb') as sti, \
                    open(self.args.output_tab, 'wb') as sto:
                p = subprocess.run(self.cl, shell=False, stdout=sto, stdin=sti)
            retval = p.returncode
        if self.args.output_dir:
            if retval != 0 and err:  # problem
                sys.stderr.write(err)
        logging.debug('run done')
        return retval
439
440
def main():
    """Parse the command line, then run the script or build the tool.

    This is a Galaxy wrapper. It expects to be called by a special purpose
    tool.xml with a command such as:
        rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript"

    Raises AssertionError when required options are missing or when
    --bad_user is supplied. Exits with the child's return code when the
    run (or the tool build) reports a non-zero status.
    """
    parser = argparse.ArgumentParser()
    a = parser.add_argument
    a('--script_path', default='')
    a('--tool_name', default=None)
    a('--interpreter_name', default=None)
    a('--interpreter_version', default=None)
    a('--exe_package', default=None)
    a('--exe_package_version', default=None)
    a('--output_dir', default='./')
    a('--input_files', default=[], action="append")
    a("--input_formats", default="tabular")
    a('--output_tab', default=None)
    a('--output_format', default='tabular')
    a('--output_cl', default=None)
    a('--user_email', default='Unknown')
    a('--bad_user', default=None)
    a('--make_Tool', default=None)
    a('--help_text', default=None)
    a('--tool_desc', default=None)
    a('--new_tool', default=None)
    a('--tool_version', default=None)
    a('--include_dependencies', default=None)
    a('--citations', default=None)
    a('--additional_parameters', dest='additional_parameters',
      action='append', default=[])
    a('--edit_additional_parameters', action="store_true", default=False)
    a('--parampass', default="positional")
    args = parser.parse_args()
    # NOTE(review): these checks are assert statements, so they disappear
    # when Python runs with -O. Kept as-is so the failure type is unchanged.
    assert not args.bad_user, 'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to "admin_users" in the Galaxy configuration file' % (
        args.bad_user, args.bad_user)
    assert args.tool_name, '## Tool Factory expects a tool name - eg --tool_name=DESeq'
    assert (args.interpreter_name or args.exe_package), '## Tool Factory wrapper expects an interpreter - eg --interpreter_name=Rscript or an executable package findable by the dependency management package'
    assert args.exe_package or (len(args.script_path) > 0 and os.path.isfile(
        args.script_path)), '## Tool Factory wrapper expects a script path - eg --script_path=foo.R if no executable'
    if args.output_dir:
        try:
            os.makedirs(args.output_dir, exist_ok=True)
        except OSError:
            # Best effort only: a directory that cannot be created shows up
            # when ScriptRunner changes into it. Catching OSError rather
            # than BaseException lets KeyboardInterrupt/SystemExit through.
            pass
    args.input_files = [x.replace('"', '').replace("'", '')
                        for x in args.input_files]
    # remove quotes we need to deal with spaces in CL params
    args.additional_parameters = [
        p.replace('"', '') for p in args.additional_parameters]
    r = ScriptRunner(args)
    if args.make_Tool:
        retcode = r.makeTooltar()
    else:
        retcode = r.run()
    if retcode:
        sys.exit(retcode)  # indicate failure to job runner


if __name__ == "__main__":
    main()