Mercurial > repos > fubar > tool_factory_2
diff toolfactory/rgToolFactory2.py @ 28:ad564ab3cf7b draft
Uploaded
author | fubar |
---|---|
date | Fri, 31 Jul 2020 23:00:31 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/toolfactory/rgToolFactory2.py Fri Jul 31 23:00:31 2020 -0400 @@ -0,0 +1,502 @@ +# rgToolFactory.py +# see https://github.com/fubar2/toolfactory +# +# copyright ross lazarus (ross stop lazarus at gmail stop com) May 2012 +# +# all rights reserved +# Licensed under the LGPL +# suggestions for improvement and bug fixes welcome at https://github.com/fubar2/toolfactory +# +# July 2020: BCC was fun and I feel like rip van winkle after 5 years. +# Decided to +# 1. Fix the toolfactory so it works - done for simplest case +# 2. Fix planemo so the toolfactory function works +# 3. Rewrite bits using galaxyxml functions where that makes sense - done +# +# removed all the old complications including making the new tool use this same script +# galaxyxml now generates the tool xml https://github.com/hexylena/galaxyxml +# No support for automatic HTML file creation from arbitrary outputs +# TODO: add option to run that code as a post execution hook +# TODO: add additional history input parameters - currently only one + +import sys +import subprocess +import shutil +import os +import time +import tempfile +import argparse +import tarfile +import re +import galaxyxml.tool as gxt +import galaxyxml.tool.parameters as gxtp +import logging + + +progname = os.path.split(sys.argv[0])[1] +myversion = 'V2.1 July 2020' +verbose = True +debug = True +toolFactoryURL = 'https://github.com/fubar2/toolfactory' +ourdelim = '~~~' + + +def timenow(): + """return current time as a string + """ + return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) + + +def quote_non_numeric(s): + """return a prequoted string for non-numerics + useful for perl and Rscript parameter passing? + """ + try: + _ = float(s) + return s + except ValueError: + return '"%s"' % s + + +html_escape_table = { + "&": "&", + ">": ">", + "<": "<", + "$": r"\$" +} + + +def html_escape(text): + """Produce entities within text.""" + return "".join(html_escape_table.get(c, c) for c in text) + + +def html_unescape(text): + """Revert entities within text. Multiple character targets so use replace""" + t = text.replace('&', '&') + t = t.replace('>', '>') + t = t.replace('<', '<') + t = t.replace('\\$', '$') + return t + + +def parse_citations(citations_text): + """ + """ + citations = [c for c in citations_text.split("**ENTRY**") if c.strip()] + citation_tuples = [] + for citation in citations: + if citation.startswith("doi"): + citation_tuples.append(("doi", citation[len("doi"):].strip())) + else: + citation_tuples.append( + ("bibtex", citation[len("bibtex"):].strip())) + return citation_tuples + + +class ScriptRunner: + """Wrapper for an arbitrary script + uses galaxyxml + + """ + + def __init__(self, args=None): + """ + prepare command line cl for running the tool here + and prepare elements needed for galaxyxml tool generation + """ + lastclredirect = None + self.cl = [] + aCL = self.cl.append + if args.output_dir: # simplify for the tool tarball + os.chdir(args.output_dir) + self.args = args + # a sanitizer now does this but.. + self.tool_name = re.sub('[^a-zA-Z0-9_]+', '', args.tool_name) + self.tool_id = self.tool_name + self.xmlfile = '%s.xml' % self.tool_name + if self.args.interpreter_name == "Executable": # binary - no need + aCL(self.args.exe_package) # this little CL will just run + else: # a script has been provided + rx = open(self.args.script_path, 'r').readlines() + # remove pesky dos line endings if needed + rx = [x.rstrip() for x in rx] + self.script = '\n'.join(rx) + fhandle, self.sfile = tempfile.mkstemp( + prefix=self.tool_name, suffix=".%s" % (args.interpreter_name)) + # use self.sfile as script source for Popen + tscript = open(self.sfile, 'w') + tscript.write(self.script) + tscript.close() + self.indentedScript = " %s" % '\n'.join( + [' %s' % html_escape(x) for x in rx]) # for restructured text in help + self.escapedScript = "%s" % '\n'.join( + [' %s' % html_escape(x) for x in rx]) + aCL(self.args.interpreter_name) + aCL(self.sfile) + self.elog = os.path.join(self.args.output_dir, + "%s_error.log" % self.tool_name) + if args.output_dir: # may not want these complexities + self.tlog = os.path.join( + self.args.output_dir, "%s_runner.log" % self.tool_name) + art = '%s.%s' % (self.tool_name, args.interpreter_name) + artpath = os.path.join( + self.args.output_dir, + art) # need full path + # use self.sfile as script source for Popen + artifact = open(artpath, 'w') + artifact.write(self.script) + artifact.close() + self.infile_paths = [] + self.infile_format = [] + self.infile_cl = [] + self.infile_label = [] + self.infile_help = [] + if self.args.input_files: + aif = [x.split(ourdelim) for x in self.args.input_files] + # transpose the input_files array passed as + # --input_files="$input_files~~~$CL~~~$input_formats~~~$input_label~~~$input_help" + laif = list(map(list, zip(*aif))) + self.infile_paths, self.infile_cl, self.infile_format, self.infile_label, self.infile_help = laif + self.infile_name = [] + # positionals have integers indicating order - need valid internal + # names + for i, scl in enumerate(self.infile_cl): + if scl.isdigit(): + scl = 'input%s' % scl + if scl.upper() in ['STDOUT', 'STDIN']: + scl = 'input%d' % (i + 1) + # make a list of internal names for each input file + self.infile_name.append(scl) + # list all (cl param) pairs - positional needs sorting by cl index so decorate + clsuffix = [] + clsuffix.append([self.args.output_cl, self.args.output_tab]) + if self.args.parampass == '0': # only need two + aCL('<') + aCL('%s' % self.infile_paths[0]) + aCL('>') + aCL('%s' % self.args.output_tab) + else: + for i, p in enumerate(self.infile_paths): + # decorator is cl - sort for positional + clsuffix.append([self.infile_cl[i], p]) + for p in self.args.additional_parameters: + psplit = p.split(ourdelim) + pform = psplit[5] + if pform == 'STDOUT': + lastclredirect = ['>', psplit[1]] + else: + clsuffix.append([pform, psplit[1]]) # cl,value + clsuffix.sort() + if self.args.parampass == "positional": + # inputs in order then params in order TODO fix ordering using + # self.infile_cl + for (k, v) in clsuffix: + if ' ' in v: + aCL("v") + else: + aCL(v) + elif self.args.parampass == "argparse": + # inputs then params in argparse named form + for (k, v) in clsuffix: + if ' ' in v: + aCL('--%s' % k) + aCL('"%s"' % v) + else: + aCL('--%s' % k) + aCL('%s' % v) + if lastclredirect: + for v in lastclredirect: + aCL(v) # add the stdout parameter last + self.test1Output = '%s_test1_output.xls' % self.tool_name + self.test1HTML = '%s_test1_output.html' % self.tool_name + + def makeXML(self): + """ + Create a Galaxy xml tool wrapper for the new script + Uses galaxyhtml + """ + # need interp and executable (?script) or else executable only + if self.args.interpreter_name: + exe = "$runMe" # our dynamic script from the tool builder + interp = self.args.interpreter_name + else: + interp = None + exe = self.args.exe_package + assert exe is not None, 'No interpeter or executable passed in to makeXML' + tool = gxt.Tool(self.args.tool_name, self.tool_id, + self.args.tool_version, self.args.tool_desc, exe) + if interp: + tool.interpreter = interp + if self.args.help_text: + helptext = open(self.args.help_text, 'r').readlines() + # must html escape here too - thanks to Marius van den Beek + helptext = [html_escape(x) for x in helptext] + tool.help = ''.join([x for x in helptext]) + else: + tool.help = 'Please ask the tool author (%s) for help \ + as none was supplied at tool generation\n' % (self.args.user_email) + tool.version_command = None # do not want + inputs = gxtp.Inputs() + outputs = gxtp.Outputs() + requirements = gxtp.Requirements() + testparam = [] + is_positional = (self.args.parampass == 'positional') + if self.args.include_dependencies == "yes": + requirements.append(gxtp.Requirement('package', 'ghostscript')) + requirements.append(gxtp.Requirement('package', 'graphicsmagick')) + if self.args.interpreter_name: + if self.args.interpreter_name == 'python': # always needed for this runner script + requirements.append(gxtp.Requirement( + 'package', 'python', self.args.interpreter_version)) + elif self.args.interpreter_name not in ['bash', 'sh']: + requirements.append(gxtp.Requirement( + 'package', self.args.interpreter_name, self.args.interpreter_version)) + else: + if self.args.exe_package: # uses exe not interpreter + requirements.append(gxtp.Requirement( + 'package', self.args.exe_package, self.args.exe_package_version)) + tool.requirements = requirements + for i, infpath in enumerate(self.infile_paths): + if self.args.parampass == 0: + assert len( + self.infile_name) == 1, 'Maximum one "<" if parampass is 0 - more than one input files supplied' + newname = self.infile_name[i] + if len(newname) > 1: + ndash = 2 + else: + ndash = 1 + if not len(self.infile_label[i]) > 0: + alab = self.infile_name[i] + else: + alab = self.infile_label[i] + aninput = gxtp.DataParam(self.infile_name[i], optional=False, label=alab, help=self.infile_help[i], + format=self.infile_format[i], multiple=False, num_dashes=ndash) + if self.args.parampass == '0': + aninput.command_line_override = '< $%s' % self.infile_name[i] + aninput.positional = is_positional + inputs.append(aninput) + for parm in self.args.additional_parameters: + newname, newval, newlabel, newhelp, newtype, newcl = parm.split( + ourdelim) + if not len(newlabel) > 0: + newlabel = newname + if len(newname) > 1: + ndash = 2 + else: + ndash = 1 + if newtype == "text": + aparm = gxtp.TextParam( + newname, label=newlabel, help=newhelp, value=newval, num_dashes=ndash) + elif newtype == "integer": + aparm = gxtp.IntegerParam( + newname, label=newname, help=newhelp, value=newval, num_dashes=ndash) + elif newtype == "float": + aparm = gxtp.FloatParam( + newname, label=newname, help=newhelp, value=newval, num_dashes=ndash) + else: + raise ValueError('Unrecognised parameter type "%s" for\ + additional parameter %s in makeXML' % (newtype, newname)) + aparm.positional = is_positional + inputs.append(aparm) + tparm = gxtp.TestParam(newname, value=newval) + testparam.append(tparm) + tool.inputs = inputs + configfiles = gxtp.Configfiles() + configfiles.append(gxtp.Configfile(name="runMe", text=self.script)) + tool.configfiles = configfiles + if self.args.output_tab: + ext = self.args.output_format + aparm = gxtp.OutputData( + self.args.output_cl, format=ext, num_dashes=ndash) + if is_positional: + aparm.command_line_override = '> $output1' + aparm.positional = is_positional + outputs.append(aparm) + tool.outputs = outputs + tests = gxtp.Tests() + test_a = gxtp.Test() + ext = self.infile_format[0].split(',')[0] + if is_positional: + param = gxtp.TestParam( + 'input1', value='input1.%s' % ext, ftype=ext) + else: + param = gxtp.TestParam(self.infile_name[0], value='%s.%s' % ( + self.infile_name[0], ext), ftype=ext) + test_a.append(param) + param = gxtp.TestParam('job_name', value='test_a') + test_a.append(param) + param = gxtp.TestParam('runMe', value="$runMe") + test_a.append(param) + for aparam in testparam: + test_a.append(aparam) + test_out = gxtp.TestOutput( + name=self.args.output_cl, value=self.test1Output) + test_a.append(test_out) + tests.append(test_a) + tool.tests = tests + tool.add_comment('Created by %s at %s using the Galaxy Tool Factory.' % ( + self.args.user_email, timenow())) + tool.add_comment('Source in git at: %s' % (toolFactoryURL)) + tool.add_comment( + 'Cite: Creating re-usable tools from scripts doi: 10.1093/bioinformatics/bts573') + exml = tool.export() + xf = open(self.xmlfile, 'w') + xf.write(exml) + xf.write('\n') + xf.close() + # ready for the tarball + + def makeTooltar(self): + """ + a tool is a gz tarball with eg + /toolname/tool.xml /toolname/tool.py /toolname/test-data/test1_in.foo ... + """ + retval = self.run() + if retval: + sys.stderr.write( + '## Run failed. Cannot build yet. Please fix and retry') + sys.exit(1) + tdir = 'tdir_%s' % self.tool_name + if not os.path.exists(tdir): + os.mkdir(tdir) + self.makeXML() + testdir = os.path.join(tdir, 'test-data') + if not os.path.exists(testdir): + os.mkdir(testdir) # make tests directory + for i, infile in enumerate(self.infile_paths): + dest = os.path.join(testdir, '%s.%s' % + (self.infile_name[i], self.infile_format[i])) + if infile != dest: + shutil.copyfile(infile, dest) + if self.args.output_tab and os.path.exists(self.args.output_tab): + shutil.copyfile(self.args.output_tab, + os.path.join(testdir, self.test1Output)) + else: + print('#### no output_tab %s exists' % self.args.output_tab) + if self.args.output_dir: + if os.path.exists(self.tlog): + shutil.copyfile(self.tlog, os.path.join( + testdir, 'test1_out.log')) + stname = os.path.join(tdir, self.sfile) + if not os.path.exists(stname): + shutil.copyfile(self.sfile, stname) + xtname = os.path.join(tdir, self.xmlfile) + if not os.path.exists(xtname): + shutil.copyfile(self.xmlfile, xtname) + tarpath = "%s.tar.gz" % self.tool_name + tar = tarfile.open(tarpath, "w:gz") + tar.add(tdir, recursive=True, arcname='%s' % self.tool_name) + tar.close() + shutil.copyfile(tarpath, self.args.new_tool) + shutil.rmtree(tdir) + # TODO: replace with optional direct upload to local toolshed? + return retval + + def run(self): + """ + Some devteam tools have this defensive stderr read so I'm keeping with the faith + Feel free to update. + """ + logging.debug('run cl=%s' % str(self.cl)) + scl = ' '.join(self.cl) + err = None + if self.args.parampass != '0': + ste = open(self.elog, 'wb') + sto = open(self.tlog, 'wb') + sto.write( + bytes('## Executing Toolfactory generated command line = %s\n' % scl, "utf8")) + sto.flush() + p = subprocess.run(self.cl, shell=False, stdout=sto, + stderr=ste, cwd=self.args.output_dir) + sto.close() + ste.close() + tmp_stderr = open(self.elog, 'rb') + err = '' + buffsize = 1048576 + try: + while True: + err += str(tmp_stderr.read(buffsize)) + if not err or len(err) % buffsize != 0: + break + except OverflowError: + pass + tmp_stderr.close() + retval = p.returncode + else: # work around special case of simple scripts that take stdin and write to stdout + sti = open(self.infile_paths[0], 'rb') + sto = open(self.args.output_tab, 'wb') + # must use shell to redirect + p = subprocess.run(self.cl, shell=False, stdout=sto, stdin=sti) + retval = p.returncode + sto.close() + sti.close() + if self.args.output_dir: + if p.returncode != 0 and err: # problem + sys.stderr.write(err) + logging.debug('run done') + return retval + + +def main(): + """ + This is a Galaxy wrapper. It expects to be called by a special purpose tool.xml as: + <command interpreter="python">rgBaseScriptWrapper.py --script_path "$scriptPath" --tool_name "foo" --interpreter "Rscript" + </command> + """ + parser = argparse.ArgumentParser() + a = parser.add_argument + a('--script_path', default='') + a('--tool_name', default=None) + a('--interpreter_name', default=None) + a('--interpreter_version', default=None) + a('--exe_package', default=None) + a('--exe_package_version', default=None) + a('--output_dir', default='./') + a('--input_files', default=[], action="append") + a("--input_formats", default="tabular") + a('--output_tab', default=None) + a('--output_format', default='tabular') + a('--output_cl', default=None) + a('--user_email', default='Unknown') + a('--bad_user', default=None) + a('--make_Tool', default=None) + a('--help_text', default=None) + a('--tool_desc', default=None) + a('--new_tool', default=None) + a('--tool_version', default=None) + a('--include_dependencies', default=None) + a('--citations', default=None) + a('--additional_parameters', dest='additional_parameters', + action='append', default=[]) + a('--edit_additional_parameters', action="store_true", default=False) + a('--parampass', default="positional") + args = parser.parse_args() + assert not args.bad_user, 'UNAUTHORISED: %s is NOT authorized to use this tool until Galaxy admin adds %s to "admin_users" in the Galaxy configuration file' % ( + args.bad_user, args.bad_user) + assert args.tool_name, '## Tool Factory expects a tool name - eg --tool_name=DESeq' + assert (args.interpreter_name or args.exe_package), '## Tool Factory wrapper expects an interpreter - eg --interpreter_name=Rscript or an executable package findable by the dependency management package' + assert args.exe_package or (len(args.script_path) > 0 and os.path.isfile( + args.script_path)), '## Tool Factory wrapper expects a script path - eg --script_path=foo.R if no executable' + if args.output_dir: + try: + os.makedirs(args.output_dir) + except BaseException: + pass + args.input_files = [x.replace('"', '').replace("'", '') + for x in args.input_files] + # remove quotes we need to deal with spaces in CL params + for i, x in enumerate(args.additional_parameters): + args.additional_parameters[i] = args.additional_parameters[i].replace( + '"', '') + r = ScriptRunner(args) + if args.make_Tool: + retcode = r.makeTooltar() + else: + retcode = r.run() + if retcode: + sys.exit(retcode) # indicate failure to job runner + + +if __name__ == "__main__": + main()