Mercurial > repos > iuc > data_manager_interproscan
changeset 0:e93e32359b67 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_interproscan commit 2f5d27a375fcc2e8d77914b3d9e402a9e2df2d97"
author | iuc |
---|---|
date | Mon, 15 Nov 2021 17:21:22 +0000 |
parents | |
children | 0db4f153d86d |
files | data_manager/interproscan.py data_manager/interproscan.xml data_manager_conf.xml test-data/interproscan.loc tool-data/interproscan.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 7 files changed, 276 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python
# Repository path of this script: data_manager/interproscan.py
"""Galaxy data manager script for InterProScan.

Downloads the data files for a given (or the latest) InterProScan release,
indexes the HMM models with ``initial_setup.py`` and records the result in the
Galaxy data manager JSON file.
"""

import argparse
import hashlib
import json
import operator
import os
import re
import shutil
import subprocess
import sys
import tarfile

import requests


GH_REPO_API = 'https://api.github.com/repos/ebi-pf-team/interproscan/'
MD5_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz.md5'
DATA_URL = 'http://ftp.ebi.ac.uk/pub/software/unix/iprscan/5/{version}/interproscan-{version}-64-bit.tar.gz'

# For tests: download a smaller archive containing *some* data
PARTIAL_URL = 'https://github.com/ebi-pf-team/interproscan/archive/{version}.tar.gz'

# Release numbers look like "5.52-86.0". The digit counts are left open so
# that e.g. a three-digit data release is still recognised.
VERSION_PATTERN = r"[0-9]+\.[0-9]+-[0-9]+\.[0-9]+"
TAG_RE = re.compile(r"^%s$" % VERSION_PATTERN)
# Content of the ".md5" file: "<32 hex digits><whitespace><tarball name>"
MD5_LINE_RE = re.compile(r"^([a-fA-F\d]{32})\s+interproscan-%s-64-bit\.tar\.gz$" % VERSION_PATTERN)


def _version_key(tag):
    """Return a numeric sort key for a release tag such as ``5.52-86.0``."""
    return tuple(int(part) for part in re.split(r"[.-]", tag))


def _md5sum(path, chunk_size=1024 * 1024):
    """Return the hex MD5 digest of the file at *path*, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def _interproscan_home():
    """Return the directory holding the real ``interproscan.sh`` found in PATH.

    Raises:
        RuntimeError: if ``interproscan.sh`` is not available in PATH.
    """
    launcher = shutil.which("interproscan.sh")
    if launcher is None:
        raise RuntimeError("Could not find 'interproscan.sh' in PATH")
    return os.path.dirname(os.path.realpath(launcher))


def list_tags(url=None):
    """Return the InterProScan release tags found on GitHub, oldest first.

    Args:
        url: page of the GitHub tags API to query; defaults to the first page.
            Following pages are fetched recursively through the ``next`` link.

    Returns:
        A list of tag names matching the release number format, sorted
        numerically so that the last element is the most recent release.

    Raises:
        requests.HTTPError: if the GitHub API answers with an error status
            (e.g. rate limiting), instead of failing later on the error body.
    """
    if not url:
        url = GH_REPO_API + 'tags'

    resp = requests.get(url=url)
    resp.raise_for_status()

    tags = [tag['name'] for tag in resp.json() if TAG_RE.match(tag['name'])]

    if 'next' in resp.links:
        tags += list_tags(resp.links['next']['url'])

    return sorted(tags, key=_version_key)


def download_file(url, dest):
    """Stream the content of *url* into the file *dest*.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(dest, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)


def main():
    """Download, unpack and index InterProScan data, then update the data table.

    Command line arguments: ``--partial`` (small test data set), ``--version``
    (release to fetch, default latest), the data table name and the path of
    the Galaxy data manager JSON file, which is read and then overwritten.
    """
    parser = argparse.ArgumentParser(description='Download data for InterProScan')
    parser.add_argument('--partial', dest='partial', action='store_true', help='Only download a small subset of data (for testing)')
    parser.add_argument('-v', '--version', help='Specify an InterProScan version (default: latest)')
    parser.add_argument("datatable_name")
    parser.add_argument("galaxy_datamanager_filename")

    args = parser.parse_args()

    with open(args.galaxy_datamanager_filename) as fh:
        config = json.load(fh)

    output_directory = config.get("output_data", [{}])[0].get("extra_files_path", None)
    if not output_directory:
        raise RuntimeError("No 'extra_files_path' found in '%s'" % args.galaxy_datamanager_filename)

    data_manager_dict = {"data_tables": config.get("data_tables", {})}
    table_rows = data_manager_dict["data_tables"].setdefault(args.datatable_name, [])

    os.makedirs(output_directory, exist_ok=True)

    all_tags = list_tags()

    if args.version:
        if args.version not in all_tags:
            raise RuntimeError("Version '%s' is not valid" % args.version)
        tag = args.version
    elif all_tags:
        tag = all_tags[-1]
    else:
        raise RuntimeError("No InterProScan release tag found at %s" % GH_REPO_API)

    print("Will download data for InterProScan version: %s" % tag)

    print("Getting MD5 checksum:")
    md5_resp = requests.get(url=MD5_URL.format(version=tag))
    md5_resp.raise_for_status()
    md5 = md5_resp.text
    md5_match = MD5_LINE_RE.match(md5)
    if not md5_match:
        raise RuntimeError("Got invalid MD5 from the InterProScan FTP server: '%s'" % md5)
    print("%s" % md5)

    release_dir = os.path.join(output_directory, 'interproscan-%s' % tag)
    data_dir = os.path.join(output_directory, 'data')

    if args.partial:
        print("Downloading partial data tarball...")
        dest_tar = os.path.join(output_directory, PARTIAL_URL.format(version=tag).split('/')[-1])
        download_file(PARTIAL_URL.format(version=tag), dest_tar)
    else:
        print("Downloading data tarball...")
        dest_tar = os.path.join(output_directory, DATA_URL.format(version=tag).split('/')[-1])
        download_file(DATA_URL.format(version=tag), dest_tar)

        # The published checksum describes the full 64-bit tarball only, so it
        # cannot be compared with the GitHub archive used in partial mode.
        print("Finished, now checking md5...")
        md5_computed = _md5sum(dest_tar)
        if md5_match.group(1).lower() != md5_computed:
            raise RuntimeError("MD5 check failed: computed '%s', expected '%s'" % (md5_computed, md5))

    print("Ok, now extracting data...")
    with tarfile.open(dest_tar, "r:gz") as tar:
        tar.extractall(output_directory)

    if args.partial:
        print("Moving partial data files around...")
        shutil.move(os.path.join(release_dir, 'core/jms-implementation/support-mini-x86-32/data/'), data_dir)
    else:
        print("Moving data files around...")
        # Only the data directory is kept: moving the whole release directory
        # would make the clean-up below fail on a path that no longer exists.
        # NOTE(review): assumes the full tarball has a top-level 'data' directory.
        shutil.move(os.path.join(release_dir, 'data'), data_dir)

    print("Done, removing tarball and unneeded files...")
    os.remove(dest_tar)
    shutil.rmtree(release_dir)

    print("Running initial_setup.py (index hmm models)...")
    ipr_home = _interproscan_home()
    # Write a temp properties file in work dir
    with open(os.path.join(ipr_home, 'interproscan.properties'), 'r') as prop:
        prop_content = prop.read()
    # A callable replacement keeps backslashes in the path from being
    # interpreted as regex escapes.
    prop_content = re.sub(r'^data\.directory=.*$', lambda _m: 'data.directory=%s' % data_dir, prop_content, flags=re.M)
    with open('interproscan.properties', 'w') as prop:
        prop.write(prop_content)
    # Run the index command. Its output goes straight to our stdout/stderr, so
    # flush first to keep the messages in order.
    sys.stdout.flush()
    cmd_args = [os.path.join(ipr_home, 'initial_setup.py')]
    proc = subprocess.run(cmd_args, shell=False)
    if proc.returncode:
        print("Error running initial_setup.py.", file=sys.stderr)
        sys.exit(proc.returncode)

    table_rows.append(
        dict(
            value=tag,
            description="InterProScan %s" % tag,
            interproscan_version=tag,
            path=output_directory,
        )
    )

    print("Saving data table content...")

    table_rows.sort(key=operator.itemgetter("value"), reverse=True)
    with open(args.galaxy_datamanager_filename, "w") as fh:
        json.dump(data_manager_dict, fh, indent=2, sort_keys=True)

    print("Finished.")


if __name__ == "__main__":
    main()
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/interproscan.xml Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,69 @@ +<tool id="data_manager_interproscan" name="InterProScan data manager" version="0.0.1" tool_type="manage_data" profile="20.01"> + <requirements> + <requirement type="package" version="5.52-86.0">interproscan</requirement> + <requirement type="package" version="2.26.0">requests</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/interproscan.py' +$partial_data +--version '$version' +'interproscan' +'${output_file}' + ]]></command> + <inputs> + <param name="partial_data" type="hidden" value="" help="Used for testing"/> + <param name="version" type="text" value="" label="Version to download" help="Leave empty to download the latest version"> + <validator type="regex" message="Version must be a valid InterProScan version (e.g. 5.52-86.0)">^([0-9]+\.[0-9]+-[0-9]+\.[0-9]+)?$</validator> + </param> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <test> + <param name="partial_data" value="--partial"/> + <output name="output_file"> + <assert_contents> + <has_text text="InterProScan 5."/> + <has_text text='"interproscan_version": "5.'/> + </assert_contents> + </output> + <assert_stdout> + <has_text text="Pressed and indexed" /> + <has_text text="Completed indexing the hmm models" /> + </assert_stdout> + </test> + <test> + <param name="partial_data" value="--partial"/> + <param name="version" value="5.51-85.0"/> + <output name="output_file"> + <assert_contents> + <has_text text="InterProScan 5.51-85.0"/> + <has_text text='"interproscan_version": "5.51-85.0'/> + </assert_contents> + </output> + <assert_stdout> + <has_text text="Pressed and indexed" /> + <has_text text="Completed indexing the hmm models" /> + </assert_stdout> + </test> + <test expect_failure="true"> + <param name="partial_data" value="--partial"/> + <param name="version" 
value="xxxx"/> + <assert_stderr> + <has_text text="Version must be a valid InterProScan version" /> + </assert_stderr> + </test> + </tests> + <help><![CDATA[ + This data manager fetches data from the EBI FTP server for the InterProScan + annotation tool and updates the InterProScan data table. + ]]></help> + <citations> + <citation type="doi">10.1093/bioinformatics/btu031</citation> + <citation type="doi">10.7717/peerj.167</citation> + <citation type="doi">10.1093/bioinformatics/17.9.847</citation> + <citation type="doi">10.1093/nar/gki442</citation> + <citation type="doi">10.1093/nar/gkn785</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,19 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/interproscan.xml" id="data_manager_interproscan"> + <data_table name="interproscan"> + <output> + <column name="value" /> + <column name="description" /> + <column name="interproscan_version" /> + <column name="path" output_ref="output_file" > + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">interproscan/${value}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/interproscan/${value}</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/interproscan.loc Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,8 @@ +# this is a tab separated file describing the location of interproscan databases used for the +# interproscan annotation tool +# +# the columns are: +# value description interproscan_version path +# +# for example +# 5.52-86.0 InterProScan 5.52-86.0 5.52-86.0 /tmp/database/interproscan/5.52-86.0/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/interproscan.loc.sample Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,8 @@ +# this is a tab separated file describing the location of interproscan databases used for the +# interproscan annotation tool +# +# the columns are: +# value description interproscan_version path +# +# for example +# 5.52-86.0 InterProScan 5.52-86.0 5.52-86.0 /tmp/database/interproscan/5.52-86.0/
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,6 @@ +<tables> + <table name="interproscan" comment_char="#" allow_duplicate_entries="False"> + <columns>value, description, interproscan_version, path</columns> + <file path="tool-data/interproscan.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Mon Nov 15 17:21:22 2021 +0000 @@ -0,0 +1,6 @@ +<tables> + <table name="interproscan" comment_char="#" allow_duplicate_entries="False"> + <columns>value, description, interproscan_version, path</columns> + <file path="${__HERE__}/test-data/interproscan.loc" /> + </table> +</tables>