Mercurial > repos > iuc > data_manager_packaged_annotation_data
changeset 0:2a546f92b1ba draft default tip
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_packaged_annotation_data commit 339a6c16fb6d944d4e147b5192cbeb0ebd26d18e"
author | iuc |
---|---|
date | Tue, 04 Jan 2022 18:34:18 +0000 |
parents | |
children | |
files | data_manager/install_packaged_annotation_data.py data_manager/install_packaged_annotation_data.xml data_manager_conf.xml test-data/dbkeys.loc test-data/from_test-meta.data_manager.json test-data/packaged_annotation_data.loc test-data/test-meta.yml tool-data/dbkeys.loc.sample tool-data/packaged_annotation_data.loc.sample tool_data_table_conf.xml.sample tool_data_table_conf.xml.test |
diffstat | 11 files changed, 284 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/install_packaged_annotation_data.py Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,150 @@ +#!/usr/bin/env python + +import argparse +import datetime +import json +import os +import re +from urllib.request import urlretrieve + +import yaml + + +class PackagedAnnotationMeta(): + @classmethod + def from_file(cls, fname): + meta = yaml.safe_load(open(fname)) + return cls(meta) + + def __init__(self, meta_dict): + if 'build' not in meta_dict: + meta_dict['build'] = datetime.date.today().isoformat() + if 'volume' not in meta_dict: + meta_dict['volume'] = 1 + + required_meta = ['name', 'build', 'volume', 'refgenome', 'records'] + for key in required_meta: + if not meta_dict.get(key): + raise KeyError( + 'Required info "{0}" missing from metadata' + .format(key) + ) + required_record_meta = ['id', 'name', 'version', 'format', 'source'] + for key in required_record_meta: + for record in meta_dict['records']: + if not record.get(key): + raise KeyError( + '{0}\n' + 'Required info "{1}" missing from record metadata' + .format(record, key) + ) + self.meta = meta_dict + self.meta['id'] = self._get_id() + + def _get_id(self): + components = [ + self.meta['name'], + self.meta['refgenome'], + str(self.meta['volume']), + str(self.meta['build']) + ] + return '__'.join( + [ + re.sub(r'[^a-zA-Z_0-9\-]', '', i.replace(' ', '_')) + for i in components + ] + ) + + def records(self, full_record_names=False): + for record in self.meta['records']: + ret = record.copy() + if full_record_names: + ret['name'] = self._full_record_name(record) + yield ret + + def fullname(self): + return '{0} ({1}, vol:{2}/build:{3})'.format( + self.meta['name'], + self.meta['refgenome'], + self.meta['volume'], + self.meta['build'] + ) + + def _full_record_name(self, record): + return '{0} ({1}, {2}; from {3}/vol:{4}/build:{5})'.format( + record['name'], record['version'], + self.meta['refgenome'], + self.meta['name'], + self.meta['volume'], + 
self.meta['build'] + ) + + def dump(self, fname): + with open(fname, 'w') as fo: + yaml.dump( + self.meta, fo, allow_unicode=False, default_flow_style=False + ) + + +def fetch_data(source_url, target_file): + final_file, headers = urlretrieve(source_url, target_file) + + +def meta_to_dm_records(meta, dbkey=None): + data_table_rows = [] + for record in meta.records(full_record_names=True): + data_table_rows.append( + { + 'value': '{0}:{1}'.format(meta.meta['id'], record['id']), + 'dbkey': dbkey or meta.meta['refgenome'], + 'data_name': record['name'], + 'data_id': record['id'], + 'data_format': record['format'], + 'package_id': meta.meta['id'], + 'package_name': meta.fullname(), + 'path': '{0}/{1}'.format( + meta.meta['volume'], + meta.meta['build'] + ) + } + ) + return data_table_rows + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('metadata') + parser.add_argument( + '-o', '--galaxy-datamanager-json', + required=True + ) + parser.add_argument('-t', '--target-directory', default=None) + parser.add_argument('--dbkey', default=None) + args = parser.parse_args() + + if args.target_directory: + if not os.path.isdir(args.target_directory): + os.mkdir(args.target_directory) + else: + args.target_directory = os.getcwd() + + meta = PackagedAnnotationMeta.from_file(args.metadata) + + for record in meta.records(): + fetch_data( + record['source'], + os.path.join(args.target_directory, record['id']) + ) + + meta.dump(os.path.join(args.target_directory, 'meta.yml')) + + # Finally, we prepare the metadata for the new data table record ... + data_manager_dict = { + 'data_tables': { + 'packaged_annotation_data': meta_to_dm_records(meta, args.dbkey) + } + } + + # ... and save it to the json results file + with open(args.galaxy_datamanager_json, 'w') as fh: + json.dump(data_manager_dict, fh, sort_keys=True)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/install_packaged_annotation_data.xml Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,40 @@ +<tool id="data_manager_packaged_annotation_data" name="Download and install packaged collections of genome annotation data" version="0.1" tool_type="manage_data" profile="20.05"> + <description>fetching</description> + <requirements> + <requirement type="package" version="3.9">python</requirement> + <requirement type="package" version="6.0">pyyaml</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ +python '$__tool_directory__/install_packaged_annotation_data.py' + --target-directory '${out_file.extra_files_path}' + -o '$out_file' + --dbkey '$dbkey' + '$metadata' + ]]></command> + <inputs> + <param name="dbkey" type="genomebuild" + label="DBKEY of genome that the annotation data is for" + help="Take a look at the refgenome value from the metadata file to guide you in your selection." /> + <param name="metadata" type="data" format="txt" label="Metadata describing the package and its contents" /> + </inputs> + <outputs> + <data name="out_file" format="data_manager_json" /> + </outputs> + <tests> + <!-- TODO: need some way to test that new entry was added to data table --> + <test> + <param name="dbkey" value="hg19"/> + <param name="metadata" value="test-meta.yml"/> + <output name="out_file" file="from_test-meta.data_manager.json"/> + </test> + </tests> + <help> +**What it does** + +This tool fetches and installs packages of genome annotation datasets that are +not tightly bound to specific tools, but generic enough to be of use for many +different tools. + +It populates the "packaged_annotation_data" data table. + </help> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,25 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/install_packaged_annotation_data.xml" id="data_manager_packaged_annotation_data" > + <data_table name="packaged_annotation_data"> <!-- Defines a Data Table to be modified. --> + <output> <!-- Handle the output of the Data Manager Tool --> + <column name="value" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="dbkey" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="data_name" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="data_id" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="data_format" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="package_id" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="package_name" /> <!-- columns that are going to be specified by the Data Manager Tool --> + <column name="path" output_ref="out_file" > + <move type="directory" relativize_symlinks="True"> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">packaged_annotation_data/${dbkey}/${package_id}/${path}</target> + </move> + <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/packaged_annotation_data/${dbkey}/${package_id}/${path}/</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers> + +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/dbkeys.loc Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,2 @@ +#<dbkey> <display_name> <len_file_path> +hg19 Human hg19 a_path
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/from_test-meta.data_manager.json Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,1 @@ +{"data_tables": {"packaged_annotation_data": [{"data_format": "bed", "data_id": "hotspots.data", "data_name": "CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build:1)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__1", "package_name": "Cancer variant data (hg19, vol:1/build:1)", "path": "1/1", "value": "Cancer_variant_data__hg19__1__1:hotspots.data"}, {"data_format": "bed", "data_id": "civic.variants", "data_name": "CIViC variants (01-Feb-2019, hg19; from Cancer variant data/vol:1/build:1)", "dbkey": "hg19", "package_id": "Cancer_variant_data__hg19__1__1", "package_name": "Cancer variant data (hg19, vol:1/build:1)", "path": "1/1", "value": "Cancer_variant_data__hg19__1__1:civic.variants"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/packaged_annotation_data.loc Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,3 @@ +#<value> <dbkey> <data_name> <data_id> <data_format> <package_id> <package_name> <path> +# +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/test-meta.yml Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,18 @@ +name: Cancer variant data +build: 1 +refgenome: hg19 +records: + - id: hotspots.data + name: CancerHotspots + version: v2 + doi: 10.1158/2159-8290.CD-17-0321 + format: bed + source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/hotspots.bed + checksum: md5:ec8ec9afd4ae4935ac474e150e4e90aa + - id: civic.variants + name: CIViC variants + version: 01-Feb-2019 + doi: http://dx.doi.org/10.1038/ng.3774 + format: bed + source: https://zenodo.org/api/files/a89ff3af-261e-4c24-a9fb-5050ce8807b2/01-Feb-2019-CIVic.bed + checksum: md5:9e42bb7492be9e0011bf29b7e4f83f41
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/dbkeys.loc.sample Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,1 @@ +#<dbkey> <display_name> <len_file_path>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/packaged_annotation_data.loc.sample Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,20 @@ +#This file describes genome annotation data packages and their contents +#available on the server. +#Such data can consist of any number of individual files in a variety of +#formats (e.g., bed, vcf, tabular) describing any features with respect to the +#genome with the associated dbkey. +#The directory referenced in the <path> column of the table is expected to +#contain the file listed under <data_id> and a meta.yml file with details about +#the annotation package volume and all of its contents. +#This data table has the format (white space characters are TAB characters): +# +#<value> <dbkey> <data_name> <data_id> <data_format> <package_id> <package_name> <path> +# +#So, packaged_annotation_data.loc tables could look like this: +# +#dbSNP__hg19__1__1:dbSNP.tidy hg19 dbSNP tidy (b147.20160601, hg19; from dbSNP/vol:1/build:1) dbSNP.tidy vcf_bgzip dbSNP__hg19__1__1 dbSNP (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/dbSNP/1/1 +#Cancer_variant_data__hg19__1__1:hotspots.data hg19 CancerHotspots (v2, hg19; from Cancer variant data/vol:1/build:1) hotspots.data bed Cancer_variant_data__hg19__1__1 Cancer variant data (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/Cancer_variant_data/1/1 +#Cancer_gene_data__hg19__1__1:civic.genes hg19 CIViC genes (01-Feb-2019, hg19; from Cancer gene data/vol:1/build:1) civic.genes tabular Cancer_gene_data__hg19__1__1 Cancer gene data (hg19, vol:1/build:1) /path/to/packaged_annotation_data/hg19/Cancer_gene_data/1/1 +#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv3 NC_045512.2 ARTIC (v3, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1) ARTICv3 bed6 SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1 SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1) /path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1 
+#SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1:ARTICv4 NC_045512.2 ARTIC (v4, NC_045512.2; from SARS-CoV-2 amplicon primer sets/vol:1/build:1) ARTICv4 bed6 SARS-CoV-2_amplicon_primer_sets__NC_045512.2__1__1 SARS-CoV-2 amplicon primer sets (NC_045512.2, vol:1/build:1) /path/to/packaged_annotation_data/NC_045512.2/SARS-CoV-2_amplicon_primer_sets/1/1 +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Table of installed structured annotation data --> + <table name="packaged_annotation_data" comment_char="#"> + <columns>value, dbkey, data_name, data_id, data_format, package_id, package_name, path</columns> + <file path="tool-data/packaged_annotation_data.loc" /> + </table> + <!-- Locations of dbkeys and len files under genome directory --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="tool-data/dbkeys.loc" /> + </table> +</tables>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.test Tue Jan 04 18:34:18 2022 +0000 @@ -0,0 +1,12 @@ +<tables> + <!-- Table of installed structured annotation data --> + <table name="packaged_annotation_data" comment_char="#"> + <columns>value, dbkey, data_name, data_id, data_format, package_id, package_name, path</columns> + <file path="${__HERE__}/test-data/packaged_annotation_data.loc" /> + </table> + <!-- Locations of dbkeys and len files under genome directory --> + <table name="__dbkeys__" comment_char="#"> + <columns>value, name, len_path</columns> + <file path="${__HERE__}/test-data/dbkeys.loc" /> + </table> +</tables>