Mercurial > repos > iuc > data_manager_fetch_refseq
changeset 0:8b91891ae805 draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_refseq commit a572f1f01161527ff6ed4af05bb2e073a8ca903b
author | iuc |
---|---|
date | Mon, 01 Oct 2018 15:36:01 -0400 |
parents | |
children | 937f167631b1 |
files | data_manager/fetch_refseq.py data_manager/fetch_refseq.xml data_manager_conf.xml test-data/plastid.json tool-data/all_fasta.loc.sample tool_data_table_conf.xml.sample |
diffstat | 6 files changed, 299 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/fetch_refseq.py Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,165 @@ +#!/usr/bin/env python + +from __future__ import division, print_function + +import argparse +import functools +import gzip +import json +import os +import os.path +import sys +from datetime import date +from multiprocessing import Process, Queue + +import requests + +try: + from io import StringIO +except ImportError: + from StringIO import StringIO +# Refseq structure +# - Release number +# - Divisions +# 1. archea +# 2. bacteria +# 3. fungi +# 4. invertebrate +# 5. mitochondrion +# 6. other +# 7. plant +# 8. plasmid +# 9. plastid +# 10. protozoa +# 11. vertebrate mammalian +# 12. vertebrate other +# 13. viral +# within each division +# DIVNAME.\d+(.\d+)?.(genomic|protein|rna).(fna|gbff|faa|gpff).gz +# where fna and faa are FASTA, gbff and gpff are Genbank + + +def _add_data_table_entry(data_manager_dict, data_table_entry, data_table_name): + data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {}) + data_manager_dict['data_tables'][data_table_name] = data_manager_dict['data_tables'].get('all_fasta', []) + data_manager_dict['data_tables'][data_table_name].append(data_table_entry) + return data_manager_dict + + +def unzip_to(conn, out_dir, output_filename, chunk_size=4096, debug=False, compress=False): + input_filename = conn.get() + if compress: + open_output = gzip.open + else: + open_output = open + with open_output(os.path.join(out_dir, output_filename), 'wb') as output_file: + while input_filename != 'STOP': + if debug: + print('Reading', input_filename, file=sys.stderr) + with gzip.open(input_filename, 'rb') as input_file: + read_chunk = functools.partial(input_file.read, (chunk_size)) + for data in iter(read_chunk, b''): # use b'' as a sentinel to stop the loop. note '' != b'' in Python 3 + output_file.write(data) + os.unlink(input_filename) + input_filename = conn.get() + + +def get_refseq_division(division_name, mol_types, output_directory, debug=False, compress=False): + base_url = 'https://ftp.ncbi.nlm.nih.gov/refseq/release/' + valid_divisions = set(['archea', 'bacteria', 'complete', 'fungi', 'invertebrate', 'mitochondrion', 'other', + 'plant', 'plasmid', 'plastid', 'protozoa', 'vertebrate_mammalian', 'vertebrate_other', 'viral']) + ending_mappings = { + 'genomic': '.genomic.fna.gz', + 'protein': '.protein.faa.gz', + 'rna': 'rna.fna.gz' + } + assert division_name in valid_divisions, "Unknown division name ({})".format(division_name) + for mol_type in mol_types: + assert mol_type in ending_mappings, "Unknown molecule type ({})".format(mol_type) + if not os.path.exists(output_directory): + os.mkdir(output_directory) + release_num_file = base_url + 'RELEASE_NUMBER' + r = requests.get(release_num_file) + release_num = r.text.strip() + division_base_url = base_url + division_name + if debug: + print('Retrieving {}'.format(division_base_url), file=sys.stderr) + r = requests.get(division_base_url) + listing_text = r.text + + unzip_queues = {} + unzip_processes = [] + final_output_filenames = [] + for mol_type in mol_types: + q = unzip_queues[mol_type] = Queue() + output_filename = division_name + '.' + release_num + '.' + mol_type + '.fasta' + if compress: + output_filename += '.gz' + final_output_filenames.append(output_filename) + unzip_processes.append(Process(target=unzip_to, args=(q, output_directory, output_filename), + kwargs=dict(debug=debug, compress=compress))) + unzip_processes[-1].start() + + # sample line: <a href="vertebrate_other.86.genomic.gbff.gz">vertebrate_other.86.genomic.gbff.gz</a> 2018-07-13 00:59 10M + for line in StringIO(listing_text): + if '.gz' not in line: + continue + parts = line.split('"') + assert len(parts) == 3, "Unexpected line format: {}".format(line.rstrip()) + filename = parts[1] + for mol_type in mol_types: + ending = ending_mappings[mol_type] + if filename.endswith(ending): + if debug: + print('Downloading:', filename, ending, mol_type, file=sys.stderr) + output_filename = os.path.join(output_directory, filename) + with open(output_filename, 'wb') as output_file: + r = requests.get(division_base_url + '/' + filename) + for chunk in r.iter_content(chunk_size=4096): + output_file.write(chunk) + conn = unzip_queues[mol_type] + conn.put(output_filename) + + for mol_type in mol_types: + conn = unzip_queues[mol_type] + conn.put('STOP') + + return [release_num, final_output_filenames] + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Download RefSeq databases') + parser.add_argument('--debug', default=False, action='store_true', help='Print debugging output to stderr (verbose)') + parser.add_argument('--compress', default=False, action='store_true', help='Compress output files') + parser.add_argument('--output_directory', default='tmp', help='Directory to write output to') + parser.add_argument('--galaxy_datamanager_filename', help='Galaxy JSON format file describing data manager inputs') + parser.add_argument('--division_names', help='RefSeq divisions to download') + parser.add_argument('--mol_types', help='Molecule types (genomic, rna, protein) to fetch') + parser.add_argument('--pin_date', help='Force download date to this version string') + args = parser.parse_args() + + division_names = args.division_names.split(',') + mol_types = args.mol_types.split(',') + if args.galaxy_datamanager_filename is not None: + dm_opts = json.loads(open(args.galaxy_datamanager_filename).read()) + output_directory = dm_opts['output_data'][0]['extra_files_path'] # take the extra_files_path of the first output parameter + data_manager_dict = {} + else: + output_directory = args.output_directory + for division_name in division_names: + if args.pin_date is not None: + today_str = args.pin_date + else: + today_str = date.today().strftime('%Y-%m-%d') # ISO 8601 date format + [release_num, fasta_files] = get_refseq_division(division_name, mol_types, output_directory, args.debug, args.compress) + if args.galaxy_datamanager_filename is not None: + for i, mol_type in enumerate(mol_types): + assert mol_type in fasta_files[i], "Filename does not contain expected mol_type ({}, {})".format(mol_type, fasta_files[i]) + unique_key = 'refseq_' + division_name + '.' + release_num + '.' + mol_type # note: this is now same as dbkey + dbkey = unique_key + desc = 'RefSeq ' + division_name + ' Release ' + release_num + ' ' + mol_type + ' (' + today_str + ')' + path = os.path.join(output_directory, fasta_files[i]) + _add_data_table_entry(data_manager_dict=data_manager_dict, + data_table_entry=dict(value=unique_key, dbkey=dbkey, name=desc, path=path), + data_table_name='all_fasta') + open(args.galaxy_datamanager_filename, 'w').write(json.dumps(data_manager_dict, sort_keys=True))
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager/fetch_refseq.xml Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,88 @@ +<tool id="data_manager_fetch_refseq" name="RefSeq data manager" version="0.0.18" tool_type="manage_data"> + <description>Fetch FASTA data from NCBI RefSeq and update all_fasta data table</description> + <requirements> + <requirement type="package">python</requirement> + <requirement type="package">requests</requirement> + </requirements> + <command detect_errors="exit_code"><![CDATA[ + python $__tool_directory__/fetch_refseq.py + #if str( $advanced.advanced_selector ) == 'advanced': + '${advanced.compress}' + #end if + --galaxy_datamanager_filename '${output_file}' + --division_names ${division_names} + --mol_types ${mol_types} + #if str( $pin_date ) != 'NO': + --pin_date '${pin_date}' + #end if + ]]></command> + <inputs> + <param argument="division_names" type="select" label="RefSeq division" multiple="true"> + <option value="archea">Archea</option> + <option value="bacteria">Bacteria</option> + <option value="complete">Complete</option> + <option value="fungi">Fungi</option> + <option value="invertebrate">Invertebrate</option> + <option value="mitochondrion">Mitochondrion</option> + <option value="other">Other</option> + <option value="plant">Plant</option> + <option value="plasmid">Plasmid</option> + <option value="plastid">Plastid</option> + <option value="protozoa">Protozoa</option> + <option value="vertebrate_mammalian">Mammalian Vertebrate</option> + <option value="vertebrate_other">Other Vertebrate</option> + <option value="viral">Viral</option> + </param> + <param argument="mol_types" type="select" multiple="true" label="Molecule type" help="Select at least one of genomic, protein or rna sequence"> + <option value="protein">Protein</option> + <option value="genomic">Genomic (DNA)</option> + <option value="rna">RNA</option> + </param> + <conditional name="advanced"> + <param name="advanced_selector" type="select" label="Advanced Options"> + <option value="basic" selected="True">Basic</option> + <option value="advanced">Advanced</option> + </param> + <when value="basic"> + </when> + <when value="advanced"> + <param type="boolean" argument="--compress" truevalue="--compress" falsevalue="" label="Compress FASTA files" + help="Compress downloaded FASTA files (with gzip). Limits compatibility with tools expecting uncompressed FASTA."/> + </when> + </conditional> + <param argument="--pin_date" type="hidden" value="NO" help="Used for testing"/> + </inputs> + <outputs> + <data name="output_file" format="data_manager_json"/> + </outputs> + <tests> + <test> + <param name="division_names" value="plastid"/> + <param name="mol_types" value="protein"/> + <param name="pin_date" value="2018-03-14"/> + <param name="advanced_selector" value="basic"/> + <output name="output_file"> + <assert_contents> + <has_text text="2018-03-14"/> + <has_text text="refseq_plastid"/> + <has_text text="/refseq_plastid."/> + </assert_contents> + </output> + </test> + </tests> + <help><![CDATA[ +This data manager fetches FASTA format collections of proteins, nucleotides (genomic DNA) and RNA +from NCBI's RefSeq_ data collection. + +RefSeq is released every two months and consists of a number of divisions. Some sequences are shared +between multiple divisions. This data manager allows the Galaxy administrator to select which +divisions and which molecule types within each division to download. Once downloaded the +files are made accessible by adding an entry into the *all_fasta* data table. + +.. _RefSeq: https://www.ncbi.nlm.nih.gov/refseq/ + ]]> + </help> + <citations> + <citation type="doi">10.1093/nar/gkv1189</citation> + </citations> +</tool>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_conf.xml Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,20 @@ +<?xml version="1.0"?> +<data_managers> + <data_manager tool_file="data_manager/fetch_refseq.xml" id="fetch_genome_fetch_refseq"> + <data_table name="all_fasta"> + <output> + <column name="value" /> + <column name="dbkey" /> + <column name="name" /> + <column name="path" output_ref="output_file" > + <move type="file"> + <source>${path}</source> + <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</target> + </move> + <value_translation>refseq/#echo str($dbkey).split('.')[1]#/${value}.fasta</value_translation> + <value_translation type="function">abspath</value_translation> + </column> + </output> + </data_table> + </data_manager> +</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/plastid.json Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,1 @@ +{"data_tables": {"all_fasta": [{"path": "tmp/plastid.89.protein.fasta.gz", "dbkey": "plastid.89.protein", "name": "RefSeq plastid Release 89 protein (2018-09-07)", "value": "plastid.89.protein.2018-03-14"}]}} \ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool-data/all_fasta.loc.sample Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,18 @@ +#This file lists the locations and dbkeys of all the fasta files +#under the "genome" directory (a directory that contains a directory +#for each build). The script extract_fasta.py will generate the file +#all_fasta.loc. This file has the format (white space characters are +#TAB characters): +# +#<unique_build_id> <dbkey> <display_name> <file_path> +# +#So, all_fasta.loc could look something like this: +# +#apiMel3 apiMel3 Honeybee (Apis mellifera): apiMel3 /path/to/genome/apiMel3/apiMel3.fa +#hg19canon hg19 Human (Homo sapiens): hg19 Canonical /path/to/genome/hg19/hg19canon.fa +#hg19full hg19 Human (Homo sapiens): hg19 Full /path/to/genome/hg19/hg19full.fa +# +#Your all_fasta.loc file should contain an entry for each individual +#fasta file. So there will be multiple fasta files for each build, +#such as with hg19 above. +#
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tool_data_table_conf.xml.sample Mon Oct 01 15:36:01 2018 -0400 @@ -0,0 +1,7 @@ +<tables> + <!-- Locations of all fasta files under genome directory --> + <table name="all_fasta" comment_char="#"> + <columns>value, dbkey, name, path</columns> + <file path="tool-data/all_fasta.loc" /> + </table> +</tables> \ No newline at end of file