Mercurial > repos > charles-bernard > data_manager_build_alfa_indexes
changeset 0:016200d4e379 draft
Uploaded
author | charles-bernard |
---|---|
date | Thu, 27 Oct 2016 05:15:05 -0400 |
parents | |
children | 51e82f7d66c8 |
files | data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml data_manager_build_alfa_indexes/data_manager_conf.xml data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample~ data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample data_manager_build_alfa_indexes/tool_dependencies.xml |
diffstat | 7 files changed, 310 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Thu Oct 27 05:15:05 2016 -0400 @@ -0,0 +1,240 @@ +#!/usr/bin/python + +import sys +import shutil +import re +import urllib2 +import subprocess +import gzip +import os +import tempfile +import logging +from optparse import OptionParser +from galaxy.util.json import from_json_string, to_json_string + + +def get_arg(): + parser = OptionParser() + parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str") + parser.add_option("-o", "--output", dest='output_filename', action="store", nargs = 1, metavar = 'JSON_FILE') + (options, args) = parser.parse_args() + return options, args + +def cleanup_before_exit(tmp_dir): + if tmp_dir and os.path.exists(tmp_dir): + shutil.rmtree(tmp_dir) + +def get_page_content(url): + req = urllib2.Request(url) + page = urllib2.urlopen(req) + return page.read() + + +def download_file(link, local_file_name): + req = urllib2.Request(link) + src_file = urllib2.urlopen(req) + local_file = open(local_file_name, 'wb') + local_file.write(src_file.read()) + local_file.close() + + +def uncompress_gz(gz_file_name, uncompressed_file_name): + logging.info("____________________________________________________________") + logging.info("*** Uncompressing %s" % gz_file_name) + uncompressed_file = open(uncompressed_file_name, 'wb') + with gzip.open(gz_file_name, 'rb') as src_file: + uncompressed_file.write(src_file.read()) + uncompressed_file.close() + logging.info("-> Uncompressed !\n") + + +def add_data_table_entry( data_manager_dict, data_table_entry ): + data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} ) + data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry ) + return data_manager_dict + + +def standardize_species_name(species_name): + 
standard_species_name = re.sub(r'[)]$', '', species_name) + standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name) + return standard_species_name.lower() + + +def get_ensembl_url_root(kingdom): + logging.info("____________________________________________________________") + logging.info("*** Determining Ensembl ftp root url") + if kingdom == 'vertebrates': + root = 'ftp://ftp.ensembl.org/pub/current_gtf/' + else: + root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom + logging.info("-> Determined !\n") + return root + + +def test_ensembl_species_exists(kingdom, url, species_name): + logging.info("____________________________________________________________") + logging.info ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom)) + list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:]) + if kingdom=='vertebrates': + download_file(url, list_species_file_name) + else: + download_file(url + list_species_file_name, list_species_file_name) + + grep_result = subprocess.Popen(['grep', species_name, list_species_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + species_lines_matched, grep_error = grep_result.communicate() + if grep_error != None or species_lines_matched == "": + msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom) + logging.critical(msg) + sys.exit(msg) + + species_lines = species_lines_matched.split('\n') + del species_lines[-1] + nb_lines = len(species_lines) + + if nb_lines == 1: + columns = species_lines[0].split('\t') + found_species_name = columns[1] + if species_name != found_species_name: + logging.info('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name)) + return found_species_name, species_lines_matched + logging.info("-> Referenced !\n") + return species_name, species_lines_matched + else: + list_species = [''] * nb_lines + for i in range(0, nb_lines): + 
columns = species_lines[i].split('\t') + list_species[i] = columns[1] + exact_match = re.search('^%s$' % species_name, list_species[i]) + if exact_match: + logging.info("-> Referenced !\n") + return species_name, species_lines[i] + msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species[0:]) + logging.critical(msg) + sys.exit(msg) + + +def get_ensembl_collection(kingdom, species_line): + logging.info("*** Extracting the %s_collection of the species" % kingdom) + collection_regex = re.compile('%s_.+_collection' % kingdom.lower()) + collection_match = re.search(collection_regex, species_line) + if not collection_match: + logging.info("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom) + return None + logging.info("-> Extracted !\n") + return collection_match.group(0) + + +def get_ensembl_gtf_archive_name(url_dir, species_name): + logging.info("____________________________________________________________") + logging.info("*** Extracting the gtf archive name of %s" % species_name) + gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags = re.IGNORECASE) + dir_content = get_page_content(url_dir) + gtf_archive_match = re.search(gtf_archive_regex, dir_content) + if not gtf_archive_match: + sys.exit('The species is referenced on Ensembl but error of nomenclature led to download failure') + gtf_archive_name = gtf_archive_match.group(0) + logging.info("-> Extracted !\n") + return gtf_archive_name + + +def get_ensembl_gtf_archive(kingdom, url, species_name, species_line): + if kingdom != 'vertebrates': + url = url + 'gtf/' + if kingdom == 'bacteria' or kingdom == 'protists' or kingdom == 'fungi': + collection = get_ensembl_collection(kingdom, species_line) + if collection != None: + url = url + "%s/" % collection + final_url = url + species_name + '/' + gtf_archive_name = 
get_ensembl_gtf_archive_name(final_url, species_name) + logging.info("____________________________________________________________") + logging.info("*** Download the gtf archive of %s" % species_name) + download_file(final_url + gtf_archive_name, gtf_archive_name) + logging.info("-> Downloaded !\n") + return gtf_archive_name + + +def generate_alfa_indexes(path_to_alfa, gtf_file_name): + logging.info("____________________________________________________________") + logging.info("*** Generating alfa indexes from %s" % gtf_file_name) + alfa_result = subprocess.Popen(['python', path_to_alfa + 'ALFA.py', '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + alfa_out, alfa_err = alfa_result.communicate() + if alfa_err != None and not re.search('### End of program', alfa_err): + msg = 'Generation Failed due an alfa error: %s' % (alfa_err) + logging.critical(msg) + sys.exit(msg) + logging.info("-> Generated !\n") + + +def get_data_table_new_entry(gtf_archive_name): + info_list = gtf_archive_name.split('.') + species = info_list[0] + version = info_list[1] + release = info_list[2] + value = '%s_%s_%s' % (species, version, release) + dbkey = value + name = '%s: %s (release %s)' % (species, version, release) + prefix = '%s.%s.%s' % (species, version, release) + entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix } + return entry_dict + + +def main(): + options, args = get_arg() + galaxy_root_dir = args[0] + + path_to_alfa = os.path.join(galaxy_root_dir, 'tools/alfa/') + path_to_tmp_dir = os.path.join(galaxy_root_dir, 'database/tmp/') + + if options.output_filename == None: + msg = 'No json output file specified' + logging.critical(msg) + sys.exit(msg) + output_filename = options.output_filename + params = from_json_string(open(output_filename).read()) + target_directory = params['output_data'][0]['extra_files_path'] + os.mkdir(target_directory) + + tmp_dir = 
tempfile.mkdtemp(prefix='tmp', suffix='', dir=path_to_tmp_dir) + os.chdir(tmp_dir) + log_file_name = 'galaxy_log_report.log' + logging.basicConfig(level=logging.INFO, filename=log_file_name, filemode="a+", format='%(message)s') + data_manager_dict = {} + + if options.ensembl_info: + kingdom, species_name = options.ensembl_info + species_name = standardize_species_name(species_name) + url = get_ensembl_url_root(kingdom) + species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name) + gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line) + data_table_entry = get_data_table_new_entry(gtf_archive_name) + gtf_file_name = '%s.gtf' % data_table_entry['prefix'] + uncompress_gz(gtf_archive_name, gtf_file_name) + generate_alfa_indexes(path_to_alfa, gtf_file_name) + stranded_index_name = '%s.stranded.index' % data_table_entry['prefix'] + unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix'] + add_data_table_entry(data_manager_dict, data_table_entry) + + logging.info("____________________________________________________________") + logging.info("*** General Info") + logging.info("TMP DIR:\t%s" % tmp_dir) + logging.info("TARGET DIR:\t%s" % target_directory) + logging.info("URL ROOT:\t%s" % url) + logging.info("SPECIES:\t%s" % data_table_entry['species']) + logging.info("VERSION:\t%s" % data_table_entry['version']) + logging.info("RELEASE:\t%s" % data_table_entry['release']) + logging.info("VALUE:\t%s" % data_table_entry['value']) + logging.info("DBKEY:\t%s" % data_table_entry['dbkey']) + logging.info("NAME:\t%s" % data_table_entry['name']) + logging.info("PREFIX:\t%s" % data_table_entry['prefix']) + logging.info("____________________________________________________________") + logging.info("*** Intial dictionary") + logging.info("%s" % params) + + shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name)) + shutil.copyfile(unstranded_index_name, 
os.path.join(target_directory, unstranded_index_name)) + shutil.copyfile(log_file_name, os.path.join(target_directory, log_file_name)) + + cleanup_before_exit(tmp_dir) + + open(output_filename, 'wb').write(to_json_string(data_manager_dict)) +main() \ No newline at end of file
<tool id="build_alfa_indexes" name="ALFA indexes" version="0.0.1" tool_type="manage_data">
    <description>build ALFA indexes from automatically downloaded gtf annotation file</description>
    <command interpreter="python">data_manager_build_alfa_indexes.py -e "${reference_source['kingdom']}" "${reference_source['species_name']}" -o "${out_file}" $__root_dir__ </command>
    <inputs>
        <conditional name="reference_source">
            <param name="reference_source_selector" type="select" label="Choose the source for the reference genome">
                <option value="ensembl" selected="True">Ensembl Genomes Websites</option>
            </param>
            <when value="ensembl">
                <!-- Fix: the original marked every option selected="True";
                     only the default (vertebrates) may carry it. -->
                <param name="kingdom" type="select" label="Select the ensembl website where to fetch the genome">
                    <option value="vertebrates" selected="True">Ensembl (Vertebrates)</option>
                    <option value="bacteria">Ensembl Bacteria</option>
                    <option value="fungi">Ensembl Fungi</option>
                    <option value="metazoa">Ensembl Metazoa</option>
                    <option value="plants">Ensembl Plants</option>
                    <option value="protists">Ensembl Protists</option>
                </param>
                <param name="species_name" type="text" value="Homo sapiens" label="Complete Species_Name" optional="False">
                    <validator type="empty_field" message="Please, enter a species name."/>
                </param>
            </when>
        </conditional>
    </inputs>
    <outputs>
        <data name="out_file" format="data_manager_json"/>
    </outputs>
</tool>
<?xml version="1.0"?>
<data_managers>
    <data_manager tool_file="data_manager/data_manager_build_alfa_indexes.xml" id="build_alfa_indexes">
        <data_table name="alfa_indexes">
            <output>
                <column name="species" />
                <column name="version" />
                <column name="release" />
                <column name="value" />
                <column name="dbkey" />
                <column name="name" />
                <column name="prefix" output_ref="out_file">
                    <move type="directory">
                        <!-- <source>{prefix}</source> - out_file.extra_files_path is used as base by default -->
                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">alfa_indexes/${dbkey}</target>
                    </move>
                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/alfa_indexes/${dbkey}</value_translation>
                    <value_translation type="function">abspath</value_translation>
                </column>
            </output>
        </data_table>
    </data_manager>
</data_managers>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample Thu Oct 27 05:15:05 2016 -0400 @@ -0,0 +1,2 @@ +#<species> <version> <release> <value> <dbkey> <name> <prefix> +#Dictyostelium_discoideum dicty_2 7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum_dicty_2_7 Dictyostelium_discoideum: dicty_2 (release 7) <path_to_dicty_indexes_dir>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample~ Thu Oct 27 05:15:05 2016 -0400 @@ -0,0 +1,5 @@ +#<species> <version> <release> <value> <dbkey> <name> <prefix> +#Arabidopsis_thaliana TAIR10 30 Arabidopsis_t_TAIR10_30 Arabidopsis_t_TAIR10_30 Arabidopsis thaliana: TAIR10 <path_to_alfa_indexes>/Arabidopsis_thaliana.TAIR10.30 +#Drosophila_melanogaster dm6 30 Drosophila_m_dm6_30 Drosophila_m_dm6_30 Drosophila melanogaster: dm6 <path_to_alfa_indexes>/Drosophila_melanogaster.BDGP6.30 +#Homo_sapiens v38 82 Homo_s_v38_82 Homo_s_v38_82 Homo sapiens: v38 <path_to_alfa_indexes>/Homo_sapiens.GRCh38.82 +#Mus_musculus v38 83 Mus_m_v38_83 Mus_m_v38_83 Mus musculus: v38 <path_to_alfa_indexes>/Mus_musculus.GRCm38.83.chr
<tables>
    <!-- Locations of all alfa indexes -->
    <table name="alfa_indexes" comment_char="#" allow_duplicate_entries="False">
        <columns>species, version, release, value, dbkey, name, prefix</columns>
        <file path="tool-data/alfa_indexes.loc" />
    </table>
</tables>
<tool_dependency>
    <!-- Pulls the 'alfa' package (ALFA.py) from the test tool shed. -->
    <package name="alfa" version="0.1.0">
        <repository toolshed="http://testtoolshed.g2.bx.psu.edu" name="alfa" owner="charles-bernard" changeset_revision="c7279df3f7ba" />
    </package>
</tool_dependency>