# HG changeset patch
# User charles-bernard
# Date 1477559705 14400
# Node ID 016200d4e379962d5b48e00dcaebcd130b2b179a
Uploaded
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.py Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,240 @@
+#!/usr/bin/python
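+"""Galaxy data manager that downloads a gtf annotation from Ensembl for a
+given kingdom and species, builds the ALFA indexes from it, and registers
+them in the 'alfa_indexes' tool data table."""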
+
+import sys
+import shutil
+import re
+import urllib2
+import subprocess
+import gzip
+import os
+import tempfile
+import logging
+from optparse import OptionParser
+from galaxy.util.json import from_json_string, to_json_string
+
+
+def get_arg():
+ parser = OptionParser()
+ parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str")
+ parser.add_option("-o", "--output", dest='output_filename', action="store", nargs = 1, metavar = 'JSON_FILE')
+ (options, args) = parser.parse_args()
+ return options, args
+
+def cleanup_before_exit(tmp_dir):
+ if tmp_dir and os.path.exists(tmp_dir):
+ shutil.rmtree(tmp_dir)
+
+def get_page_content(url):
+ req = urllib2.Request(url)
+ page = urllib2.urlopen(req)
+ return page.read()
+
+
+def download_file(link, local_file_name):
+ req = urllib2.Request(link)
+ src_file = urllib2.urlopen(req)
+    with open(local_file_name, 'wb') as local_file:
+        local_file.write(src_file.read())
+
+
+def uncompress_gz(gz_file_name, uncompressed_file_name):
+ logging.info("____________________________________________________________")
+ logging.info("*** Uncompressing %s" % gz_file_name)
+    with gzip.open(gz_file_name, 'rb') as src_file, open(uncompressed_file_name, 'wb') as uncompressed_file:
+        uncompressed_file.write(src_file.read())
+    logging.info("-> Uncompressed!\n")
+
+
+def add_data_table_entry( data_manager_dict, data_table_entry ):
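+    # Register the entry under the 'alfa_indexes' data table; dict.get keeps
+    # any existing entry, so only the first entry added is retained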
+ data_manager_dict['data_tables'] = data_manager_dict.get( 'data_tables', {} )
+ data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get( 'alfa_indexes', data_table_entry )
+ return data_manager_dict
+
+
+def standardize_species_name(species_name):
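+    # Ensembl ftp directories use lowercase names joined by underscores,
+    # e.g. 'Saccharomyces cerevisiae' -> 'saccharomyces_cerevisiae'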
+ standard_species_name = re.sub(r'[)]$', '', species_name)
+ standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name)
+ return standard_species_name.lower()
+
+
+def get_ensembl_url_root(kingdom):
+ logging.info("____________________________________________________________")
+ logging.info("*** Determining Ensembl ftp root url")
+ if kingdom == 'vertebrates':
+ root = 'ftp://ftp.ensembl.org/pub/current_gtf/'
+ else:
+ root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
+ logging.info("-> Determined !\n")
+ return root
+
+
+def test_ensembl_species_exists(kingdom, url, species_name):
+ logging.info("____________________________________________________________")
+ logging.info ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
+ list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
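+    # For vertebrates the ftp directory listing itself is downloaded and
+    # grepped; other kingdoms provide a species_Ensembl<Kingdom>.txt summary file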
+    if kingdom == 'vertebrates':
+ download_file(url, list_species_file_name)
+ else:
+ download_file(url + list_species_file_name, list_species_file_name)
+
+    grep_result = subprocess.Popen(['grep', species_name, list_species_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    species_lines_matched, _ = grep_result.communicate()
+    # stderr is merged into stdout, so communicate() cannot report errors here;
+    # rely on grep's exit status (1 = no match, 2 = error) and on empty output
+    if grep_result.returncode != 0 or species_lines_matched == "":
+        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
+        logging.critical(msg)
+        sys.exit(msg)
+
+ species_lines = species_lines_matched.split('\n')
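+    # splitting on '\n' leaves a trailing empty string; drop it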
+ del species_lines[-1]
+ nb_lines = len(species_lines)
+
+    if nb_lines == 1:
+        columns = species_lines[0].split('\t')
+        found_species_name = columns[1]
+        if species_name != found_species_name:
+            logging.info('-> \'%s\' has been replaced with the complete species name \'%s\'' % (species_name, found_species_name))
+            return found_species_name, species_lines_matched
+        logging.info("-> Referenced!\n")
+        return species_name, species_lines_matched
+    else:
+        list_species = [''] * nb_lines
+        for i in range(0, nb_lines):
+            columns = species_lines[i].split('\t')
+            list_species[i] = columns[1]
+            if species_name == list_species[i]:
+                logging.info("-> Referenced!\n")
+                return species_name, species_lines[i]
+        msg = 'The string \'%s\' was matched against the list of Ensembl species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, '\n'.join(list_species))
+        logging.critical(msg)
+        sys.exit(msg)
+
+
+def get_ensembl_collection(kingdom, species_line):
+ logging.info("*** Extracting the %s_collection of the species" % kingdom)
+ collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
+ collection_match = re.search(collection_regex, species_line)
+ if not collection_match:
+ logging.info("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom)
+ return None
+ logging.info("-> Extracted !\n")
+ return collection_match.group(0)
+
+
+def get_ensembl_gtf_archive_name(url_dir, species_name):
+ logging.info("____________________________________________________________")
+ logging.info("*** Extracting the gtf archive name of %s" % species_name)
+    gtf_archive_regex = re.compile(r'%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags=re.IGNORECASE)
+ dir_content = get_page_content(url_dir)
+ gtf_archive_match = re.search(gtf_archive_regex, dir_content)
+    if not gtf_archive_match:
+        sys.exit('The species is referenced on Ensembl, but a nomenclature mismatch prevented the gtf archive from being found')
+    gtf_archive_name = gtf_archive_match.group(0)
+    logging.info("-> Extracted!\n")
+ return gtf_archive_name
+
+
+def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
+ if kingdom != 'vertebrates':
+ url = url + 'gtf/'
+        if kingdom in ('bacteria', 'protists', 'fungi'):
+            collection = get_ensembl_collection(kingdom, species_line)
+            if collection is not None:
+                url = url + "%s/" % collection
+ final_url = url + species_name + '/'
+ gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
+ logging.info("____________________________________________________________")
+ logging.info("*** Download the gtf archive of %s" % species_name)
+ download_file(final_url + gtf_archive_name, gtf_archive_name)
+ logging.info("-> Downloaded !\n")
+ return gtf_archive_name
+
+
+def generate_alfa_indexes(path_to_alfa, gtf_file_name):
+ logging.info("____________________________________________________________")
+ logging.info("*** Generating alfa indexes from %s" % gtf_file_name)
+    alfa_result = subprocess.Popen(['python', path_to_alfa + 'ALFA.py', '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    alfa_out, _ = alfa_result.communicate()
+    # stderr is merged into stdout, so ALFA's messages end up in alfa_out;
+    # success is detected from ALFA's end-of-run marker
+    if alfa_result.returncode != 0 or not re.search('### End of program', alfa_out):
+        msg = 'Generation failed due to an ALFA error: %s' % alfa_out
+        logging.critical(msg)
+        sys.exit(msg)
+    logging.info("-> Generated!\n")
+
+
+def get_data_table_new_entry(gtf_archive_name):
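+    # Archive names follow 'Species.version.release.gtf.gz', e.g.
+    # 'Homo_sapiens.GRCh38.83.gtf.gz' -> value/dbkey 'Homo_sapiens_GRCh38_83',
+    # name 'Homo_sapiens: GRCh38 (release 83)', prefix 'Homo_sapiens.GRCh38.83'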
+ info_list = gtf_archive_name.split('.')
+ species = info_list[0]
+ version = info_list[1]
+ release = info_list[2]
+ value = '%s_%s_%s' % (species, version, release)
+ dbkey = value
+ name = '%s: %s (release %s)' % (species, version, release)
+ prefix = '%s.%s.%s' % (species, version, release)
+ entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix }
+ return entry_dict
+
+
+def main():
+ options, args = get_arg()
+ galaxy_root_dir = args[0]
+
+ path_to_alfa = os.path.join(galaxy_root_dir, 'tools/alfa/')
+ path_to_tmp_dir = os.path.join(galaxy_root_dir, 'database/tmp/')
+
+    if options.output_filename is None:
+ msg = 'No json output file specified'
+ logging.critical(msg)
+ sys.exit(msg)
+ output_filename = options.output_filename
+ params = from_json_string(open(output_filename).read())
+ target_directory = params['output_data'][0]['extra_files_path']
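+    # Galaxy pre-fills the output JSON with the run parameters; the generated
+    # indexes must be written to the dataset's extra_files_path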
+ os.mkdir(target_directory)
+
+ tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='', dir=path_to_tmp_dir)
+ os.chdir(tmp_dir)
+ log_file_name = 'galaxy_log_report.log'
+ logging.basicConfig(level=logging.INFO, filename=log_file_name, filemode="a+", format='%(message)s')
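+    # the report is logged inside tmp_dir and copied to the target directory
+    # along with the indexes at the end of the run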
+ data_manager_dict = {}
+
+ if options.ensembl_info:
+ kingdom, species_name = options.ensembl_info
+ species_name = standardize_species_name(species_name)
+ url = get_ensembl_url_root(kingdom)
+ species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
+ gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)
+ data_table_entry = get_data_table_new_entry(gtf_archive_name)
+ gtf_file_name = '%s.gtf' % data_table_entry['prefix']
+ uncompress_gz(gtf_archive_name, gtf_file_name)
+ generate_alfa_indexes(path_to_alfa, gtf_file_name)
+ stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
+ unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
+ add_data_table_entry(data_manager_dict, data_table_entry)
+    else:
+        msg = 'No Ensembl kingdom and species name specified with the -e option; nothing to do'
+        logging.critical(msg)
+        sys.exit(msg)
+
+ logging.info("____________________________________________________________")
+ logging.info("*** General Info")
+ logging.info("TMP DIR:\t%s" % tmp_dir)
+ logging.info("TARGET DIR:\t%s" % target_directory)
+ logging.info("URL ROOT:\t%s" % url)
+ logging.info("SPECIES:\t%s" % data_table_entry['species'])
+ logging.info("VERSION:\t%s" % data_table_entry['version'])
+ logging.info("RELEASE:\t%s" % data_table_entry['release'])
+ logging.info("VALUE:\t%s" % data_table_entry['value'])
+ logging.info("DBKEY:\t%s" % data_table_entry['dbkey'])
+ logging.info("NAME:\t%s" % data_table_entry['name'])
+ logging.info("PREFIX:\t%s" % data_table_entry['prefix'])
+ logging.info("____________________________________________________________")
+ logging.info("*** Intial dictionary")
+ logging.info("%s" % params)
+
+ shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
+ shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
+ shutil.copyfile(log_file_name, os.path.join(target_directory, log_file_name))
+
+ cleanup_before_exit(tmp_dir)
+
+    with open(output_filename, 'wb') as output_file:
+        output_file.write(to_json_string(data_manager_dict))
+
+
+if __name__ == '__main__':
+    main()
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes.xml Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,27 @@
+ build ALFA indexes from automatically downloaded gtf annotation file
+ data_manager_build_alfa_indexes.py -e "${reference_source['kingdom']}" "${reference_source['species_name']}" -o "${out_file}" $__root_dir__
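+<!-- $__root_dir__ is passed so the script can locate tools/alfa/ and database/tmp/ under the Galaxy root -->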
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/data_manager_conf.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/data_manager_conf.xml Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,23 @@
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/tool-data/alfa_indexes.loc.sample Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,2 @@
+#species	version	release	value	dbkey	name	prefix
+#Dictyostelium_discoideum	dicty_2	7	Dictyostelium_discoideum_dicty_2_7	Dictyostelium_discoideum_dicty_2_7	Dictyostelium_discoideum: dicty_2 (release 7)	Dictyostelium_discoideum.dicty_2.7
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/tool_data_table_conf.xml.sample Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,7 @@
+<tables>
+    <table name="alfa_indexes" comment_char="#">
+        <columns>species, version, release, value, dbkey, name, prefix</columns>
+        <file path="tool-data/alfa_indexes.loc" />
+    </table>
+</tables>
diff -r 000000000000 -r 016200d4e379 data_manager_build_alfa_indexes/tool_dependencies.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/data_manager_build_alfa_indexes/tool_dependencies.xml Thu Oct 27 05:15:05 2016 -0400
@@ -0,0 +1,6 @@