0
|
1 #!/usr/bin/python
|
|
2
|
|
3 import sys
|
|
4 import shutil
|
|
5 import re
|
|
6 import urllib2
|
|
7 import subprocess
|
|
8 import gzip
|
|
9 import os
|
|
10 import tempfile
|
|
11 import logging
|
|
12 from optparse import OptionParser
|
|
13 from galaxy.util.json import from_json_string, to_json_string
|
|
14
|
|
def get_arg():
    """Parse the command line of the data manager.

    Recognised options:
      -e/--ensembl KINGDOM SPECIES_NAME   two strings, stored as a tuple in
                                          ``options.ensembl_info``
      -o/--output JSON_FILE               stored in ``options.output_filename``
      --log log_report                    stored in ``options.log_filename``

    Returns:
        (options, args): the optparse values object and the list of the
        remaining positional arguments.
    """
    parser = OptionParser()
    # optparse (unlike argparse) expects a string metavar: a tuple would be
    # rendered as its repr in the --help output.
    parser.add_option(
        "-e", "--ensembl",
        dest='ensembl_info',
        action="store",
        nargs=2,
        metavar="KINGDOM SPECIES_NAME",
        type="string",
        help="kingdom and species name to look up",
    )
    parser.add_option(
        "-o", "--output",
        dest='output_filename',
        action="store",
        nargs=1,
        metavar='JSON_FILE',
        help="json file read for the parameters and rewritten with the result",
    )
    parser.add_option(
        "--log",
        dest='log_filename',
        action="store",
        nargs=1,
        metavar='log_report',
        help="log report file name",
    )
    (options, args) = parser.parse_args()
    return options, args
|
|
22
|
|
def cleanup_before_exit(tmp_dir):
    """Delete the directory tree *tmp_dir* when it is set and present on disk."""
    if not tmp_dir:
        # Nothing was created (None or empty path): nothing to remove.
        return
    if not os.path.exists(tmp_dir):
        return
    shutil.rmtree(tmp_dir)
|
|
26
|
|
def get_page_content(url):
    """Open *url* and return everything it serves.

    Args:
        url: address understood by ``urllib2.urlopen``.

    Returns:
        The complete response body, as returned by ``read()``.
    """
    page = urllib2.urlopen(urllib2.Request(url))
    try:
        return page.read()
    finally:
        # The handle is not a context manager under Python 2, so it is closed
        # explicitly, including when read() raises.
        page.close()
|
|
31
|
|
32
|
|
def download_file(link, local_file_name):
    """Download the resource at *link* into the file *local_file_name*.

    Args:
        link: address understood by ``urllib2.urlopen``.
        local_file_name: path of the file to (over)write, opened in binary mode.
    """
    src_file = urllib2.urlopen(urllib2.Request(link))
    try:
        with open(local_file_name, 'wb') as local_file:
            # Stream by chunks instead of holding the whole download in memory.
            shutil.copyfileobj(src_file, local_file)
    finally:
        # Close the remote handle even when the local write fails.
        src_file.close()
|
|
39
|
|
def uncompress_gz(gz_file_name, uncompressed_file_name):
    """Decompress the gzip file *gz_file_name* into *uncompressed_file_name*.

    Progress messages are printed on stdout before and after the operation.

    Args:
        gz_file_name: path of the gzip archive to read.
        uncompressed_file_name: path of the file to (over)write.
    """
    print("____________________________________________________________")
    print("*** Uncompressing %s" % gz_file_name)
    # Both handles are released even if decompression fails, and the data is
    # streamed by chunks rather than loaded entirely in memory.
    with open(uncompressed_file_name, 'wb') as uncompressed_file:
        with gzip.open(gz_file_name, 'rb') as src_file:
            shutil.copyfileobj(src_file, uncompressed_file)
    print("-> Uncompressed !\n")
|
0
|
48
|
|
def add_data_table_entry( data_manager_dict, data_table_entry ):
    """Register *data_table_entry* under ``data_tables -> alfa_indexes``.

    The ``data_tables`` mapping is created when missing. An ``alfa_indexes``
    value that is already present is kept as is; otherwise it is set to
    *data_table_entry*. The (mutated) *data_manager_dict* is returned.
    """
    tables = data_manager_dict.setdefault( 'data_tables', {} )
    tables.setdefault( 'alfa_indexes', data_table_entry )
    return data_manager_dict
|
|
53
|
|
def standardize_species_name(species_name):
    """Return *species_name* lower-cased with its separators turned into '_'.

    A closing parenthesis at the very end is dropped first; then every run of
    separator characters (with any spaces following it) becomes one underscore.
    """
    without_final_paren = re.sub(r'[)]$', '', species_name)
    # Inside the character class, ',-.' is a range: it covers ',', '-' and '.'.
    underscored = re.sub(r'[ _),-.(=]+ *', '_', without_final_paren)
    return underscored.lower()
|
|
58
|
|
def get_ensembl_url_root(kingdom):
    """Return the ftp root url to use for *kingdom*.

    'vertebrates' maps to the ftp.ensembl.org current_gtf directory; any other
    value is inserted in the ftp.ensemblgenomes.org 'current' path.
    """
    print("____________________________________________________________")
    print("*** Determining Ensembl ftp root url")
    is_vertebrates = (kingdom == 'vertebrates')
    root = (
        'ftp://ftp.ensembl.org/pub/current_gtf/'
        if is_vertebrates
        else 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
    )
    print("-> Determined !\n")
    return root
|
|
68
|
|
def _species_name_from_line(kingdom, species_line):
    """Extract the species name from one line of the downloaded species listing."""
    if kingdom == 'vertebrates':
        # For vertebrates the listing is whatever the root url serves
        # (presumably an ftp directory listing): the name is the last
        # space-separated field, possibly followed by a carriage return.
        return species_line.split(' ')[-1].split('\r')[0]
    # Other kingdoms: tab-separated species file, name in the second column.
    return species_line.split('\t')[1]


def test_ensembl_species_exists(kingdom, url, species_name):
    """Check that *species_name* is listed by Ensembl for *kingdom*.

    The species listing is downloaded in the current directory (the url itself
    for 'vertebrates', ``url + species_Ensembl<Kingdom>.txt`` otherwise) and
    scanned for lines containing *species_name* as a literal substring.

    Args:
        kingdom: Ensembl kingdom, e.g. 'vertebrates'.
        url: root url as returned by ``get_ensembl_url_root``.
        species_name: (possibly partial) species name to look for.

    Returns:
        (name, line): the complete species name and the listing line it was
        found on (without its trailing newline). When a single line matches,
        the name found on that line replaces *species_name*.

    Raises:
        SystemExit: when no line matches, or when several lines match and
            none of them holds exactly *species_name*.
    """
    print("____________________________________________________________")
    print ("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
    list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
    print("%s" % kingdom)
    if kingdom == 'vertebrates':
        download_file(url, list_species_file_name)
    else:
        download_file(url + list_species_file_name, list_species_file_name)

    # Literal in-process scan. The former 'grep' subprocess merged stderr into
    # stdout, so its error check could never fire and grep's own error text
    # would have been parsed as species lines; it also interpreted the name as
    # a pattern.
    with open(list_species_file_name) as list_species_file:
        species_lines = [line.rstrip('\n') for line in list_species_file if species_name in line]

    if not species_lines:
        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
        logging.critical(msg)
        sys.exit(msg)

    if len(species_lines) == 1:
        found_species_name = _species_name_from_line(kingdom, species_lines[0])
        if species_name != found_species_name:
            print('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name))
            return found_species_name, species_lines[0]
        print("-> Referenced !\n")
        return species_name, species_lines[0]

    # Several candidates: only an exact name is accepted. Plain equality is
    # used so that the name is never interpreted as a regular expression.
    list_species = []
    for species_line in species_lines:
        candidate = _species_name_from_line(kingdom, species_line)
        list_species.append(candidate)
        if candidate == species_name:
            print("-> Referenced !\n")
            return species_name, species_line
    msg = 'The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\nPlease retry with one of the following species names:\n%s' % (species_name, list_species)
    logging.critical(msg)
    sys.exit(msg)
|
|
120
|
|
def get_ensembl_collection(kingdom, species_line):
    """Return the '<kingdom>_..._collection' token found in *species_line*.

    The kingdom is lower-cased to build the pattern. None is returned (after
    printing a 'Skiped' notice) when the line holds no such token.
    """
    print("*** Extracting the %s_collection of the species" % kingdom)
    pattern = '%s_.+_collection' % kingdom.lower()
    found = re.compile(pattern).search(species_line)
    if found:
        print("-> Extracted !\n")
        return found.group(0)
    print("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom)
    return None
|
|
130
|
|
def get_ensembl_gtf_archive_name(url_dir, species_name):
    """Find the name of the gtf archive of *species_name* listed at *url_dir*.

    The content served at *url_dir* is searched, case-insensitively, for
    ``<species_name>.<anything>.<digits>.gtf.gz``.

    Args:
        url_dir: url of the directory expected to hold the archive.
        species_name: species name the archive name starts with.

    Returns:
        The first matching archive name.

    Raises:
        SystemExit: when no archive name matches.
    """
    print("____________________________________________________________")
    print("*** Extracting the gtf archive name of %s" % species_name)
    # Raw string so the backslashes reach the regex engine untouched; the
    # species name is escaped so it is always matched literally.
    gtf_archive_regex = re.compile(r'%s\..*\.[0-9]+\.gtf\.gz' % re.escape(species_name), flags=re.IGNORECASE)
    dir_content = get_page_content(url_dir)
    gtf_archive_match = gtf_archive_regex.search(dir_content)
    if not gtf_archive_match:
        msg = 'The species is referenced on Ensembl but error of nomenclature led to download failure'
        # Logged before exiting, like the other fatal errors of this script.
        logging.critical(msg)
        sys.exit(msg)
    gtf_archive_name = gtf_archive_match.group(0)
    print("-> Extracted !\n")
    return gtf_archive_name
|
|
142
|
|
def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
    """Download the gtf archive of *species_name* in the current directory.

    The directory url is built from *url*: 'gtf/' is appended for every
    kingdom but 'vertebrates', then the collection name (when one is found in
    *species_line*) for bacteria, protists and fungi, then the species name.
    The name of the downloaded archive is returned.
    """
    if kingdom != 'vertebrates':
        url += 'gtf/'
        if kingdom in ('bacteria', 'protists', 'fungi'):
            collection = get_ensembl_collection(kingdom, species_line)
            if collection is not None:
                url += "%s/" % collection
    final_url = '%s%s/' % (url, species_name)
    gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
    print("____________________________________________________________")
    print("*** Download the gtf archive of %s" % species_name)
    archive_url = final_url + gtf_archive_name
    download_file(archive_url, gtf_archive_name)
    print("-> Downloaded !\n")
    return gtf_archive_name
|
|
157
|
|
def generate_alfa_indexes(path_to_alfa, gtf_file_name):
    """Run ALFA on *gtf_file_name* to generate its index files.

    ``python <path_to_alfa> -a <gtf_file_name>`` is executed and its combined
    stdout/stderr is printed once it has finished.

    Args:
        path_to_alfa: path of the ALFA script.
        gtf_file_name: path of the (uncompressed) gtf file.

    Raises:
        SystemExit: when ALFA exits with a non-zero status and its output does
            not contain the '### End of program' marker.
    """
    print("____________________________________________________________")
    print("*** Generating alfa indexes from %s" % gtf_file_name)
    alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    # stderr is merged into stdout, so the second value returned by
    # communicate() is always None: failures must be detected from the exit
    # status and the combined output instead.
    alfa_out, _ = alfa_result.communicate()
    # NOTE(review): a non-zero status is tolerated when the end-of-program
    # marker is present, mirroring the intent of the former check -- confirm
    # ALFA's exit convention.
    if alfa_result.returncode != 0 and not re.search('### End of program', alfa_out):
        msg = 'Generation Failed due an alfa error: %s' % (alfa_out)
        logging.critical(msg)
        sys.exit(msg)
    print("Alfa prompt:\n%s" % alfa_out)
    print("-> Generated !\n")
|
0
|
169
|
|
def get_data_table_new_entry(gtf_archive_name):
    """Build the data table entry described by a gtf archive name.

    The name is expected to look like ``<species>.<version>.<release>.gtf.gz``.
    The version may itself contain dots (e.g. ``BDGP6.32``): when the name ends
    with ``.<digits>.gtf.gz``, the digits are the release and everything
    between the species and the release is the version. Any other name is
    split on its first three dot-separated fields.

    Args:
        gtf_archive_name: file name of the gtf archive.

    Returns:
        dict with the keys 'species', 'version', 'release', 'value', 'dbkey',
        'name' and 'prefix'.

    Raises:
        IndexError: when the name has fewer than three dot-separated fields.
    """
    info_list = gtf_archive_name.split('.')
    if len(info_list) > 5 and info_list[-2:] == ['gtf', 'gz'] and info_list[-3].isdigit():
        # Dotted version: keep it whole instead of cutting it at its first dot.
        species = info_list[0]
        version = '.'.join(info_list[1:-3])
        release = info_list[-3]
    else:
        species = info_list[0]
        version = info_list[1]
        release = info_list[2]
    value = '%s_%s_%s' % (species, version, release)
    dbkey = value
    name = '%s: %s (release %s)' % (species, version, release)
    prefix = '%s.%s.%s' % (species, version, release)
    entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix }
    return entry_dict
|
|
181
|
|
def main():
    """Entry point: build the ALFA indexes of an Ensembl species.

    Steps:
      1. parse the command line (first positional argument: directory holding
         ALFA.py; -o: json file read for the parameters and rewritten with
         the result);
      2. create the target directory named by the json parameters;
      3. in a temporary working directory, download the gtf archive of the
         requested species, uncompress it and run ALFA on it;
      4. copy the stranded and unstranded index files to the target directory;
      5. remove the temporary directory and write the data manager json.

    Raises:
        SystemExit: when the tool directory or the json output file is not
            given, or when one of the steps reports a fatal error.
    """
    options, args = get_arg()
    if not args:
        msg = 'No tool directory specified'
        logging.critical(msg)
        sys.exit(msg)
    # Paths are made absolute because the working directory changes below.
    tool_dir = os.path.abspath(args[0])
    path_to_alfa = os.path.join(tool_dir, 'ALFA.py')

    if options.output_filename is None:
        msg = 'No json output file specified'
        logging.critical(msg)
        sys.exit(msg)
    output_filename = os.path.abspath(options.output_filename)
    with open(output_filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = os.path.abspath(params['output_data'][0]['extra_files_path'])
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    data_manager_dict = {}
    initial_dir = os.getcwd()
    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
    try:
        os.chdir(tmp_dir)

        if options.ensembl_info:
            kingdom, species_name = options.ensembl_info
            species_name = standardize_species_name(species_name)
            url = get_ensembl_url_root(kingdom)
            species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
            gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)
            data_table_entry = get_data_table_new_entry(gtf_archive_name)
            gtf_file_name = '%s.gtf' % data_table_entry['prefix']
            uncompress_gz(gtf_archive_name, gtf_file_name)
            generate_alfa_indexes(path_to_alfa, gtf_file_name)
            stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
            unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
            add_data_table_entry(data_manager_dict, data_table_entry)

            # Everything below uses values that only exist when a species was
            # requested, hence its place inside this branch.
            print("____________________________________________________________")
            print("*** General Info")
            print("TMP DIR:\t%s" % tmp_dir)
            print("TARGET DIR:\t%s" % target_directory)
            print("URL ROOT:\t%s" % url)
            print("SPECIES:\t%s" % data_table_entry['species'])
            print("VERSION:\t%s" % data_table_entry['version'])
            print("RELEASE:\t%s" % data_table_entry['release'])
            print("VALUE:\t%s" % data_table_entry['value'])
            print("DBKEY:\t%s" % data_table_entry['dbkey'])
            print("NAME:\t%s" % data_table_entry['name'])
            print("PREFIX:\t%s" % data_table_entry['prefix'])
            print("____________________________________________________________")
            print("*** Intial dictionary")
            print("%s" % params)

            shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
            shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))
    finally:
        # Leave the temporary directory before deleting it, and delete it even
        # when one of the steps above exits or raises.
        os.chdir(initial_dir)
        cleanup_before_exit(tmp_dir)

    with open(output_filename, 'wb') as output_file:
        output_file.write(to_json_string(data_manager_dict))


if __name__ == '__main__':
    main()
|