data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py @ 29:0c821f76e2e5 (draft, default, tip)
author: charles-bernard
date:   Thu, 21 Dec 2017 13:51:22 -0500
#!/usr/bin/python

import sys
import shutil
import re
import urllib2
import subprocess
import gzip
import os
import tempfile
from optparse import OptionParser
from galaxy.util.json import from_json_string, to_json_string

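# Parse the command line: -e/--ensembl expects the kingdom and the species name,
# -o/--output the json file handed over by Galaxy, --log an optional report file.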
def get_arg():
    parser = OptionParser()
    parser.add_option("-e", "--ensembl", dest = 'ensembl_info', action = "store", nargs = 2, metavar = ("kingdom", "species_name"), type = "str")
    parser.add_option("-o", "--output", dest='output_filename', action="store", nargs = 1, metavar = 'JSON_FILE')
    parser.add_option("--log", dest='log_filename', action="store", nargs=1, metavar='log_report')
    (options, args) = parser.parse_args()
    return options, args

def cleanup_before_exit(tmp_dir):
    if tmp_dir and os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)

def get_page_content(url):
    req = urllib2.Request(url)
    page = urllib2.urlopen(req)
    return page.read()

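# Fetch a remote file (FTP or HTTP) with urllib2 and write it to the working directory.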
def download_file(link, local_file_name):
    req = urllib2.Request(link)
    src_file = urllib2.urlopen(req)
    local_file = open(local_file_name, 'wb')
    local_file.write(src_file.read())
    local_file.close()

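# Decompress a downloaded .gz archive under the given output file name.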
def uncompress_gz(gz_file_name, uncompressed_file_name):
    print("____________________________________________________________")
    print("*** Uncompressing %s" % gz_file_name)
    uncompressed_file = open(uncompressed_file_name, 'wb')
    with gzip.open(gz_file_name, 'rb') as src_file:
        uncompressed_file.write(src_file.read())
    uncompressed_file.close()
    print("-> Uncompressed !\n")

def standardize_species_name(species_name):
    # Strip a trailing closing parenthesis, collapse every run of separators or
    # punctuation into a single underscore, and lowercase the result.
    standard_species_name = re.sub(r'[)]$', '', species_name)
    standard_species_name = re.sub(r'[ _),-.(=]+ *', '_', standard_species_name)
    return standard_species_name.lower()

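# Ensembl vertebrates and the Ensembl Genomes divisions (bacteria, fungi, metazoa,
# plants, protists) are hosted on two different FTP servers.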
def get_ensembl_url_root(kingdom):
    print("____________________________________________________________")
    print("*** Determining Ensembl ftp root url")
    if kingdom == 'vertebrates':
        root = 'ftp://ftp.ensembl.org/pub/current_gtf/'
    else:
        root = 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom
    print("-> Determined !\n")
    return root

def test_ensembl_species_exists(kingdom, url, species_name):
    """
    Test whether a species exists on the ftp & return the species name with the species_line if so.
    If the species_name matches a single string, this string is returned as the species name.
    If the species_name matches several strings, an error is printed listing all the possible
    species names to use for a new run.
    """
    print("____________________________________________________________")
    print("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
    list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
    if kingdom == 'vertebrates':
        download_file(url, list_species_file_name)
    else:
        download_file(url + list_species_file_name, list_species_file_name)

    grep_result = subprocess.Popen(['grep', species_name, list_species_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    species_lines_matched, grep_error = grep_result.communicate()
    if grep_error != None or species_lines_matched == "":
        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
        sys.exit(msg)

    species_lines = species_lines_matched.split('\n')
    del species_lines[-1]
    nb_lines = len(species_lines)

    if nb_lines == 1:
        if kingdom == 'vertebrates':
            fields = species_lines[0].split(' ')
            columns = fields[-1].split('\r')
            found_species_name = columns[0]
        else:
            columns = species_lines[0].split('\t')
            found_species_name = columns[1]
        if species_name != found_species_name:
            print('-> \'%s\' has been replaced with the complete species name \'%s\'' % (species_name, found_species_name))
            return found_species_name, species_lines_matched
        print("-> Referenced !\n")
        return species_name, species_lines_matched
    else:
        list_species = [''] * nb_lines
        for i in range(0, nb_lines):
            if kingdom == 'vertebrates':
                fields = species_lines[i].split(' ')
                columns = fields[-1].split('\r')
                list_species[i] = columns[0]
            else:
                columns = species_lines[i].split('\t')
                list_species[i] = columns[1]
            exact_match = re.search('^%s$' % species_name, list_species[i])
            if exact_match:
                print("-> Referenced !\n")
                return species_name, species_lines[i]
        msg = ("The string \'%s\' has been matched against the list of Ensembl species but is not a complete species name.\n"
               "Please retry with one of the following species names:\n" % species_name)
        for s in list_species:
            msg = ("%s- %s\n" % (msg, s))
        sys.exit(msg)

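# On the Ensembl Genomes FTP, bacteria, protists and fungi are grouped into
# '<kingdom>_..._collection' subdirectories; extract the collection name from the
# species line so the right subdirectory can be appended to the url.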
def get_ensembl_collection(kingdom, species_line):
    print("*** Extracting the %s_collection of the species" % kingdom)
    collection_regex = re.compile('%s_.+_collection' % kingdom.lower())
    collection_match = re.search(collection_regex, species_line)
    if not collection_match:
        print("-> Skipped: this species is not classified in an Ensembl %s collection\n" % kingdom)
        return None
    print("-> Extracted !\n")
    return collection_match.group(0)

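# The species directory may list both a full '<Species>.<assembly>.<release>.gtf.gz'
# archive and a '....chr.gtf.gz' variant (annotation limited to the named chromosomes);
# return both names, with an empty string when the chr archive is absent.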
def get_ensembl_gtf_archive_name(url_dir, species_name):
    print("____________________________________________________________")
    print("*** Extracting the gtf archive name of %s" % species_name)
    gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.gtf\.gz' % species_name, flags = re.IGNORECASE)
    chr_gtf_archive_regex = re.compile('%s\..*\.[0-9]+\.chr\.gtf\.gz' % species_name, flags = re.IGNORECASE)
    dir_content = get_page_content(url_dir)
    gtf_archive_match = re.search(gtf_archive_regex, dir_content)
    chr_gtf_archive_match = re.search(chr_gtf_archive_regex, dir_content)
    if not gtf_archive_match:
        sys.exit('The species is referenced on Ensembl but a nomenclature error led to the download failure')
    if not chr_gtf_archive_match:
        chr_gtf_archive_name = ""
    else:
        chr_gtf_archive_name = chr_gtf_archive_match.group(0)
    gtf_archive_name = gtf_archive_match.group(0)
    print("-> Extracted !\n")
    return gtf_archive_name, chr_gtf_archive_name

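# Build the final FTP directory url for the species (adding the 'gtf/' level and, when
# relevant, the collection subdirectory), then download the gtf archive(s).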
def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
    if kingdom != 'vertebrates':
        url = url + 'gtf/'
        if kingdom == 'bacteria' or kingdom == 'protists' or kingdom == 'fungi':
            collection = get_ensembl_collection(kingdom, species_line)
            if collection != None:
                url = url + "%s/" % collection
    final_url = url + species_name + '/'
    gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
    print("____________________________________________________________")
    print("*** Downloading the gtf archive of %s" % species_name)
    download_file(final_url + gtf_archive_name, gtf_archive_name)
    print("-> Downloaded !\n")
    if chr_gtf_archive_name:
        print("*** Downloading the chr gtf archive of %s" % species_name)
        download_file(final_url + chr_gtf_archive_name, chr_gtf_archive_name)
        print("-> Downloaded !\n")
    return gtf_archive_name, chr_gtf_archive_name

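# Run 'python ALFA.py -a <gtf>' as a subprocess; the script expects ALFA to write the
# '<prefix>.stranded.index' and '<prefix>.unstranded.index' files into the current directory.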
def generate_alfa_indexes(path_to_alfa, gtf_file_name):
    print("____________________________________________________________")
    print("*** Generating alfa indexes from %s" % gtf_file_name)
    alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    alfa_out, alfa_err = alfa_result.communicate()
    if alfa_err != None and not re.search('### End of program', alfa_err):
        msg = 'Generation failed due to an alfa error: %s' % (alfa_err)
        sys.exit(msg)
    print("Alfa prompt:\n%s" % alfa_out)
    print("-> Generated !\n")

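# Turn an archive name such as '<Species>.<assembly>.<release>.gtf.gz' into the
# alfa_indexes data table entry (value, dbkey, name, prefix); the 'chr_' variant below
# does the same for the '.chr' archive and suffixes the fields accordingly.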
def get_data_table_new_entry(gtf_archive_name):
    info_list = gtf_archive_name.split('.')
    species = info_list[0]
    version = info_list[1]
    release = info_list[2]
    value = '%s_%s_%s' % (species, version, release)
    dbkey = value
    name = '%s: %s (release %s)' % (species, version, release)
    prefix = '%s.%s.%s' % (species, version, release)
    entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix }
    return entry_dict

def chr_get_data_table_new_entry(chr_gtf_archive_name):
    info_list = chr_gtf_archive_name.split('.')
    species = info_list[0]
    version = info_list[1]
    release = info_list[2]
    value = '%s_%s_%s.chr' % (species, version, release)
    dbkey = value
    name = '%s: %s (release %s) - Chr' % (species, version, release)
    prefix = '%s.%s.%s.chr' % (species, version, release)
    entry_dict = { 'species': species, 'version': version, 'release': release, 'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix }
    return entry_dict

def main():
    options, args = get_arg()
    tool_dir = args[0]

    path_to_alfa = os.path.join(tool_dir, 'ALFA.py')

    if options.output_filename == None:
        msg = 'No json output file specified'
        sys.exit(msg)
    output_filename = options.output_filename

    # The json output file handed over by Galaxy is not empty initially:
    # it already contains a dictionary, notably with the path to the directory
    # where the alfa_indexes are expected to be written.
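    # For illustration only (structure assumed, not taken from this file), the
    # pre-filled json typically looks like:
    #   {"output_data": [{"extra_files_path": "/path/to/dataset_files", ...}], ...}
    # Only 'extra_files_path' is read below; everything else is left untouched.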
    params = from_json_string(open(output_filename).read())
    target_directory = params['output_data'][0]['extra_files_path']
    os.mkdir(target_directory)

    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
    os.chdir(tmp_dir)

    data_manager_dict = {}
    data_manager_dict['data_tables'] = data_manager_dict.get('data_tables', {})
    data_manager_dict['data_tables']['alfa_indexes'] = data_manager_dict['data_tables'].get('alfa_indexes', [])

    if options.ensembl_info:
        kingdom, species_name = options.ensembl_info
        species_name = standardize_species_name(species_name)
        url = get_ensembl_url_root(kingdom)
        species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
        gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)
        data_table_entry = get_data_table_new_entry(gtf_archive_name)
        gtf_file_name = '%s.gtf' % data_table_entry['prefix']
        uncompress_gz(gtf_archive_name, gtf_file_name)
        generate_alfa_indexes(path_to_alfa, gtf_file_name)
        stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
        unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
        data_manager_dict['data_tables']['alfa_indexes'].append(data_table_entry)
        if chr_gtf_archive_name:
            data_table_entry = chr_get_data_table_new_entry(chr_gtf_archive_name)
            chr_gtf_file_name = '%s.gtf' % data_table_entry['prefix']
            uncompress_gz(chr_gtf_archive_name, chr_gtf_file_name)
            generate_alfa_indexes(path_to_alfa, chr_gtf_file_name)
            chr_stranded_index_name = '%s.stranded.index' % data_table_entry['prefix']
            chr_unstranded_index_name = '%s.unstranded.index' % data_table_entry['prefix']
            data_manager_dict['data_tables']['alfa_indexes'].append(data_table_entry)

        print("____________________________________________________________")
        print("*** General Info")
        print("URL ROOT:\t%s" % url)
        print("SPECIES:\t%s" % data_table_entry['species'])
        print("VERSION:\t%s" % data_table_entry['version'])
        print("RELEASE:\t%s" % data_table_entry['release'])
        print("VALUE:\t%s" % data_table_entry['value'])
        print("DBKEY:\t%s" % data_table_entry['dbkey'])
        print("NAME:\t%s" % data_table_entry['name'])
        print("PREFIX:\t%s" % data_table_entry['prefix'])

        shutil.copyfile(stranded_index_name, os.path.join(target_directory, stranded_index_name))
        shutil.copyfile(unstranded_index_name, os.path.join(target_directory, unstranded_index_name))

        if chr_gtf_archive_name:
            shutil.copyfile(chr_stranded_index_name, os.path.join(target_directory, chr_stranded_index_name))
            shutil.copyfile(chr_unstranded_index_name, os.path.join(target_directory, chr_unstranded_index_name))

    cleanup_before_exit(tmp_dir)

    open(output_filename, 'wb').write(to_json_string(data_manager_dict))

if __name__ == "__main__":
    main()