annotate data_manager_build_alfa_indexes/data_manager/data_manager_build_alfa_indexes_testchr.py @ 29:0c821f76e2e5 draft default tip

Uploaded
author charles-bernard
date Thu, 21 Dec 2017 13:51:22 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
29
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
1 #!/usr/bin/python
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
2
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
3 import sys
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
4 import shutil
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
5 import re
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
6 import urllib2
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
7 import subprocess
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
8 import gzip
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
9 import os
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
10 import tempfile
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
11 from optparse import OptionParser
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
12 from galaxy.util.json import from_json_string, to_json_string
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
13
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_arg():
    """Parse the command line of the data manager.

    Recognised options:
        -e / --ensembl KINGDOM SPECIES_NAME  two values, stored as a tuple
                                             in ``options.ensembl_info``
        -o / --output JSON_FILE              stored in ``options.output_filename``
        --log log_report                     stored in ``options.log_filename``

    Returns:
        The ``(options, args)`` pair produced by ``OptionParser.parse_args``;
        ``args`` holds the remaining positional arguments.
    """
    parser = OptionParser()
    # optparse expects `metavar` to be a plain string (tuple metavars are an
    # argparse feature and would be shown as a tuple repr in --help).
    parser.add_option("-e", "--ensembl", dest='ensembl_info', action="store",
                      nargs=2, metavar="KINGDOM SPECIES_NAME", type="str",
                      help="Ensembl kingdom and species name")
    parser.add_option("-o", "--output", dest='output_filename', action="store",
                      nargs=1, metavar='JSON_FILE',
                      help="json file exchanged with Galaxy")
    parser.add_option("--log", dest='log_filename', action="store",
                      nargs=1, metavar='log_report',
                      help="log report file")
    (options, args) = parser.parse_args()
    return options, args
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
21
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def cleanup_before_exit(tmp_dir):
    """Remove the directory tree `tmp_dir` when it is set and present on disk."""
    # Nothing to do for an empty/None path.
    if not tmp_dir:
        return
    # Nothing to do either if the directory is already gone.
    if not os.path.exists(tmp_dir):
        return
    shutil.rmtree(tmp_dir)
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
25
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_page_content(url):
    """Return the raw content served at `url`.

    The connection is always closed, even if reading fails.
    """
    page = urllib2.urlopen(urllib2.Request(url))
    try:
        return page.read()
    finally:
        # The original never closed the handle, leaking the connection.
        page.close()
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
30
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def download_file(link, local_file_name):
    """Download the resource at `link` and save it as `local_file_name`.

    The data is streamed to disk in chunks instead of being read into memory
    in one piece, and both the remote handle and the local file are closed
    even when the transfer fails.
    """
    src_file = urllib2.urlopen(urllib2.Request(link))
    try:
        with open(local_file_name, 'wb') as local_file:
            shutil.copyfileobj(src_file, local_file)
    finally:
        src_file.close()
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
37
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def uncompress_gz(gz_file_name, uncompressed_file_name):
    """Decompress the gzip archive `gz_file_name` into `uncompressed_file_name`.

    The content is copied in chunks so the whole decompressed file never has
    to fit in memory, and both files are closed even if an error occurs.
    """
    print("____________________________________________________________")
    print("*** Uncompressing %s" % gz_file_name)
    with gzip.open(gz_file_name, 'rb') as src_file:
        with open(uncompressed_file_name, 'wb') as uncompressed_file:
            shutil.copyfileobj(src_file, uncompressed_file)
    print("-> Uncompressed !\n")
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
46
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def standardize_species_name(species_name):
    """Normalise a user supplied species name.

    Surrounding whitespace is dropped, one trailing ')' is removed, every run
    of the separator characters `` _),-.(=`` is collapsed into a single
    underscore and the result is lower-cased,
    e.g. ``'Mus musculus (mouse)'`` -> ``'mus_musculus_mouse'``.
    """
    # Strip first so that stray leading/trailing blanks do not turn into
    # leading/trailing underscores.
    standard_species_name = species_name.strip()
    standard_species_name = re.sub(r'[)]$', '', standard_species_name)
    # The hyphen is escaped: in the original class ',-.' was an (accidental)
    # range that happened to cover exactly ',', '-' and '.'.
    standard_species_name = re.sub(r'[ _),\-.(=]+', '_', standard_species_name)
    return standard_species_name.lower()
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
52
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_ensembl_url_root(kingdom):
    """Return the Ensembl ftp root url to use for `kingdom`.

    'vertebrates' maps to the fixed current_gtf url; any other kingdom is
    inserted into the ensemblgenomes 'current' url.
    """
    print("____________________________________________________________")
    print("*** Determining Ensembl ftp root url")
    root = ('ftp://ftp.ensembl.org/pub/current_gtf/'
            if kingdom == 'vertebrates'
            else 'ftp://ftp.ensemblgenomes.org/pub/%s/current/' % kingdom)
    print("-> Determined !\n")
    return root
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
62
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def _ensembl_species_name_from_line(kingdom, species_line):
    """Extract the species name from one line of the downloaded species list."""
    if kingdom == 'vertebrates':
        # The name is the last space separated field, minus anything that
        # follows a carriage return.
        return species_line.split(' ')[-1].split('\r')[0]
    # Other kingdoms: tab separated line, name in the second column.
    return species_line.split('\t')[1]


def test_ensembl_species_exists(kingdom, url, species_name):
    """
    Test if a species exists on the ftp & return the species name with its species line if so.

    The species list is downloaded into the current directory and every line
    containing `species_name` (plain substring match) is kept:
    - no matching line: the program exits with an error message;
    - a single matching line: the name found on that line is returned, even
      if `species_name` was only a part of it;
    - several matching lines: the one whose name is exactly `species_name` is
      returned; if there is none the program exits and lists all candidate
      names to enter for a new run.

    Returns:
        (species name, matching line of the species list). The line is
        returned without its trailing newline.
    """
    print("____________________________________________________________")
    print("*** Testing whether %s is referenced in Ensembl %s" % (species_name, kingdom))
    list_species_file_name = 'species_Ensembl%s%s.txt' % (kingdom[0].upper(), kingdom[1:])
    if kingdom == 'vertebrates':
        download_file(url, list_species_file_name)
    else:
        download_file(url + list_species_file_name, list_species_file_name)

    # Filter the list in Python rather than through an external `grep`: the
    # name is then always matched literally, and no subprocess error can be
    # mistaken for a matching line (stderr used to be merged into stdout).
    with open(list_species_file_name) as list_species_file:
        species_lines = [line.rstrip('\n') for line in list_species_file
                         if species_name in line]

    if not species_lines:
        msg = 'The species \'%s\' is not referenced on Ensembl (%s)' % (species_name, kingdom)
        sys.exit(msg)

    if len(species_lines) == 1:
        found_species_name = _ensembl_species_name_from_line(kingdom, species_lines[0])
        if species_name != found_species_name:
            print('-> \'%s\' has been replace with the complete species name \'%s\'' % (species_name, found_species_name))
            return found_species_name, species_lines[0]
        print("-> Referenced !\n")
        return species_name, species_lines[0]

    list_species = []
    for species_line in species_lines:
        candidate = _ensembl_species_name_from_line(kingdom, species_line)
        # Plain equality instead of an unescaped '^name$' regular expression.
        if candidate == species_name:
            print("-> Referenced !\n")
            return species_name, species_line
        list_species.append(candidate)

    msg = ("The string \'%s\' has been matched against the list of Ensembl Species but is not a complete species name.\n"
           "Please retry with one of these following species names:\n" % species_name)
    msg += ''.join("- %s\n" % s for s in list_species)
    sys.exit(msg)
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
119
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_ensembl_collection(kingdom, species_line):
    """Return the '<kingdom>_..._collection' name found in `species_line`.

    The kingdom is lower-cased before being searched for.

    Returns:
        The matched collection string, or None when `species_line` does not
        contain one.
    """
    print("*** Extracting the %s_collection of the species" % kingdom)
    # Escape the kingdom so it is always matched literally.
    collection_regex = re.compile('%s_.+_collection' % re.escape(kingdom.lower()))
    collection_match = collection_regex.search(species_line)
    if collection_match is None:
        print("-> Skiped: this species is not classified in a Ensembl %s collection\n" % kingdom)
        return None
    print("-> Extracted !\n")
    return collection_match.group(0)
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
129
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_ensembl_gtf_archive_name(url_dir, species_name):
    """Find the gtf archive names of `species_name` in the page at `url_dir`.

    Two names are searched for (case-insensitively) in the page content:
    '<species>.<...>.<digits>.gtf.gz' and '<species>.<...>.<digits>.chr.gtf.gz'.

    Returns:
        (gtf_archive_name, chr_gtf_archive_name); the second element is an
        empty string when no '.chr' archive is found.

    Exits the program when the first archive name cannot be found.
    """
    print("____________________________________________________________")
    print("*** Extracting the gtf archive name of %s" % species_name)
    # Raw strings avoid invalid '\.' escapes in the literals; the species name
    # is escaped so it is matched literally.
    escaped_species_name = re.escape(species_name)
    gtf_archive_regex = re.compile(r'%s\..*\.[0-9]+\.gtf\.gz' % escaped_species_name, flags=re.IGNORECASE)
    chr_gtf_archive_regex = re.compile(r'%s\..*\.[0-9]+\.chr\.gtf\.gz' % escaped_species_name, flags=re.IGNORECASE)
    dir_content = get_page_content(url_dir)
    gtf_archive_match = gtf_archive_regex.search(dir_content)
    chr_gtf_archive_match = chr_gtf_archive_regex.search(dir_content)
    if not gtf_archive_match:
        sys.exit('The species is referenced on Ensembl but error of nomenclature led to download failure')
    if chr_gtf_archive_match:
        chr_gtf_archive_name = chr_gtf_archive_match.group(0)
    else:
        chr_gtf_archive_name = ""
    gtf_archive_name = gtf_archive_match.group(0)
    print("-> Extracted !\n")
    return gtf_archive_name, chr_gtf_archive_name
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
147
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_ensembl_gtf_archive(kingdom, url, species_name, species_line):
    """Download the gtf archive(s) of `species_name` into the current directory.

    The directory url is built from `url`: for every kingdom but 'vertebrates'
    'gtf/' is appended, then (for bacteria, protists and fungi) the collection
    extracted from `species_line` if there is one, and finally the species
    name. The '.chr' archive is downloaded too when one is listed.

    Returns:
        (gtf_archive_name, chr_gtf_archive_name); the second element is an
        empty string when there is no '.chr' archive.
    """
    if kingdom != 'vertebrates':
        url = url + 'gtf/'
        if kingdom in ('bacteria', 'protists', 'fungi'):
            collection = get_ensembl_collection(kingdom, species_line)
            if collection is not None:
                url = url + "%s/" % collection
    final_url = url + species_name + '/'
    gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive_name(final_url, species_name)
    print("____________________________________________________________")
    print("*** Download the gtf archive of %s" % species_name)
    download_file(final_url + gtf_archive_name, gtf_archive_name)
    print("-> Downloaded !\n")
    if chr_gtf_archive_name:
        print("*** Download the chr gtf archive of %s" % species_name)
        download_file(final_url + chr_gtf_archive_name, chr_gtf_archive_name)
        print("-> Downloaded !\n")
    return gtf_archive_name, chr_gtf_archive_name
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
166
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def generate_alfa_indexes(path_to_alfa, gtf_file_name):
    """Run ``python <path_to_alfa> -a <gtf_file_name>`` and print its output.

    stderr is merged into stdout, so the second value returned by
    ``communicate()`` is always None and cannot be used to detect errors (the
    original check on it could never trigger). Failure is therefore detected
    through the exit status of the process: a non-zero status makes the
    program exit with the captured output as error message.
    """
    print("____________________________________________________________")
    print("*** Generating alfa indexes from %s" % gtf_file_name)
    alfa_result = subprocess.Popen(['python', path_to_alfa, '-a', gtf_file_name],
                                   stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
    alfa_out, _ = alfa_result.communicate()
    if alfa_result.returncode != 0:
        msg = 'Generation Failed due an alfa error: %s' % (alfa_out)
        sys.exit(msg)
    print("Alfa prompt:\n%s" % alfa_out)
    print("-> Generated !\n")
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
177
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def get_data_table_new_entry(gtf_archive_name):
    """Build the data table entry describing a gtf archive.

    `gtf_archive_name` is expected to look like
    '<species>.<version>.<release>.gtf.gz'. The version itself may contain
    dots (e.g. 'Canis_familiaris.CanFam3.1.91.gtf.gz'): the species is the
    first component, the release the last one before the extensions, and the
    version everything in between.

    Returns:
        A dict with the keys 'species', 'version', 'release', 'value',
        'dbkey', 'name' and 'prefix'.

    Raises:
        ValueError: if the name has fewer than three dot separated components.
    """
    info_list = gtf_archive_name.split('.')
    # Drop the trailing file extensions, but never go below the three
    # components that are needed.
    for extension in ('gz', 'gtf'):
        if len(info_list) > 3 and info_list[-1] == extension:
            info_list.pop()
    if len(info_list) < 3:
        raise ValueError('Unexpected gtf archive name: %s' % gtf_archive_name)
    species = info_list[0]
    version = '.'.join(info_list[1:-1])
    release = info_list[-1]
    value = '%s_%s_%s' % (species, version, release)
    dbkey = value
    name = '%s: %s (release %s)' % (species, version, release)
    prefix = '%s.%s.%s' % (species, version, release)
    entry_dict = {'species': species, 'version': version, 'release': release,
                  'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix}
    return entry_dict
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
189
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def chr_get_data_table_new_entry(chr_gtf_archive_name):
    """Build the data table entry describing a '.chr' gtf archive.

    `chr_gtf_archive_name` is expected to look like
    '<species>.<version>.<release>.chr.gtf.gz'. The version itself may contain
    dots (e.g. 'Canis_familiaris.CanFam3.1.91.chr.gtf.gz'): the species is the
    first component, the release the last one before the extensions, and the
    version everything in between.

    Returns:
        A dict with the keys 'species', 'version', 'release', 'value',
        'dbkey', 'name' and 'prefix'; 'value', 'dbkey' and 'prefix' carry a
        '.chr' suffix and 'name' a ' - Chr' suffix.

    Raises:
        ValueError: if the name has fewer than three dot separated components.
    """
    info_list = chr_gtf_archive_name.split('.')
    # Drop the trailing file extensions, but never go below the three
    # components that are needed.
    for extension in ('gz', 'gtf', 'chr'):
        if len(info_list) > 3 and info_list[-1] == extension:
            info_list.pop()
    if len(info_list) < 3:
        raise ValueError('Unexpected chr gtf archive name: %s' % chr_gtf_archive_name)
    species = info_list[0]
    version = '.'.join(info_list[1:-1])
    release = info_list[-1]
    value = '%s_%s_%s.chr' % (species, version, release)
    dbkey = value
    name = '%s: %s (release %s) - Chr' % (species, version, release)
    prefix = '%s.%s.%s.chr' % (species, version, release)
    entry_dict = {'species': species, 'version': version, 'release': release,
                  'value': value, 'dbkey': dbkey, 'name': name, 'prefix': prefix}
    return entry_dict
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
201
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
def main():
    """Build the ALFA indexes of an Ensembl species and register them.

    Steps:
      1. read the options; the first positional argument is the directory
         holding ALFA.py;
      2. read the json file given with --output to get the directory in which
         the indexes must be stored ('extra_files_path' of the first
         'output_data' item);
      3. in a temporary working directory, download the gtf archive(s) of the
         species, uncompress them and run ALFA on each;
      4. copy the generated '.stranded.index' / '.unstranded.index' files to
         the target directory;
      5. overwrite the json file with the new 'alfa_indexes' data table
         entries.

    Exits with an error message when the tool directory, the json file or the
    Ensembl information is missing from the command line.
    """
    options, args = get_arg()
    if not args:
        sys.exit('No tool directory specified')
    tool_dir = args[0]

    # Paths are made absolute because the working directory changes below.
    path_to_alfa = os.path.abspath(os.path.join(tool_dir, 'ALFA.py'))

    if options.output_filename is None:
        msg = 'No json output file specified'
        sys.exit(msg)
    output_filename = os.path.abspath(options.output_filename)

    # Without this information nothing can be built; the original ran into a
    # NameError further down in that case.
    if not options.ensembl_info:
        sys.exit('No Ensembl kingdom and species name specified')

    # Interestingly the output file to return is not empty initially.
    # it contains a dictionary, with notably the path to the dir where the alfa_indexes
    # are expected to be found
    with open(output_filename) as params_file:
        params = from_json_string(params_file.read())
    target_directory = os.path.abspath(params['output_data'][0]['extra_files_path'])
    if not os.path.isdir(target_directory):
        os.makedirs(target_directory)

    data_manager_dict = {'data_tables': {'alfa_indexes': []}}

    start_dir = os.getcwd()
    tmp_dir = tempfile.mkdtemp(prefix='tmp', suffix='')
    try:
        os.chdir(tmp_dir)

        kingdom, species_name = options.ensembl_info
        species_name = standardize_species_name(species_name)
        url = get_ensembl_url_root(kingdom)
        species_name, species_line = test_ensembl_species_exists(kingdom, url, species_name)
        gtf_archive_name, chr_gtf_archive_name = get_ensembl_gtf_archive(kingdom, url, species_name, species_line)

        # The full archive is always processed, the '.chr' one only if it exists.
        archives = [(gtf_archive_name, get_data_table_new_entry)]
        if chr_gtf_archive_name:
            archives.append((chr_gtf_archive_name, chr_get_data_table_new_entry))

        index_names = []
        for archive_name, build_entry in archives:
            data_table_entry = build_entry(archive_name)
            gtf_file_name = '%s.gtf' % data_table_entry['prefix']
            uncompress_gz(archive_name, gtf_file_name)
            generate_alfa_indexes(path_to_alfa, gtf_file_name)
            index_names.append('%s.stranded.index' % data_table_entry['prefix'])
            index_names.append('%s.unstranded.index' % data_table_entry['prefix'])
            data_manager_dict['data_tables']['alfa_indexes'].append(data_table_entry)

        # As before, the summary describes the last entry that was built.
        print("____________________________________________________________")
        print("*** General Info")
        print("URL ROOT:\t%s" % url)
        print("SPECIES:\t%s" % data_table_entry['species'])
        print("VERSION:\t%s" % data_table_entry['version'])
        print("RELEASE:\t%s" % data_table_entry['release'])
        print("VALUE:\t%s" % data_table_entry['value'])
        print("DBKEY:\t%s" % data_table_entry['dbkey'])
        print("NAME:\t%s" % data_table_entry['name'])
        print("PREFIX:\t%s" % data_table_entry['prefix'])

        # Every index keeps its own name in the target directory; the
        # original copied the '.chr' indexes over the full genome ones.
        for index_name in index_names:
            shutil.copyfile(index_name, os.path.join(target_directory, index_name))
    finally:
        # Leave the temporary directory before deleting it, whatever happened.
        os.chdir(start_dir)
        cleanup_before_exit(tmp_dir)

    with open(output_filename, 'w') as output_file:
        output_file.write(to_json_string(data_manager_dict))
0c821f76e2e5 Uploaded
charles-bernard
parents:
diff changeset
# Only run the data manager when the file is executed as a script, so that
# importing the module has no side effect.
if __name__ == '__main__':
    main()