Mercurial > repos > dave > data_manager_fetch_index_maf
comparison data_manager/data_manager_fetch_and_index_maf.py @ 0:de73b258a601 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_fetch_index_maf commit 21852ee28cf191d12b1ffe5583efaa5deeb1d80d-dirty"
author | dave |
---|---|
date | Wed, 15 Jul 2020 14:30:00 -0400 |
parents | |
children | edf39ed96bc3 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:de73b258a601 |
---|---|
1 #!/usr/bin/env python | |
2 import bz2 | |
3 import ftplib | |
4 import gzip | |
5 import json | |
6 import optparse | |
7 import os | |
8 import re | |
9 import shutil | |
10 import subprocess | |
11 import sys | |
12 import tempfile | |
13 import urllib.parse | |
14 import urllib.request | |
15 import zipfile | |
16 from binascii import hexlify | |
17 | |
18 CHUNK_SIZE = 2**20 | |
19 | |
20 DEFAULT_DATA_TABLE_NAME = "indexed_maf_files" | |
21 | |
22 # Nice solution to opening compressed files (zip/bz2/gz) transparently | |
23 # https://stackoverflow.com/a/13045892/638445 | |
24 | |
25 | |
class CompressedFile(object):
    """Base class for transparently accessing compressed files.

    Subclasses declare the ``magic`` byte signature used for detection
    and implement :meth:`open` to return a file-like accessor.
    Adapted from https://stackoverflow.com/a/13045892/638445
    """

    magic = None  # leading byte signature identifying the format
    file_type = None
    mime_type = None
    proper_extension = None

    def __init__(self, f):
        # f is an open file, file-like object, or filename accepted by
        # the subclass's open() implementation.
        self.f = f
        self.accessor = self.open()

    @classmethod
    def is_magic(cls, data):
        """Return True if *data* starts with this format's magic bytes.

        The original hexlified both sides before comparing; a direct
        byte-prefix test is equivalent (hexlify is a prefix-preserving,
        per-byte encoding) and avoids the conversion.
        """
        return data.startswith(cls.magic)

    def open(self):
        # The base class provides no accessor; subclasses override this.
        return None
43 | |
44 | |
class ZIPFile(CompressedFile):
    """Accessor for zip archives, detected by the PK signature."""

    magic = b'\x50\x4b\x03\x04'
    file_type = 'zip'
    mime_type = 'compressed/zip'

    def open(self):
        # Hand the stored filename/file object to the stdlib zip reader.
        return zipfile.ZipFile(self.f)
52 | |
53 | |
class BZ2File(CompressedFile):
    """Accessor for bzip2 files, detected by the BZh signature."""

    magic = b'\x42\x5a\x68'
    file_type = 'bz2'
    mime_type = 'compressed/bz2'

    def open(self):
        # Hand the stored filename/file object to the stdlib bz2 reader.
        return bz2.BZ2File(self.f)
61 | |
62 | |
class GZFile(CompressedFile):
    """Accessor for gzip files, detected by the \\x1f\\x8b signature."""

    magic = b'\x1f\x8b\x08'
    file_type = 'gz'
    mime_type = 'compressed/gz'

    def open(self):
        # First positional argument of GzipFile is the filename.
        return gzip.GzipFile(self.f)
70 | |
71 | |
# Factory function to create a suitable instance for accessing files
def get_compressed_file(filename):
    """Return a CompressedFile accessor for *filename*, or None.

    Reads the leading bytes of the file and matches them against the
    known zip/bz2/gz magic signatures.  The matched class is handed the
    *filename* (not the open handle) so it can reopen the file itself.
    """
    with open(filename, 'rb') as f:
        # 16 bytes covers the longest magic signature (zip's 4 bytes).
        start_of_file = f.read(16)
    # The with-block closes the handle; the original also called
    # f.seek(0) and f.close() here, both redundant.
    for cls in (ZIPFile, BZ2File, GZFile):
        if cls.is_magic(start_of_file):
            return cls(filename)
    return None
83 | |
84 | |
def url_download(url, tmp=False, localpath=None):
    """Attempt to download file from a given url
    :param url: full url to file
    :type url: str.
    :param tmp: if True, download into a fresh temporary directory
    :param localpath: directory to download into when *tmp* is False
    :returns: name of downloaded file (uncompressed if it was zip/bz2/gz)
    :raises: ContentDecodingError, IOError
    """

    # Derive the local file name from the last URL path component.
    file_name = url.split('/')[-1]
    if tmp:
        file_name = os.path.join(tempfile.mkdtemp(), file_name)
    elif localpath is not None:
        file_name = os.path.join(localpath, file_name)

    try:
        # download URL (FTP and HTTP work, probably local and data too)
        urllib.request.urlretrieve(url, file_name)

        # uncompress file if needed
        cf = get_compressed_file(file_name)
        if cf is not None:
            uncompressed_file_name = os.path.splitext(file_name)[0]
            with open(uncompressed_file_name, 'wb') as uncompressed_file:
                shutil.copyfileobj(cf.accessor, uncompressed_file)
            os.remove(file_name)
            file_name = uncompressed_file_name
    except IOError as e:
        # Typo fix: "occured" -> "occurred".
        sys.stderr.write('Error occurred downloading reference file: %s' % e)
        # If urlretrieve failed before creating the file, os.remove would
        # raise FileNotFoundError and mask the original error.
        if os.path.exists(file_name):
            os.remove(file_name)
    return file_name
116 | |
117 | |
def generate_metadata(params, options):
    """Build (display name, unique id, species list) for the data table.

    Downloads the Newick (.nh) tree from ``options.nexus`` and takes the
    leading token of each line as a species name.  *params* is accepted
    for interface compatibility but unused here.
    """
    name = options.name
    # Found to be the fastest way to strip non-alphanumeric characters
    # from a string in some post on StackOverflow
    pattern = re.compile(r'[\W]+')
    uid = pattern.sub('_', name).strip('_')
    url = options.nexus
    nh_file = url_download(url, True)
    try:
        with open(nh_file, 'r') as fh:
            species = [line.strip(' (),').split(':')[0] for line in fh.readlines()]
    finally:
        # url_download(tmp=True) creates a throwaway directory that the
        # original leaked; remove it once the tree has been read.
        shutil.rmtree(os.path.dirname(nh_file), ignore_errors=True)
    return name, uid.upper(), species
130 | |
131 | |
def get_maf_listing(maf_path):
    """List downloadable MAF archive URLs under the FTP directory *maf_path*.

    Keeps only compressed (.gz/.bz2/.zip) plain files, excluding
    ``*_alt``/``*_random`` scaffold MAFs and ``chrUn*`` entries.
    """
    compressions = ['gz', 'bz2', 'zip']
    parsed = urllib.parse.urlparse(maf_path)
    ftp = ftplib.FTP()
    ftp.connect(parsed.netloc)
    ftp.login()
    entries = ftp.mlsd(parsed.path)

    maf_files = []
    for name, facts in entries:
        # Start from the extension check, then veto on the other rules.
        wanted = os.path.splitext(name)[-1].lstrip('.') in compressions
        if facts['type'] != 'file':
            wanted = False
        for compression in compressions:
            for exclusion in ['_alt', '_random']:
                if name.endswith('%s.maf.%s' % (exclusion, compression)):
                    wanted = False
                    break
        if name.startswith('chrUn'):
            wanted = False
        if wanted:
            maf_files.append(urllib.parse.urljoin(maf_path, name))
    ftp.close()
    return maf_files
158 | |
159 | |
def index_maf_files(maf_files, maf_path, options, params, target_directory):
    """Download each MAF file and build a maf index next to it.

    :param maf_files: URLs (or names joined against *maf_path*) of MAFs
    :param maf_path: base URL of the MAF directory
    :param options: parsed command-line options (unused, kept for interface)
    :param params: Galaxy data-manager params (unused, kept for interface)
    :param target_directory: directory to download and index into
    """
    for maf_file in maf_files:
        maf_url = urllib.parse.urljoin(maf_path, maf_file)
        local_maf = url_download(maf_url, localpath=target_directory)
        index_command = ['maf_build_index.py', local_maf, local_maf + '.index']
        # subprocess.run waits for completion; the original used
        # Popen + communicate and discarded the (un-piped, so None)
        # stdout/stderr it assigned.  Failures are still ignored, as before.
        subprocess.run(index_command)
167 | |
168 | |
def main():
    """Entry point: parse options, fetch and index MAFs, emit the data table."""
    parser = optparse.OptionParser()
    parser.add_option('-x', '--nexus', dest='nexus', action='store', type='string', help='URL for .nh')
    parser.add_option('-a', '--alignments', dest='alignments', action='store', type='string', help='URL for alignments')
    parser.add_option('-n', '--name', dest='name', action='store', type='string', help='Name')
    parser.add_option('-o', '--output', dest='output', action='store', type='string', help='Output')
    parser.add_option('-d', '--dbkey', dest='dbkey', action='store', type='string', help='dbkey')
    (options, args) = parser.parse_args()

    # Galaxy hands us a JSON params file that tells us where output goes.
    with open(options.output) as fh:
        params = json.load(fh)
    target_directory = params['output_data'][0]['extra_files_path']
    os.makedirs(target_directory, exist_ok=True)

    display_name, uid, species_list = generate_metadata(params, options)
    maf_path = urllib.parse.urljoin(options.nexus, 'maf/')
    maf_files = get_maf_listing(maf_path)

    species_csv = ','.join(species_list)
    data_manager_entry = {
        'data_tables': {
            'indexed_maf_files': {
                'name': display_name,
                'dbkey': options.dbkey,  # This is needed for the output path
                'value': uid,
                'indexed_for': species_csv,
                'exists_in_maf': species_csv,
                'path': ','.join(maf_file.split('/')[-1] for maf_file in maf_files),
            }
        }
    }

    # Fetch and index the MAFs, then hand the table entry back to Galaxy.
    index_maf_files(maf_files, maf_path, options, params, target_directory)
    with open(options.output, 'w') as fh:
        fh.write(json.dumps(data_manager_entry))
206 | |
207 | |
if __name__ == "__main__":
    # Script entry point.
    main()