Mercurial > repos > fubar > jbrowse2
comparison abjbrowse2.py @ 13:1d86925dbb4c draft
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 873a12803692b0a84814a6dc08331d772d0e5492-dirty
author | fubar |
---|---|
date | Mon, 22 Jan 2024 11:52:19 +0000 |
parents | |
children | 7c2e28e144f3 |
comparison
equal
deleted
inserted
replaced
12:247e17ce504b | 13:1d86925dbb4c |
---|---|
1 #!/usr/bin/env python | |
2 import argparse | |
3 import binascii | |
4 import datetime | |
5 import hashlib | |
6 import json | |
7 import logging | |
8 import os | |
9 import re | |
10 import shutil | |
11 import struct | |
12 import subprocess | |
13 import tempfile | |
14 import xml.etree.ElementTree as ET | |
15 from collections import defaultdict | |
16 | |
17 logging.basicConfig(level=logging.INFO) | |
18 log = logging.getLogger('jbrowse') | |
19 TODAY = datetime.datetime.now().strftime("%Y-%m-%d") | |
20 GALAXY_INFRASTRUCTURE_URL = None | |
21 | |
22 | |
class ColorScaling(object):
    """Build JBrowse colour callbacks and per-track colour configuration.

    Produces small JavaScript callback functions (as strings) that JBrowse
    evaluates per feature, plus plain style dicts for wiggle tracks.
    """

    # JS callback: colour components are fixed, opacity derived from score.
    COLOR_FUNCTION_TEMPLATE = """
    function(feature, variableName, glyphObject, track) {{
        var score = {score};
        {opacity}
        return 'rgba({red}, {green}, {blue}, ' + opacity + ')';
    }}
    """

    # JS callback: colour comes from a 'color' attribute found on the feature,
    # its parents or its children (falling back to an auto-generated colour);
    # opacity comes from the nearest 'score' attribute.
    COLOR_FUNCTION_TEMPLATE_QUAL = r"""
    function(feature, variableName, glyphObject, track) {{
        var search_up = function self(sf, attr){{
            if(sf.get(attr) !== undefined){{
                return sf.get(attr);
            }}
            if(sf.parent() === undefined) {{
                return;
            }}else{{
                return self(sf.parent(), attr);
            }}
        }};

        var search_down = function self(sf, attr){{
            if(sf.get(attr) !== undefined){{
                return sf.get(attr);
            }}
            if(sf.children() === undefined) {{
                return;
            }}else{{
                var kids = sf.children();
                for(var child_idx in kids){{
                    var x = self(kids[child_idx], attr);
                    if(x !== undefined){{
                        return x;
                    }}
                }}
                return;
            }}
        }};

        var color = ({user_spec_color} || search_up(feature, 'color') || search_down(feature, 'color') || {auto_gen_color});
        var score = (search_up(feature, 'score') || search_down(feature, 'score'));
        {opacity}
        if(score === undefined){{ opacity = 1; }}
        var result = /^#?([a-f\d]{{2}})([a-f\d]{{2}})([a-f\d]{{2}})$/i.exec(color);
        var red = parseInt(result[1], 16);
        var green = parseInt(result[2], 16);
        var blue = parseInt(result[3], 16);
        if(isNaN(opacity) || opacity < 0){{ opacity = 0; }}
        return 'rgba(' + red + ',' + green + ',' + blue + ',' + opacity + ')';
    }}
    """

    # JS snippets mapping a score to an opacity; formatted with min/max
    # (except 'blast', which uses a fixed e-value based formula).
    OPACITY_MATH = {
        'linear': """
            var opacity = (score - ({min})) / (({max}) - ({min}));
        """,
        'logarithmic': """
            var opacity = Math.log10(score - ({min})) / Math.log10(({max}) - ({min}));
        """,
        'blast': """
            var opacity = 0;
            if(score == 0.0) {{
                opacity = 1;
            }} else {{
                opacity = (20 - Math.log10(score)) / 180;
            }}
        """
    }

    # NOTE(review): class-level BREWER_COLOUR_IDX appears unused; instances
    # track their own position via self.brewer_colour_idx (see __init__).
    BREWER_COLOUR_IDX = 0
    # Qualitative Brewer palette entries, cycled through by _get_colours().
    BREWER_COLOUR_SCHEMES = [
        (166, 206, 227),
        (31, 120, 180),
        (178, 223, 138),
        (51, 160, 44),
        (251, 154, 153),
        (227, 26, 28),
        (253, 191, 111),
        (255, 127, 0),
        (202, 178, 214),
        (106, 61, 154),
        (255, 255, 153),
        (177, 89, 40),
        (228, 26, 28),
        (55, 126, 184),
        (77, 175, 74),
        (152, 78, 163),
        (255, 127, 0),
    ]

    # (start, end) hex colours of the Brewer diverging palettes.
    BREWER_DIVERGING_PALLETES = {
        'BrBg': ("#543005", "#003c30"),
        'PiYg': ("#8e0152", "#276419"),
        'PRGn': ("#40004b", "#00441b"),
        'PuOr': ("#7f3b08", "#2d004b"),
        'RdBu': ("#67001f", "#053061"),
        'RdGy': ("#67001f", "#1a1a1a"),
        'RdYlBu': ("#a50026", "#313695"),
        'RdYlGn': ("#a50026", "#006837"),
        'Spectral': ("#9e0142", "#5e4fa2"),
    }

    def __init__(self):
        # Per-instance cursor into BREWER_COLOUR_SCHEMES.
        self.brewer_colour_idx = 0

    def rgb_from_hex(self, hexstr):
        """Return an (r, g, b) tuple for a 6-digit hex string (no '#')."""
        # http://stackoverflow.com/questions/4296249/how-do-i-convert-a-hex-triplet-to-an-rgb-tuple-and-back
        return struct.unpack('BBB', binascii.unhexlify(hexstr))

    def min_max_gff(self, gff_file):
        """Scan a GFF file and return (min, max) of the score column (#6).

        Lines with a missing or non-numeric score are skipped.  Returns
        (None, None) when no line carries a parseable score.
        """
        min_val = None
        max_val = None
        with open(gff_file, 'r') as handle:
            for line in handle:
                try:
                    value = float(line.split('\t')[5])
                except (ValueError, IndexError):
                    continue
                # Fix: the previous `min(value, (min_val or value))` silently
                # discarded a legitimate minimum of 0.0 because 0.0 is falsy.
                if min_val is None or value < min_val:
                    min_val = value
                if max_val is None or value > max_val:
                    max_val = value
        return min_val, max_val

    def hex_from_rgb(self, r, g, b):
        """Return '#rrggbb' for integer colour components."""
        return '#%02x%02x%02x' % (r, g, b)

    def _get_colours(self):
        """Return the next Brewer palette (r, g, b), cycling when exhausted."""
        r, g, b = self.BREWER_COLOUR_SCHEMES[self.brewer_colour_idx % len(self.BREWER_COLOUR_SCHEMES)]
        self.brewer_colour_idx += 1
        return r, g, b

    def parse_menus(self, track):
        """Build a JBrowse menuTemplate config from the track's <menus> node.

        The four leading empty dicts preserve JBrowse's default menu items;
        user-defined entries are appended after them.
        """
        trackConfig = {'menuTemplate': [{}, {}, {}, {}]}

        if 'menu' in track['menus']:
            # A single <menu> arrives as a dict, several as a list; normalise.
            menu_list = [track['menus']['menu']]
            if isinstance(track['menus']['menu'], list):
                menu_list = track['menus']['menu']

            for m in menu_list:
                tpl = {
                    'action': m['action'],
                    'label': m.get('label', '{name}'),
                    'iconClass': m.get('iconClass', 'dijitIconBookmark'),
                }
                if 'url' in m:
                    tpl['url'] = m['url']
                if 'content' in m:
                    tpl['content'] = m['content']
                if 'title' in m:
                    tpl['title'] = m['title']

                trackConfig['menuTemplate'].append(tpl)

        return trackConfig

    def parse_colours(self, track, trackFormat, gff3=None):
        """Build the colour portion of a track config.

        track       -- parsed track options (wiggle/scaling sub-dicts)
        trackFormat -- 'wiggle', 'blast', 'gene_calls', ...
        gff3        -- path to the GFF3 file, used to derive min/max scores
                       for automatic scaling of gene_calls tracks
        """
        # Wiggle tracks have a bicolor pallete
        trackConfig = {'style': {}}
        if trackFormat == 'wiggle':

            trackConfig['style']['pos_color'] = track['wiggle']['color_pos']
            trackConfig['style']['neg_color'] = track['wiggle']['color_neg']

            if trackConfig['style']['pos_color'] == '__auto__':
                trackConfig['style']['neg_color'] = self.hex_from_rgb(*self._get_colours())
                trackConfig['style']['pos_color'] = self.hex_from_rgb(*self._get_colours())

            # Wiggle tracks can change colour at a specified place
            bc_pivot = track['wiggle']['bicolor_pivot']
            if bc_pivot not in ('mean', 'zero'):
                # The values are either one of those two strings
                # or a number
                bc_pivot = float(bc_pivot)
            trackConfig['bicolor_pivot'] = bc_pivot
        elif 'scaling' in track:
            if track['scaling']['method'] == 'ignore':
                # Static colour: user-specified or auto-generated.
                if track['scaling']['scheme']['color'] != '__auto__':
                    trackConfig['style']['color'] = track['scaling']['scheme']['color']
                else:
                    trackConfig['style']['color'] = self.hex_from_rgb(*self._get_colours())
            else:
                # Scored method
                algo = track['scaling']['algo']
                # linear, logarithmic, blast
                scales = track['scaling']['scales']
                # type __auto__, manual (min, max)
                scheme = track['scaling']['scheme']
                # scheme -> (type (opacity), color)
                # ==================================
                # GENE CALLS OR BLAST
                # ==================================
                if trackFormat == 'blast':
                    red, green, blue = self._get_colours()
                    color_function = self.COLOR_FUNCTION_TEMPLATE.format(**{
                        'score': "feature._parent.get('score')",
                        'opacity': self.OPACITY_MATH['blast'],
                        'red': red,
                        'green': green,
                        'blue': blue,
                    })
                    trackConfig['style']['color'] = color_function.replace('\n', '')
                elif trackFormat == 'gene_calls':
                    # Default values, based on GFF3 spec
                    min_val = 0
                    max_val = 1000
                    # Get min/max and build a scoring function since JBrowse doesn't
                    if scales['type'] == 'automatic' or scales['type'] == '__auto__':
                        min_val, max_val = self.min_max_gff(gff3)
                    else:
                        min_val = scales.get('min', 0)
                        max_val = scales.get('max', 1000)

                    if scheme['color'] == '__auto__':
                        user_color = 'undefined'
                        auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())
                    elif scheme['color'].startswith('#'):
                        user_color = "'%s'" % self.hex_from_rgb(*self.rgb_from_hex(scheme['color'][1:]))
                        auto_color = 'undefined'
                    else:
                        user_color = 'undefined'
                        auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())

                    color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format(**{
                        'opacity': self.OPACITY_MATH[algo].format(**{'max': max_val, 'min': min_val}),
                        'user_spec_color': user_color,
                        'auto_gen_color': auto_color,
                    })

                    trackConfig['style']['color'] = color_function.replace('\n', '')
        return trackConfig
261 | |
262 | |
def etree_to_dict(t):
    """Recursively convert an ElementTree element into a plain dict.

    Attributes become '@'-prefixed keys, repeated child tags collapse into
    lists, and text content lands either directly under the tag (leaf) or
    under '#text' (element with attributes/children).  None yields {}.
    """
    if t is None:
        return {}

    result = {t.tag: {} if t.attrib else None}
    kids = list(t)
    if kids:
        grouped = defaultdict(list)
        for child_dict in map(etree_to_dict, kids):
            for key, val in child_dict.items():
                grouped[key].append(val)
        # A single occurrence stays scalar; repeats become a list.
        result = {t.tag: {key: vals[0] if len(vals) == 1 else vals
                          for key, vals in grouped.items()}}
    if t.attrib:
        result[t.tag].update(('@' + key, val) for key, val in t.attrib.items())
    if t.text:
        stripped = t.text.strip()
        if kids or t.attrib:
            if stripped:
                result[t.tag]['#text'] = stripped
        else:
            result[t.tag] = stripped
    return result
285 | |
286 | |
287 # score comes from feature._parent.get('score') or feature.get('score') | |
288 | |
289 INSTALLED_TO = os.path.dirname(os.path.realpath(__file__)) | |
290 | |
291 | |
def metadata_from_node(node):
    """Flatten a Galaxy metadata XML node into a flat dict.

    Attributes of the child elements <dataset>, <history>, <metadata> and
    <tool> are copied into the result with a matching prefix (dataset_*,
    history_*, metadata_*, tool_*).  Several fields are then rewritten into
    HTML links for display inside JBrowse.

    Returns an empty dict when the node does not carry exactly one
    <dataset> child (or when the lookup itself fails).
    """
    metadata = {}
    try:
        if len(node.findall('dataset')) != 1:
            # exit early
            return metadata
    except Exception:
        return {}

    for (key, value) in node.findall('dataset')[0].attrib.items():
        metadata['dataset_%s' % key] = value

    # NOTE(review): the lookups below assume <history>, <metadata> and <tool>
    # children always exist -- findall(...)[0] raises IndexError otherwise;
    # confirm against the XML this tool actually produces.
    for (key, value) in node.findall('history')[0].attrib.items():
        metadata['history_%s' % key] = value

    for (key, value) in node.findall('metadata')[0].attrib.items():
        metadata['metadata_%s' % key] = value

    for (key, value) in node.findall('tool')[0].attrib.items():
        metadata['tool_%s' % key] = value

    # Additional Mappings applied:
    metadata['dataset_edam_format'] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(metadata['dataset_edam_format'], metadata['dataset_file_ext'])
    metadata['history_user_email'] = '<a href="mailto:{0}">{0}</a>'.format(metadata['history_user_email'])
    # GALAXY_INFRASTRUCTURE_URL is a module-level global populated elsewhere.
    metadata['history_display_name'] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_hist_id=metadata['history_id'],
        hist_name=metadata['history_display_name']
    )
    metadata['tool_tool'] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_id=metadata['dataset_id'],
        tool_id=metadata['tool_tool_id'],
        # tool_version=metadata['tool_tool_version'],
    )
    return metadata
328 | |
329 | |
330 class JbrowseConnector(object): | |
331 | |
    def __init__(self, jbrowse, outdir, genomes):
        """Set up a JBrowse2 instance output directory.

        jbrowse -- path of the JBrowse2 installation to clone from
        genomes -- list of dicts, each with 'path' (FASTA) and 'label'
        outdir  -- directory the instance is generated into
        """
        self.cs = ColorScaling()
        self.jbrowse = jbrowse
        self.outdir = outdir
        self.genome_paths = genomes
        # Track ids queued for text indexing (consumed by text_index()).
        self.tracksToIndex = []

        # This is the id of the current assembly
        self.assembly_ids = {}
        self.current_assembly_id = []

        # If upgrading, look at the existing data
        self.check_existing(self.outdir)

        # clone_jbrowse is defined elsewhere in this file.
        self.clone_jbrowse(self.jbrowse, self.outdir)

        self.process_genomes()
349 | |
350 def subprocess_check_call(self, command, output=None): | |
351 if output: | |
352 log.debug('cd %s && %s > %s', self.outdir, ' '.join(command), output) | |
353 subprocess.check_call(command, cwd=self.outdir, stdout=output) | |
354 else: | |
355 log.debug('cd %s && %s', self.outdir, ' '.join(command)) | |
356 subprocess.check_call(command, cwd=self.outdir) | |
357 | |
358 def subprocess_popen(self, command): | |
359 log.debug('cd %s && %s', self.outdir, command) | |
360 p = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
361 output, err = p.communicate() | |
362 retcode = p.returncode | |
363 if retcode != 0: | |
364 log.error('cd %s && %s', self.outdir, command) | |
365 log.error(output) | |
366 log.error(err) | |
367 raise RuntimeError("Command failed with exit code %s" % (retcode)) | |
368 | |
369 def subprocess_check_output(self, command): | |
370 log.debug('cd %s && %s', self.outdir, ' '.join(command)) | |
371 return subprocess.check_output(command, cwd=self.outdir) | |
372 | |
373 def symlink_or_copy(self, src, dest): | |
374 if 'GALAXY_JBROWSE_SYMLINKS' in os.environ and bool(os.environ['GALAXY_JBROWSE_SYMLINKS']): | |
375 cmd = ['ln', '-s', src, dest] | |
376 else: | |
377 cmd = ['cp', src, dest] | |
378 | |
379 return self.subprocess_check_call(cmd) | |
380 | |
381 def symlink_or_copy_load_action(self): | |
382 if 'GALAXY_JBROWSE_SYMLINKS' in os.environ and bool(os.environ['GALAXY_JBROWSE_SYMLINKS']): | |
383 return 'symlink' | |
384 else: | |
385 return 'copy' | |
386 | |
387 def check_existing(self, destination): | |
388 existing = os.path.join(destination, 'data', "config.json") | |
389 if os.path.exists(existing): | |
390 with open(existing, 'r') as existing_conf: | |
391 conf = json.load(existing_conf) | |
392 if 'assemblies' in conf: | |
393 for assembly in conf['assemblies']: | |
394 if 'name' in assembly: | |
395 self.assembly_ids[assembly['name']] = None | |
396 | |
    def process_genomes(self):
        """Register each input genome as an assembly via add_assembly()."""
        for genome_node in self.genome_paths:
            # We only expect one input genome per run. This for loop is just
            # easier to write than the alternative / catches any possible
            # issues.
            self.add_assembly(genome_node['path'], genome_node['label'])
403 | |
    def add_assembly(self, path, label, default=True):
        """Copy, bgzip-compress and faidx-index a FASTA, then register it
        with `jbrowse add-assembly`.

        path    -- input FASTA file
        label   -- requested assembly name (suffixed if already in use)
        default -- when True this assembly becomes current_assembly_id

        Returns the unique label actually registered.
        """
        # Find a non-existing filename for the new genome
        # (to avoid colision when upgrading an existing instance)
        rel_seq_path = os.path.join('data', 'assembly')
        seq_path = os.path.join(self.outdir, rel_seq_path)
        fn_try = 1
        while (os.path.exists(seq_path + '.fasta') or os.path.exists(seq_path + '.fasta.gz')
                or os.path.exists(seq_path + '.fasta.gz.fai') or os.path.exists(seq_path + '.fasta.gz.gzi')):
            rel_seq_path = os.path.join('data', 'assembly%s' % fn_try)
            seq_path = os.path.join(self.outdir, rel_seq_path)
            fn_try += 1

        # Find a non-existing label for the new genome
        # (to avoid colision when upgrading an existing instance)
        lab_try = 1
        uniq_label = label
        while uniq_label in self.assembly_ids:
            uniq_label = label + str(lab_try)
            lab_try += 1

        # Find a default scaffold to display
        # TODO this may not be necessary in the future, see https://github.com/GMOD/jbrowse-components/issues/2708
        with open(path, 'r') as fa_handle:
            # First header line, '>' stripped, up to the first space.
            fa_header = fa_handle.readline()[1:].strip().split(' ')[0]

        self.assembly_ids[uniq_label] = fa_header
        if default:
            self.current_assembly_id = uniq_label

        copied_genome = seq_path + '.fasta'
        shutil.copy(path, copied_genome)

        # Compress with bgzip
        cmd = ['bgzip', copied_genome]
        self.subprocess_check_call(cmd)

        # FAI Index
        cmd = ['samtools', 'faidx', copied_genome + '.gz']
        self.subprocess_check_call(cmd)

        self.subprocess_check_call([
            'jbrowse', 'add-assembly',
            '--load', 'inPlace',
            '--name', uniq_label,
            '--type', 'bgzipFasta',
            '--target', os.path.join(self.outdir, 'data'),
            '--skipCheck',
            rel_seq_path + '.fasta.gz'])

        return uniq_label
454 | |
455 def text_index(self): | |
456 # Index tracks | |
457 args = [ | |
458 'jbrowse', 'text-index', | |
459 '--target', os.path.join(self.outdir, 'data'), | |
460 '--assemblies', self.current_assembly_id, | |
461 ] | |
462 | |
463 tracks = ','.join(self.tracksToIndex) | |
464 if tracks: | |
465 args += ['--tracks', tracks] | |
466 | |
467 self.subprocess_check_call(args) | |
468 | |
    def _blastxml_to_gff3(self, xml, min_gap=10):
        """Convert Blast XML into gapped GFF3 via the bundled
        blastxml_to_gapped_gff3.py helper script.

        Returns the path of a temporary GFF3 file; delete=False means the
        caller is responsible for unlinking it when done.
        """
        gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
        cmd = ['python', os.path.join(INSTALLED_TO, 'blastxml_to_gapped_gff3.py'),
               '--trim', '--trim_end', '--include_seq', '--min_gap', str(min_gap), xml]
        log.debug('cd %s && %s > %s', self.outdir, ' '.join(cmd), gff3_unrebased.name)
        subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
        gff3_unrebased.close()
        return gff3_unrebased.name
477 | |
478 def _prepare_track_style(self, xml_conf): | |
479 | |
480 style_data = { | |
481 "type": "LinearBasicDisplay" | |
482 } | |
483 | |
484 if 'display' in xml_conf['style']: | |
485 style_data['type'] = xml_conf['style']['display'] | |
486 del xml_conf['style']['display'] | |
487 | |
488 style_data['displayId'] = "%s_%s" % (xml_conf['label'], style_data['type']) | |
489 | |
490 style_data.update(xml_conf['style']) | |
491 | |
492 return {'displays': [style_data]} | |
493 | |
    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        """Add a Blast XML result as a GFF3 track.

        Optionally rebases the features onto a parent GFF3
        (blastOpts['parent']), scaling protein coordinates to DNA when
        blastOpts['protein'] == 'true'.
        """
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts['min_gap'])

        if 'parent' in blastOpts and blastOpts['parent'] != 'None':
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ['python', os.path.join(INSTALLED_TO, 'gff3_rebase.py')]
            if blastOpts.get('protein', 'false') == 'true':
                cmd.append('--protein2dna')
            cmd.extend([os.path.realpath(blastOpts['parent']), gff3])
            log.debug('cd %s && %s > %s', self.outdir, ' '.join(cmd), gff3_rebased.name)
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)

        rel_dest = os.path.join('data', trackData['label'] + '.gff')
        dest = os.path.join(self.outdir, rel_dest)

        # Sort + bgzip + tabix into dest, then drop the temp gff3.
        self._sort_gff(gff3, dest)
        os.unlink(gff3)

        style_json = self._prepare_track_style(trackData)

        self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json)
520 | |
521 def add_bigwig(self, data, trackData, wiggleOpts, **kwargs): | |
522 | |
523 rel_dest = os.path.join('data', trackData['label'] + '.bw') | |
524 dest = os.path.join(self.outdir, rel_dest) | |
525 self.symlink_or_copy(os.path.realpath(data), dest) | |
526 | |
527 style_json = self._prepare_track_style(trackData) | |
528 | |
529 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) | |
530 | |
531 # Anything ending in "am" (Bam or Cram) | |
532 def add_xam(self, data, trackData, xamOpts, index=None, ext="bam", **kwargs): | |
533 | |
534 index_ext = "bai" | |
535 if ext == "cram": | |
536 index_ext = "crai" | |
537 | |
538 rel_dest = os.path.join('data', trackData['label'] + '.%s' % ext) | |
539 dest = os.path.join(self.outdir, rel_dest) | |
540 | |
541 self.symlink_or_copy(os.path.realpath(data), dest) | |
542 | |
543 if index is not None and os.path.exists(os.path.realpath(index)): | |
544 # xai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest | |
545 self.subprocess_check_call(['cp', os.path.realpath(index), dest + '.%s' % index_ext]) | |
546 else: | |
547 # Can happen in exotic condition | |
548 # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam | |
549 # => no index generated by galaxy, but there might be one next to the symlink target | |
550 # this trick allows to skip the bam sorting made by galaxy if already done outside | |
551 if os.path.exists(os.path.realpath(data) + '.%s' % index_ext): | |
552 self.symlink_or_copy(os.path.realpath(data) + '.%s' % index_ext, dest + '.%s' % index_ext) | |
553 else: | |
554 log.warn('Could not find a bam index (.%s file) for %s', (index_ext, data)) | |
555 | |
556 style_json = self._prepare_track_style(trackData) | |
557 | |
558 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) | |
559 | |
560 def add_vcf(self, data, trackData, vcfOpts={}, zipped=False, **kwargs): | |
561 | |
562 if zipped: | |
563 rel_dest = os.path.join('data', trackData['label'] + '.vcf.gz') | |
564 dest = os.path.join(self.outdir, rel_dest) | |
565 shutil.copy(os.path.realpath(data), dest) | |
566 else: | |
567 rel_dest = os.path.join('data', trackData['label'] + '.vcf') | |
568 dest = os.path.join(self.outdir, rel_dest) | |
569 shutil.copy(os.path.realpath(data), dest) | |
570 | |
571 cmd = ['bgzip', dest] | |
572 self.subprocess_check_call(cmd) | |
573 cmd = ['tabix', dest + '.gz'] | |
574 self.subprocess_check_call(cmd) | |
575 | |
576 rel_dest = os.path.join('data', trackData['label'] + '.vcf.gz') | |
577 | |
578 style_json = self._prepare_track_style(trackData) | |
579 | |
580 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) | |
581 | |
582 def add_gff(self, data, format, trackData, gffOpts, **kwargs): | |
583 rel_dest = os.path.join('data', trackData['label'] + '.gff') | |
584 dest = os.path.join(self.outdir, rel_dest) | |
585 | |
586 self._sort_gff(data, dest) | |
587 | |
588 style_json = self._prepare_track_style(trackData) | |
589 | |
590 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json) | |
591 | |
592 def add_bed(self, data, format, trackData, gffOpts, **kwargs): | |
593 rel_dest = os.path.join('data', trackData['label'] + '.bed') | |
594 dest = os.path.join(self.outdir, rel_dest) | |
595 | |
596 self._sort_bed(data, dest) | |
597 | |
598 style_json = self._prepare_track_style(trackData) | |
599 | |
600 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest + '.gz', config=style_json) | |
601 | |
602 def add_paf(self, data, trackData, pafOpts, **kwargs): | |
603 rel_dest = os.path.join('data', trackData['label'] + '.paf') | |
604 dest = os.path.join(self.outdir, rel_dest) | |
605 | |
606 self.symlink_or_copy(os.path.realpath(data), dest) | |
607 | |
608 added_assembly = self.add_assembly(pafOpts['genome'], pafOpts['genome_label'], default=False) | |
609 | |
610 style_json = self._prepare_track_style(trackData) | |
611 | |
612 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, assemblies=[self.current_assembly_id, added_assembly], config=style_json) | |
613 | |
614 def add_hic(self, data, trackData, hicOpts, **kwargs): | |
615 rel_dest = os.path.join('data', trackData['label'] + '.hic') | |
616 dest = os.path.join(self.outdir, rel_dest) | |
617 | |
618 self.symlink_or_copy(os.path.realpath(data), dest) | |
619 | |
620 style_json = self._prepare_track_style(trackData) | |
621 | |
622 self._add_track(trackData['label'], trackData['key'], trackData['category'], rel_dest, config=style_json) | |
623 | |
624 def add_sparql(self, url, query, query_refnames, trackData): | |
625 | |
626 json_track_data = { | |
627 "type": "FeatureTrack", | |
628 "trackId": id, | |
629 "name": trackData['label'], | |
630 "adapter": { | |
631 "type": "SPARQLAdapter", | |
632 "endpoint": { | |
633 "uri": url, | |
634 "locationType": "UriLocation" | |
635 }, | |
636 "queryTemplate": query | |
637 }, | |
638 "category": [ | |
639 trackData['category'] | |
640 ], | |
641 "assemblyNames": [ | |
642 self.current_assembly_id | |
643 ] | |
644 } | |
645 | |
646 if query_refnames: | |
647 json_track_data['adapter']['refNamesQueryTemplate']: query_refnames | |
648 | |
649 self.subprocess_check_call([ | |
650 'jbrowse', 'add-track-json', | |
651 '--target', os.path.join(self.outdir, 'data'), | |
652 json_track_data]) | |
653 | |
654 # Doesn't work as of 1.6.4, might work in the future | |
655 # self.subprocess_check_call([ | |
656 # 'jbrowse', 'add-track', | |
657 # '--trackType', 'sparql', | |
658 # '--name', trackData['label'], | |
659 # '--category', trackData['category'], | |
660 # '--target', os.path.join(self.outdir, 'data'), | |
661 # '--trackId', id, | |
662 # '--config', '{"queryTemplate": "%s"}' % query, | |
663 # url]) | |
664 | |
665 def _add_track(self, id, label, category, path, assemblies=[], config=None): | |
666 | |
667 assemblies_opt = self.current_assembly_id | |
668 if assemblies: | |
669 assemblies_opt = ','.join(assemblies) | |
670 | |
671 cmd = [ | |
672 'jbrowse', 'add-track', | |
673 '--load', 'inPlace', | |
674 '--name', label, | |
675 '--category', category, | |
676 '--target', os.path.join(self.outdir, 'data'), | |
677 '--trackId', id, | |
678 '--assemblyNames', assemblies_opt | |
679 ] | |
680 | |
681 if config: | |
682 cmd.append('--config') | |
683 cmd.append(json.dumps(config)) | |
684 | |
685 cmd.append(path) | |
686 | |
687 self.subprocess_check_call(cmd) | |
688 | |
    def _sort_gff(self, data, dest):
        """Sort a GFF3 into *dest* with gff3sort.pl, then bgzip + tabix.

        The grep strips empty lines, which would otherwise break tabix.
        """
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" % (data, dest)
            self.subprocess_popen(cmd)

        self.subprocess_check_call(['bgzip', '-f', dest])
        self.subprocess_check_call(['tabix', '-f', '-p', 'gff', dest + '.gz'])
697 | |
698 def _sort_bed(self, data, dest): | |
699 # Only index if not already done | |
700 if not os.path.exists(dest): | |
701 cmd = ['sort', '-k1,1', '-k2,2n', data] | |
702 with open(dest, 'w') as handle: | |
703 self.subprocess_check_call(cmd, output=handle) | |
704 | |
705 self.subprocess_check_call(['bgzip', '-f', dest]) | |
706 self.subprocess_check_call(['tabix', '-f', '-p', 'bed', dest + '.gz']) | |
707 | |
    def process_annotations(self, track):
        """Add every dataset of one track definition to the instance.

        *track* is a dict built from the job XML: 'category', 'style',
        'conf' (raw tool options) and 'trackfiles', a list of
        (path, ext, human_label, extra_metadata) tuples.

        Generator: dispatches each dataset to the matching add_* method by
        its Galaxy datatype extension and yields the generated
        (hash-based) track label.
        """

        category = track['category'].replace('__pd__date__pd__', TODAY)
        outputTrackConfig = {
            'category': category,
        }

        # Galaxy sanitizes element identifiers; this maps the placeholder
        # sequences back to the original characters.
        mapped_chars = {
            '>': '__gt__',
            '<': '__lt__',
            "'": '__sq__',
            '"': '__dq__',
            '[': '__ob__',
            ']': '__cb__',
            '{': '__oc__',
            '}': '__cc__',
            '@': '__at__',
            '#': '__pd__',
            "": '__cn__'
        }

        for i, (dataset_path, dataset_ext, track_human_label, extra_metadata) in enumerate(track['trackfiles']):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)

            log.info('Processing track %s / %s (%s)', category, track_human_label, dataset_ext)
            outputTrackConfig['key'] = track_human_label
            # We add extra data to hash for the case of REST + SPARQL.
            if 'conf' in track and 'options' in track['conf'] and 'url' in track['conf']['options']:
                rest_url = track['conf']['options']['url']
            else:
                rest_url = ''

            # I chose to use track['category'] instead of 'category' here. This
            # is intentional. This way re-running the tool on a different date
            # will not generate different hashes and make comparison of outputs
            # much simpler.
            hashData = [str(dataset_path), track_human_label, track['category'], rest_url, self.current_assembly_id]
            hashData = '|'.join(hashData).encode('utf-8')
            outputTrackConfig['label'] = hashlib.md5(hashData).hexdigest() + '_%s' % i
            outputTrackConfig['metadata'] = extra_metadata

            outputTrackConfig['style'] = track['style']

            if 'menus' in track['conf']['options']:
                menus = self.cs.parse_menus(track['conf']['options'])
                outputTrackConfig.update(menus)

            # Dispatch on the Galaxy datatype extension.
            if dataset_ext in ('gff', 'gff3'):
                self.add_gff(dataset_path, dataset_ext, outputTrackConfig,
                             track['conf']['options']['gff'])
            elif dataset_ext == 'bed':
                self.add_bed(dataset_path, dataset_ext, outputTrackConfig,
                             track['conf']['options']['gff'])
            elif dataset_ext == 'bigwig':
                self.add_bigwig(dataset_path, outputTrackConfig,
                                track['conf']['options']['wiggle'])
            elif dataset_ext == 'bam':
                real_indexes = track['conf']['options']['pileup']['bam_indices']['bam_index']
                if not isinstance(real_indexes, list):
                    # <bam_indices>
                    #  <bam_index>/path/to/a.bam.bai</bam_index>
                    # </bam_indices>
                    #
                    # The above will result in the 'bam_index' key containing a
                    # string. If there are two or more indices, the container
                    # becomes a list. Fun!
                    real_indexes = [real_indexes]

                self.add_xam(dataset_path, outputTrackConfig,
                             track['conf']['options']['pileup'],
                             index=real_indexes[i], ext="bam")
            elif dataset_ext == 'cram':
                real_indexes = track['conf']['options']['cram']['cram_indices']['cram_index']
                if not isinstance(real_indexes, list):
                    # Same single-string vs list XML quirk as for bam above.
                    real_indexes = [real_indexes]

                self.add_xam(dataset_path, outputTrackConfig,
                             track['conf']['options']['cram'],
                             index=real_indexes[i], ext="cram")
            elif dataset_ext == 'blastxml':
                self.add_blastxml(dataset_path, outputTrackConfig, track['conf']['options']['blast'])
            elif dataset_ext == 'vcf':
                self.add_vcf(dataset_path, outputTrackConfig)
            elif dataset_ext == 'vcf_bgzip':
                self.add_vcf(dataset_path, outputTrackConfig, zipped=True)
            elif dataset_ext == 'rest':
                # NOTE(review): add_rest is not defined in this part of the
                # file -- confirm it exists elsewhere before relying on it.
                self.add_rest(track['conf']['options']['rest']['url'], outputTrackConfig)
            elif dataset_ext == 'synteny':
                self.add_paf(dataset_path, outputTrackConfig,
                             track['conf']['options']['synteny'])
            elif dataset_ext == 'hic':
                self.add_hic(dataset_path, outputTrackConfig,
                             track['conf']['options']['hic'])
            elif dataset_ext == 'sparql':
                sparql_query = track['conf']['options']['sparql']['query']
                for key, value in mapped_chars.items():
                    sparql_query = sparql_query.replace(value, key)
                sparql_query_refnames = track['conf']['options']['sparql']['query_refnames']
                for key, value in mapped_chars.items():
                    sparql_query_refnames = sparql_query_refnames.replace(value, key)
                self.add_sparql(track['conf']['options']['sparql']['url'], sparql_query, sparql_query_refnames, outputTrackConfig)
            else:
                log.warn('Do not know how to handle %s', dataset_ext)

            # Return non-human label for use in other fields
            yield outputTrackConfig['label']
823 | |
824 def add_default_session(self, data): | |
825 """ | |
826 Add some default session settings: set some assemblies/tracks on/off | |
827 """ | |
828 tracks_data = [] | |
829 | |
830 # TODO using the default session for now, but check out session specs in the future https://github.com/GMOD/jbrowse-components/issues/2708 | |
831 | |
832 # We need to know the track type from the config.json generated just before | |
833 config_path = os.path.join(self.outdir, 'data', 'config.json') | |
834 track_types = {} | |
835 with open(config_path, 'r') as config_file: | |
836 config_json = json.load(config_file) | |
837 | |
838 for track_conf in config_json['tracks']: | |
839 track_types[track_conf['trackId']] = track_conf['type'] | |
840 | |
841 for on_track in data['visibility']['default_on']: | |
842 # TODO several problems with this currently | |
843 # - we are forced to copy the same kind of style config as the per track config from _prepare_track_style (not exactly the same though) | |
844 # - we get an error when refreshing the page | |
845 # - this could be solved by session specs, see https://github.com/GMOD/jbrowse-components/issues/2708 | |
846 style_data = { | |
847 "type": "LinearBasicDisplay", | |
848 "height": 100 | |
849 } | |
850 | |
851 if on_track in data['style']: | |
852 if 'display' in data['style'][on_track]: | |
853 style_data['type'] = data['style'][on_track]['display'] | |
854 del data['style'][on_track]['display'] | |
855 | |
856 style_data.update(data['style'][on_track]) | |
857 | |
858 if on_track in data['style_labels']: | |
859 # TODO fix this: it should probably go in a renderer block (SvgFeatureRenderer) but still does not work | |
860 # TODO move this to per track displays? | |
861 style_data['labels'] = data['style_labels'][on_track] | |
862 | |
863 tracks_data.append({ | |
864 "type": track_types[on_track], | |
865 "configuration": on_track, | |
866 "displays": [ | |
867 style_data | |
868 ] | |
869 }) | |
870 | |
871 # The view for the assembly we're adding | |
872 view_json = { | |
873 "type": "LinearGenomeView", | |
874 "tracks": tracks_data | |
875 } | |
876 | |
877 refName = None | |
878 if data.get('defaultLocation', ''): | |
879 loc_match = re.search(r'^(\w+):(\d+)\.+(\d+)$', data['defaultLocation']) | |
880 if loc_match: | |
881 refName = loc_match.group(1) | |
882 start = int(loc_match.group(2)) | |
883 end = int(loc_match.group(3)) | |
884 elif self.assembly_ids[self.current_assembly_id] is not None: | |
885 refName = self.assembly_ids[self.current_assembly_id] | |
886 start = 0 | |
887 end = 1000000 # Booh, hard coded! waiting for https://github.com/GMOD/jbrowse-components/issues/2708 | |
888 | |
889 if refName is not None: | |
890 # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome | |
891 view_json['displayedRegions'] = [{ | |
892 "refName": refName, | |
893 "start": start, | |
894 "end": end, | |
895 "reversed": False, | |
896 "assemblyName": self.current_assembly_id | |
897 }] | |
898 | |
899 session_name = data.get('session_name', "New session") | |
900 if not session_name: | |
901 session_name = "New session" | |
902 | |
903 # Merge with possibly existing defaultSession (if upgrading a jbrowse instance) | |
904 session_json = {} | |
905 if 'defaultSession' in config_json: | |
906 session_json = config_json['defaultSession'] | |
907 | |
908 session_json["name"] = session_name | |
909 | |
910 if 'views' not in session_json: | |
911 session_json['views'] = [] | |
912 | |
913 session_json['views'].append(view_json) | |
914 | |
915 config_json['defaultSession'] = session_json | |
916 | |
917 with open(config_path, 'w') as config_file: | |
918 json.dump(config_json, config_file, indent=2) | |
919 | |
920 def add_general_configuration(self, data): | |
921 """ | |
922 Add some general configuration to the config.json file | |
923 """ | |
924 | |
925 config_path = os.path.join(self.outdir, 'data', 'config.json') | |
926 with open(config_path, 'r') as config_file: | |
927 config_json = json.load(config_file) | |
928 | |
929 config_data = {} | |
930 | |
931 config_data['disableAnalytics'] = data.get('analytics', 'false') == 'true' | |
932 | |
933 config_data['theme'] = { | |
934 "palette": { | |
935 "primary": { | |
936 "main": data.get('primary_color', '#0D233F') | |
937 }, | |
938 "secondary": { | |
939 "main": data.get('secondary_color', '#721E63') | |
940 }, | |
941 "tertiary": { | |
942 "main": data.get('tertiary_color', '#135560') | |
943 }, | |
944 "quaternary": { | |
945 "main": data.get('quaternary_color', '#FFB11D') | |
946 }, | |
947 }, | |
948 "typography": { | |
949 "fontSize": int(data.get('font_size', 10)) | |
950 }, | |
951 } | |
952 | |
953 config_json['configuration'].update(config_data) | |
954 | |
955 with open(config_path, 'w') as config_file: | |
956 json.dump(config_json, config_file, indent=2) | |
957 | |
958 def clone_jbrowse(self, jbrowse_dir, destination): | |
959 """Clone a JBrowse directory into a destination directory. | |
960 """ | |
961 | |
962 copytree(jbrowse_dir, destination) | |
963 | |
964 try: | |
965 shutil.rmtree(os.path.join(destination, 'test_data')) | |
966 except OSError as e: | |
967 log.error("Error: %s - %s." % (e.filename, e.strerror)) | |
968 | |
969 if not os.path.exists(os.path.join(destination, 'data')): | |
970 # It can already exist if upgrading an instance | |
971 os.makedirs(os.path.join(destination, 'data')) | |
972 log.info("makedir %s" % (os.path.join(destination, 'data'))) | |
973 | |
974 os.symlink('./data/config.json', os.path.join(destination, 'config.json')) | |
975 | |
976 | |
def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of *src* into the existing directory *dst*.

    Unlike shutil.copytree, the destination directory itself is not
    created, so this can populate a directory that already exists.
    Subdirectories are delegated to shutil.copytree; plain files are
    copied with metadata via shutil.copy2.
    """
    for entry in os.listdir(src):
        src_path = os.path.join(src, entry)
        dst_path = os.path.join(dst, entry)
        if os.path.isdir(src_path):
            shutil.copytree(src_path, dst_path, symlinks, ignore)
        else:
            shutil.copy2(src_path, dst_path)
985 | |
986 | |
987 def parse_style_conf(item): | |
988 if 'type' in item.attrib and item.attrib['type'] in ['boolean', 'integer']: | |
989 if item.attrib['type'] == 'boolean': | |
990 return item.text in ("yes", "true", "True") | |
991 elif item.attrib['type'] == 'integer': | |
992 return int(item.text) | |
993 else: | |
994 return item.text | |
995 | |
996 | |
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument('xml', type=argparse.FileType('r'), help='Track Configuration')

    parser.add_argument('--jbrowse', help='Folder containing a jbrowse release')
    parser.add_argument('--outdir', help='Output directory', default='out')
    parser.add_argument('--version', '-V', action='version', version="%(prog)s 0.8.0")
    args = parser.parse_args()

    tree = ET.parse(args.xml.name)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find('metadata/galaxyUrl').text
    # Sometimes this comes as `localhost` without a protocol
    if not GALAXY_INFRASTRUCTURE_URL.startswith('http'):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = 'http://' + GALAXY_INFRASTRUCTURE_URL

    jc = JbrowseConnector(
        jbrowse=args.jbrowse,
        outdir=args.outdir,
        genomes=[
            {
                'path': os.path.realpath(x.attrib['path']),
                'meta': metadata_from_node(x.find('metadata')),
                'label': x.attrib['label']
            }
            for x in root.findall('metadata/genomes/genome')
        ]
    )

    default_session_data = {
        'visibility': {
            'default_on': [],
            'default_off': [],
        },
        'style': {},
        'style_labels': {}
    }

    # TODO add metadata to tracks
    for track in root.findall('tracks/track'):
        track_conf = {}
        track_conf['trackfiles'] = []

        trackfiles = track.findall('files/trackFile')
        if trackfiles:
            # (the loop previously re-tested `if trackfiles:` per
            # iteration, which is always true here — removed)
            for x in trackfiles:
                metadata = metadata_from_node(x.find('metadata'))

                track_conf['trackfiles'].append((
                    os.path.realpath(x.attrib['path']),
                    x.attrib['ext'],
                    x.attrib['label'],
                    metadata
                ))
        else:
            # For tracks without files (rest, sparql)
            track_conf['trackfiles'].append((
                '',  # N/A, no path for rest or sparql
                track.attrib['format'],
                track.find('options/label').text,
                {}
            ))

        track_conf['category'] = track.attrib['cat']
        track_conf['format'] = track.attrib['format']
        # style / style_labels were each parsed from the XML exactly once
        # (a previous revision computed the style comprehension twice)
        track_conf['style'] = {item.tag: parse_style_conf(item) for item in track.find('options/style')}
        track_conf['style_labels'] = {item.tag: parse_style_conf(item) for item in track.find('options/style_labels')}

        track_conf['conf'] = etree_to_dict(track.find('options'))
        keys = jc.process_annotations(track_conf)

        for key in keys:
            default_session_data['visibility'][track.attrib.get('visibility', 'default_off')].append(key)

            default_session_data['style'][key] = track_conf['style']  # TODO do we need this anymore?
            default_session_data['style_labels'][key] = track_conf['style_labels']

    default_session_data['defaultLocation'] = root.find('metadata/general/defaultLocation').text
    default_session_data['session_name'] = root.find('metadata/general/session_name').text

    general_data = {
        'analytics': root.find('metadata/general/analytics').text,
        'primary_color': root.find('metadata/general/primary_color').text,
        'secondary_color': root.find('metadata/general/secondary_color').text,
        'tertiary_color': root.find('metadata/general/tertiary_color').text,
        'quaternary_color': root.find('metadata/general/quaternary_color').text,
        'font_size': root.find('metadata/general/font_size').text,
    }

    jc.add_default_session(default_session_data)
    jc.add_general_configuration(general_data)
    jc.text_index()
1097 |