diff jbrowse2.py @ 136:93fdd696c281 draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit 4fa86613193c985e0cb9a8fc795c56b8bc7b8532
| field | value |
|---|---|
| author | iuc |
| date | Thu, 02 Oct 2025 10:20:29 +0000 |
| parents | 21bb464c1d53 |
| children | |
```diff
--- a/jbrowse2.py	Sun Dec 15 23:47:40 2024 +0000
+++ b/jbrowse2.py	Thu Oct 02 10:20:29 2025 +0000
@@ -1,29 +1,22 @@
 #!/usr/bin/env python
-
 import argparse
-import binascii
-import copy
+import csv
 import datetime
+import hashlib
 import json
 import logging
 import os
 import re
 import shutil
-import ssl
-import string
-import struct
 import subprocess
-import urllib.request
 import xml.etree.ElementTree as ET
 from collections import defaultdict
 
+import requests
+
+
 logging.basicConfig(level=logging.DEBUG)
 log = logging.getLogger("jbrowse")
-
-JB2VER = "v2.15.4"
-# version pinned if cloning - but not cloning now
-logCommands = True
-# useful for seeing what's being written but not for production setups
 TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
 SELF_LOCATION = os.path.dirname(os.path.realpath(__file__))
 GALAXY_INFRASTRUCTURE_URL = None
@@ -42,288 +35,6 @@
 }
 
 
-INDEX_TEMPLATE = """<!doctype html>
-<html lang="en" style="height:100%">
-<head>
-<meta charset="utf-8"/>
-<link rel="shortcut icon" href="./favicon.ico"/>
-<meta name="viewport" content="width=device-width,initial-scale=1"/>
-<meta name="theme-color" content="#000000"/>
-<meta name="description" content="A fast and flexible genome browser"/>
-<link rel="manifest" href="./manifest.json"/>
-<title>JBrowse</title>
-</script>
-</head>
-<body style="overscroll-behavior:none; height:100%; margin: 0;">
-<iframe
-  id="jbframe"
-  title="JBrowse2"
-  frameborder="0"
-  width="100%"
-  height="100%"
-  src='index_noview.html?config=config.json__SESSION_SPEC__'>
-</iframe>
-</body>
-</html>
-"""
-
-
-class ColorScaling(object):
-
-    COLOR_FUNCTION_TEMPLATE = """
-    function(feature, variableName, glyphObject, track) {{
-        var score = {score};
-        {opacity}
-        return 'rgba({red}, {green}, {blue}, ' + opacity + ')';
-    }}
-    """
-
-    COLOR_FUNCTION_TEMPLATE_QUAL = r"""
-    function(feature, variableName, glyphObject, track) {{
-        var search_up = function self(sf, attr){{
-            if(sf.get(attr) !== undefined){{
-                return sf.get(attr);
-            }}
-            if(sf.parent() === undefined) {{
-                return;
-            }}else{{
-                return self(sf.parent(), attr);
-            }}
-        }};
-
-        var search_down = function self(sf, attr){{
-            if(sf.get(attr) !== undefined){{
-                return sf.get(attr);
-            }}
-            if(sf.children() === undefined) {{
-                return;
-            }}else{{
-                var kids = sf.children();
-                for(var child_idx in kids){{
-                    var x = self(kids[child_idx], attr);
-                    if(x !== undefined){{
-                        return x;
-                    }}
-                }}
-                return;
-            }}
-        }};
-
-        var color = ({user_spec_color} || search_up(feature, 'color') || search_down(feature, 'color') || {auto_gen_color});
-        var score = (search_up(feature, 'score') || search_down(feature, 'score'));
-        {opacity}
-        if(score === undefined){{ opacity = 1; }}
-        var result = /^#?([a-f\d]{{2}})([a-f\d]{{2}})([a-f\d]{{2}})$/i.exec(color);
-        var red = parseInt(result[1], 16);
-        var green = parseInt(result[2], 16);
-        var blue = parseInt(result[3], 16);
-        if(isNaN(opacity) || opacity < 0){{ opacity = 0; }}
-        return 'rgba(' + red + ',' + green + ',' + blue + ',' + opacity + ')';
-    }}
-    """
-
-    OPACITY_MATH = {
-        "linear": """
-            var opacity = (score - ({min})) / (({max}) - ({min}));
-        """,
-        "logarithmic": """
-            var opacity = Math.log10(score - ({min})) / Math.log10(({max}) - ({min}));
-        """,
-        "blast": """
-            var opacity = 0;
-            if(score == 0.0) {{
-                opacity = 1;
-            }} else {{
-                opacity = (20 - Math.log10(score)) / 180;
-            }}
-        """,
-    }
-
-    BREWER_COLOUR_IDX = 0
-    BREWER_COLOUR_SCHEMES = [
-        (166, 206, 227),
-        (31, 120, 180),
-        (178, 223, 138),
-        (51, 160, 44),
-        (251, 154, 153),
-        (227, 26, 28),
-        (253, 191, 111),
-        (255, 127, 0),
-        (202, 178, 214),
-        (106, 61, 154),
-        (255, 255, 153),
-        (177, 89, 40),
-        (228, 26, 28),
-        (55, 126, 184),
-        (77, 175, 74),
-        (152, 78, 163),
-        (255, 127, 0),
-    ]
-
-    BREWER_DIVERGING_PALLETES = {
-        "BrBg": ("#543005", "#003c30"),
-        "PiYg": ("#8e0152", "#276419"),
-        "PRGn": ("#40004b", "#00441b"),
-        "PuOr": ("#7f3b08", "#2d004b"),
-        "RdBu": ("#67001f", "#053061"),
-        "RdGy": ("#67001f", "#1a1a1a"),
-        "RdYlBu": ("#a50026", "#313695"),
-        "RdYlGn": ("#a50026", "#006837"),
-        "Spectral": ("#9e0142", "#5e4fa2"),
-    }
-
-    def __init__(self):
-        self.brewer_colour_idx = 0
-
-    def rgb_from_hex(self, hexstr):
-        # http://stackoverflow.com/questions/4296249/how-do-i-convert-a-hex-triplet-to-an-rgb-tuple-and-back
-        return struct.unpack("BBB", binascii.unhexlify(hexstr))
-
-    def min_max_gff(self, gff_file):
-        min_val = None
-        max_val = None
-        with open(gff_file, "r") as handle:
-            for line in handle:
-                try:
-                    value = float(line.split("\t")[5])
-                    min_val = min(value, (min_val or value))
-                    max_val = max(value, (max_val or value))
-
-                    if value < min_val:
-                        min_val = value
-
-                    if value > max_val:
-                        max_val = value
-                except Exception:
-                    pass
-        return min_val, max_val
-
-    def hex_from_rgb(self, r, g, b):
-        return "#%02x%02x%02x" % (r, g, b)
-
-    def _get_colours(self):
-        r, g, b = self.BREWER_COLOUR_SCHEMES[
-            self.brewer_colour_idx % len(self.BREWER_COLOUR_SCHEMES)
-        ]
-        self.brewer_colour_idx += 1
-        return r, g, b
-
-    def parse_menus(self, track):
-        trackConfig = {"menuTemplate": [{}, {}, {}, {}]}
-
-        if "menu" in track["menus"]:
-            menu_list = [track["menus"]["menu"]]
-            if isinstance(track["menus"]["menu"], list):
-                menu_list = track["menus"]["menu"]
-
-            for m in menu_list:
-                tpl = {
-                    "action": m["action"],
-                    "label": m.get("label", "{name}"),
-                    "iconClass": m.get("iconClass", "dijitIconBookmark"),
-                }
-                if "url" in m:
-                    tpl["url"] = m["url"]
-                if "content" in m:
-                    tpl["content"] = m["content"]
-                if "title" in m:
-                    tpl["title"] = m["title"]
-
-                trackConfig["menuTemplate"].append(tpl)
-
-        return trackConfig
-
-    def parse_colours(self, track, trackFormat, gff3=None):
-        # Wiggle tracks have a bicolor pallete
-        trackConfig = {"style": {}}
-        if trackFormat == "wiggle":
-
-            trackConfig["style"]["pos_color"] = track["wiggle"]["color_pos"]
-            trackConfig["style"]["neg_color"] = track["wiggle"]["color_neg"]
-
-            if trackConfig["style"]["pos_color"] == "__auto__":
-                trackConfig["style"]["neg_color"] = self.hex_from_rgb(
-                    *self._get_colours()
-                )
-                trackConfig["style"]["pos_color"] = self.hex_from_rgb(
-                    *self._get_colours()
-                )
-
-            # Wiggle tracks can change colour at a specified place
-            bc_pivot = track["wiggle"]["bicolor_pivot"]
-            if bc_pivot not in ("mean", "zero"):
-                # The values are either one of those two strings
-                # or a number
-                bc_pivot = float(bc_pivot)
-            trackConfig["bicolor_pivot"] = bc_pivot
-        elif "scaling" in track:
-            if track["scaling"]["method"] == "ignore":
-                if track["scaling"]["scheme"]["color"] != "__auto__":
-                    trackConfig["style"]["color"] = track["scaling"]["scheme"]["color"]
-                else:
-                    trackConfig["style"]["color"] = self.hex_from_rgb(
-                        *self._get_colours()
-                    )
-            else:
-                # Scored method
-                algo = track["scaling"]["algo"]
-                # linear, logarithmic, blast
-                scales = track["scaling"]["scales"]
-                # type __auto__, manual (min, max)
-                scheme = track["scaling"]["scheme"]
-                # scheme -> (type (opacity), color)
-                # ==================================
-                # GENE CALLS OR BLAST
-                # ==================================
-                if trackFormat == "blast":
-                    red, green, blue = self._get_colours()
-                    color_function = self.COLOR_FUNCTION_TEMPLATE.format(
-                        **{
-                            "score": "feature._parent.get('score')",
-                            "opacity": self.OPACITY_MATH["blast"],
-                            "red": red,
-                            "green": green,
-                            "blue": blue,
-                        }
-                    )
-                    trackConfig["style"]["color"] = color_function.replace("\n", "")
-                elif trackFormat == "gene_calls":
-                    # Default values, based on GFF3 spec
-                    min_val = 0
-                    max_val = 1000
-                    # Get min/max and build a scoring function since JBrowse doesn't
-                    if scales["type"] == "automatic" or scales["type"] == "__auto__":
-                        min_val, max_val = self.min_max_gff(gff3)
-                    else:
-                        min_val = scales.get("min", 0)
-                        max_val = scales.get("max", 1000)
-
-                    if scheme["color"] == "__auto__":
-                        user_color = "undefined"
-                        auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())
-                    elif scheme["color"].startswith("#"):
-                        user_color = "'%s'" % self.hex_from_rgb(
-                            *self.rgb_from_hex(scheme["color"][1:])
-                        )
-                        auto_color = "undefined"
-                    else:
-                        user_color = "undefined"
-                        auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())
-
-                    color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format(
-                        **{
-                            "opacity": self.OPACITY_MATH[algo].format(
-                                **{"max": max_val, "min": min_val}
-                            ),
-                            "user_spec_color": user_color,
-                            "auto_gen_color": auto_color,
-                        }
-                    )
-
-                    trackConfig["style"]["color"] = color_function.replace("\n", "")
-        return trackConfig
-
-
 def etree_to_dict(t):
     if t is None:
         return {}
@@ -353,98 +64,106 @@
 
 def metadata_from_node(node):
     metadata = {}
-    try:
-        if len(node.findall("dataset")) != 1:
-            # exit early
-            return metadata
-    except Exception:
-        return {}
+
+    if len(node.findall("dataset")) == 1:
 
-    for key, value in node.findall("dataset")[0].attrib.items():
-        metadata["dataset_%s" % key] = value
+        for key, value in node.findall("dataset")[0].attrib.items():
+            metadata[f"dataset_{key}"] = value
+
+        for key, value in node.findall("history")[0].attrib.items():
+            metadata[f"history_{key}"] = value
 
-    if node.findall("history"):
-        for key, value in node.findall("history")[0].attrib.items():
-            metadata["history_%s" % key] = value
+        for key, value in node.findall("metadata")[0].attrib.items():
+            metadata[f"metadata_{key}"] = value
+
+        for key, value in node.findall("tool")[0].attrib.items():
+            metadata[f"tool_{key}"] = value
 
-    if node.findall("metadata"):
-        for key, value in node.findall("metadata")[0].attrib.items():
-            metadata["metadata_%s" % key] = value
         # Additional Mappings applied:
-        metadata["dataset_edam_format"] = (
-            '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
-                metadata["dataset_edam_format"], metadata["dataset_file_ext"]
-            )
+        metadata[
+            "dataset_edam_format"
+        ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
+            metadata["dataset_edam_format"], metadata["dataset_file_ext"]
         )
         metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
            metadata["history_user_email"]
        )
-        metadata["hist_name"] = metadata["history_display_name"]
-        metadata["history_display_name"] = (
-            '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
-                galaxy=GALAXY_INFRASTRUCTURE_URL,
-                encoded_hist_id=metadata.get("history_id", "not available"),
-                hist_name=metadata.get("history_display_name", "not available"),
-            )
+        metadata[
+            "history_display_name"
+        ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
+            galaxy=GALAXY_INFRASTRUCTURE_URL,
+            encoded_hist_id=metadata["history_id"],
+            hist_name=metadata["history_display_name"],
+        )
+        metadata[
+            "tool_tool"
+        ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
+            galaxy=GALAXY_INFRASTRUCTURE_URL,
+            encoded_id=metadata["dataset_id"],
+            tool_id=metadata["tool_tool_id"],
+            # tool_version=metadata['tool_tool_version'],
         )
-    if node.findall("tool"):
-        for key, value in node.findall("tool")[0].attrib.items():
-            metadata["tool_%s" % key] = value
-        metadata["tool_tool"] = (
-            '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}{tool_version}</a>'.format(
-                galaxy=GALAXY_INFRASTRUCTURE_URL,
-                encoded_id=metadata.get("dataset_id", ""),
-                tool_id=metadata.get("tool_tool_id", ""),
-                tool_version=metadata.get("tool_tool_version", ""),
-            )
-        )
+
+        # Load additional metadata from a TSV file if any given by user
+        bonus = node.findall("bonus")
+        if bonus and "src" in bonus[0].attrib and bonus[0].attrib["src"]:
+            with open(bonus[0].attrib["src"], "r") as bonus_tsv:
+                bonus_content = csv.reader(bonus_tsv, delimiter="\t", quotechar='"')
+                for row in bonus_content:
+                    if len(row) == 2:
+                        if row[0] in metadata:
+                            log.warning(f"Overwriting existing metadata {row[0]} with value from bonus file {row[1]}")
+                        metadata[row[0]] = row[1]
+                    else:
+                        log.warning(f"Skipping invalid bonus metadata line: {row}")
+
    return metadata
```
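Not part of the diff — a minimal sketch of what the flattening loops in the new `metadata_from_node` produce. The `dataset`/`history`/`metadata`/`tool` element names come from the code above; the attribute values here are invented:

```python
# Hypothetical illustration of metadata_from_node's flattening loops;
# attribute values are invented.
import xml.etree.ElementTree as ET

node = ET.fromstring(
    "<track>"
    '  <dataset id="d1" name="genes.gff3" />'
    '  <history id="h1" user_email="user@example.org" />'
    '  <metadata dbkey="hg38" />'
    '  <tool tool_id="jbrowse2" tool_version="2.0" />'
    "</track>"
)

metadata = {}
for section in ("dataset", "history", "metadata", "tool"):
    # Each attribute is namespaced with its parent element name.
    for key, value in node.findall(section)[0].attrib.items():
        metadata[f"{section}_{key}"] = value

print(metadata["dataset_name"])  # genes.gff3
print(metadata["tool_tool_id"])  # jbrowse2
```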
href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata["dataset_id"], + tool_id=metadata["tool_tool_id"], + # tool_version=metadata['tool_tool_version'], ) - if node.findall("tool"): - for key, value in node.findall("tool")[0].attrib.items(): - metadata["tool_%s" % key] = value - metadata["tool_tool"] = ( - '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}{tool_version}</a>'.format( - galaxy=GALAXY_INFRASTRUCTURE_URL, - encoded_id=metadata.get("dataset_id", ""), - tool_id=metadata.get("tool_tool_id", ""), - tool_version=metadata.get("tool_tool_version", ""), - ) - ) + + # Load additional metadata from a TSV file if any given by user + bonus = node.findall("bonus") + if bonus and "src" in bonus[0].attrib and bonus[0].attrib["src"]: + with open(bonus[0].attrib["src"], "r") as bonus_tsv: + bonus_content = csv.reader(bonus_tsv, delimiter="\t", quotechar='"') + for row in bonus_content: + if len(row) == 2: + if row[0] in metadata: + log.warning(f"Overwriting existing metadata {row[0]} with value from bonus file {row[1]}") + metadata[row[0]] = row[1] + else: + log.warning(f"Skipping invalid bonus metadata line: {row}") + return metadata class JbrowseConnector(object): - def __init__(self, outdir, jbrowse2path): - self.bpPerPx = 50 - self.trackCounter = 0 # to avoid name clashes - self.assemblies = [] # these require more than a few line diff. - self.assmeta = {} - self.ass_first_contigs = ( - [] - ) # for default session - these are read as first line of the assembly .fai - self.giURL = GALAXY_INFRASTRUCTURE_URL - self.outdir = os.path.abspath(outdir) - self.jbrowse2path = jbrowse2path - os.makedirs(self.outdir, exist_ok=True) - self.genome_names = [] - self.trackIdlist = [] - self.tracksToAdd = {} - self.config_json = {} - self.config_json_file = os.path.join(outdir, "config.json") - self.clone_jbrowse(realclone=False) + def __init__(self, jbrowse, outdir, update): + self.jbrowse = jbrowse + self.outdir = outdir + self.update = update + + self.tracksToIndex = {} + + # This is the id of the current assembly + self.assembly_ids = {} + + self.default_views = {} + + self.plugins = [] + + self.use_synteny_viewer = False + + self.synteny_tracks = [] + + self.clone_jbrowse(self.jbrowse, self.outdir) + + # If upgrading, look at the existing data + self.check_existing(self.outdir) def get_cwd(self, cwd): if cwd: return self.outdir else: - return subprocess.check_output(["pwd"]).decode("utf-8").strip() + return subprocess.check_output(['pwd']).decode('utf-8').strip() + # return None def subprocess_check_call(self, command, output=None, cwd=True): if output: - if logCommands: - log.debug( - "cd %s && %s > %s", self.get_cwd(cwd), " ".join(command), output - ) + log.debug(f"cd {self.get_cwd(cwd)} && {' '.join(command)} > {output.name}") subprocess.check_call(command, cwd=self.get_cwd(cwd), stdout=output) else: - if logCommands: - log.debug("cd %s && %s", self.get_cwd(cwd), " ".join(command)) + log.debug(f"cd {self.get_cwd(cwd)} && {' '.join(command)}") subprocess.check_call(command, cwd=self.get_cwd(cwd)) def subprocess_popen(self, command, cwd=True): - if logCommands: - log.debug(command) + log.debug(f"cd {self.get_cwd(cwd)} && {command}") p = subprocess.Popen( command, - cwd=self.outdir, + cwd=self.get_cwd(cwd), shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, @@ -453,992 +172,1249 @@ output, err = p.communicate() retcode = p.returncode if retcode != 0: - log.error(command) + log.error(f"cd 
{self.get_cwd(cwd)} && {command}") log.error(output) log.error(err) - raise RuntimeError("Command failed with exit code %s" % (retcode)) + raise RuntimeError(f"Command failed with exit code {retcode}") - def subprocess_check_output(self, command): - if logCommands: - log.debug(" ".join(command)) - return subprocess.check_output(command, cwd=self.outdir) + def subprocess_check_output(self, command, cwd=True): + log.debug(f"cd {self.get_cwd(cwd)} && {' '.join(command)}") + return subprocess.check_output(command, cwd=self.get_cwd(cwd)) def symlink_or_copy(self, src, dest): - if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool( - os.environ["GALAXY_JBROWSE_SYMLINKS"] - ): - cmd = ["ln", "-s", src, dest] - else: - cmd = ["cp", src, dest] + # Use to support symlinking in jbrowse1, in jbrowse2 prefer to use remote uri + cmd = ["cp", src, dest] return self.subprocess_check_call(cmd) - def _prepare_track_style(self, trackDict): - + def _prepare_track_style(self, xml_conf): style_data = { - "type": "LinearBasicDisplay", - "displayId": "%s-LinearBasicDisplay" % trackDict["trackId"], + "type": "LinearBasicDisplay", # No ideal default, but should be overwritten anyway } - if trackDict.get("displays", None): # use first if multiple like bed - style_data["type"] = trackDict["displays"][0]["type"] - style_data["displayId"] = trackDict["displays"][0]["displayId"] + if "display" in xml_conf["style"]: + style_data["type"] = xml_conf["style"]["display"] + + style_data["displayId"] = f"{xml_conf['label']}_{style_data['type']}" + + style_data.update(self._prepare_renderer_config(style_data["type"], xml_conf["style"])) + + return {"displays": [style_data]} + + def _prepare_renderer_config(self, display_type, xml_conf): + + style_data = {} + + # if display_type in ("LinearBasicDisplay", "LinearVariantDisplay"): + # TODO LinearVariantDisplay does not understand these options when written in config.json + if display_type in ("LinearBasicDisplay"): + + # Doc: https://jbrowse.org/jb2/docs/config/svgfeaturerenderer/ + style_data["renderer"] = { + "type": "SvgFeatureRenderer", + "showLabels": xml_conf.get("show_labels", True), + "showDescriptions": xml_conf.get("show_descriptions", True), + "labels": { + "name": xml_conf.get("labels_name", "jexl:get(feature,'name') || get(feature,'id')"), + "description": xml_conf.get("descriptions_name", "jexl:get(feature,'note') || get(feature,'description')") + }, + "displayMode": xml_conf.get("display_mode", "normal"), + "maxHeight": xml_conf.get("max_height", 1200), + } + + elif display_type == "LinearArcDisplay": + + # Doc: https://jbrowse.org/jb2/docs/config/arcrenderer/ + style_data["renderer"] = { + "type": "ArcRenderer", + "label": xml_conf.get("labels_name", "jexl:get(feature,'score')"), + "displayMode": xml_conf.get("display_mode", "arcs"), + } + + elif display_type == "LinearWiggleDisplay": + + wig_renderer = xml_conf.get("renderer", "xyplot") + style_data["defaultRendering"] = wig_renderer + + elif display_type == "MultiLinearWiggleDisplay": + + wig_renderer = xml_conf.get("renderer", "multirowxy") + style_data["defaultRendering"] = wig_renderer + + elif display_type == "LinearSNPCoverageDisplay": + + # Does not work + # style_data["renderer"] = { + # "type": "SNPCoverageRenderer", + # "displayCrossHatches": xml_conf.get("display_cross_hatches", True), + # } + + style_data["scaleType"] = xml_conf.get("scale_type", "linear") + if "min_score" in xml_conf: + style_data["minScore"] = xml_conf["min_score"] + + if "max_score" in xml_conf: + style_data["maxScore"] = 
xml_conf["max_score"] + + # Doc: https://jbrowse.org/jb2/docs/config/snpcoveragerenderer/ + return style_data - def getNrow(self, url): - useuri = url.startswith("https://") or url.startswith("http://") - if not useuri: - fl = open(url, "r").readlines() - nrow = len(fl) - else: - try: - scontext = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) - scontext.check_hostname = False - scontext.verify_mode = ssl.VerifyMode.CERT_NONE - with urllib.request.urlopen(url, context=scontext) as f: - fl = f.readlines() - nrow = len(fl) - except Exception: - nrow = 0 - logging.debug("getNrow %s returning %d" % (url, nrow)) - return nrow + def _prepare_format_details(self, xml_conf): + formatDetails = { + } + + if "feature" in xml_conf["formatdetails"]: + feat_jexl = xml_conf["formatdetails"]["feature"] + for key, value in mapped_chars.items(): + feat_jexl = feat_jexl.replace(value, key) + formatDetails["feature"] = feat_jexl + + if "subfeature" in xml_conf["formatdetails"]: + sfeat_jexl = xml_conf["formatdetails"]["subfeature"] + for key, value in mapped_chars.items(): + sfeat_jexl = sfeat_jexl.replace(value, key) + formatDetails["subfeatures"] = sfeat_jexl + + if "depth" in xml_conf["formatdetails"]: + formatDetails["depth"] = int(xml_conf["formatdetails"]["depth"]) + + return {"formatDetails": formatDetails} + + def _prepare_track_metadata(self, xml_conf): + metadata = { + } + + metadata = xml_conf["metadata"] + + return {"metadata": metadata} + + def check_existing(self, destination): + existing = os.path.join(destination, "config.json") + if os.path.exists(existing): + with open(existing, "r") as existing_conf: + conf = json.load(existing_conf) + if "assemblies" in conf: + for assembly in conf["assemblies"]: + if "name" in assembly: + + # Look for a default scaffold + default_seq = None + if 'defaultSession' in conf and 'views' in conf['defaultSession']: + for view in conf['defaultSession']['views']: + if 'init' in view and 'assembly' in view['init'] and 'loc' in view['init']: + if view['init']['assembly'] == assembly["name"]: + default_seq = view['init']['loc'].split(":")[0] + if "views" in view: + subviews = view["views"] + for subview in subviews: + if 'init' in subview and 'assembly' in subview['init'] and 'loc' in subview['init']: + if subview['init']['assembly'] == assembly["name"]: + default_seq = subview['init']['loc'].split(":")[0] + + self.assembly_ids[assembly["name"]] = default_seq + + def _load_old_genome_views(self): + + views = {} + + config_path = os.path.join(self.outdir, "config.json") + with open(config_path, "r") as config_file: + config_json = json.load(config_file) - def process_genomes(self, genomes): - assembly = [] - assmeta = [] - useuri = False - primaryGenome = None - for i, genome_node in enumerate(genomes): - this_genome = {} - if genome_node["useuri"] == "yes": - useuri = True - genome_name = genome_node["label"].strip() - if len(genome_name) == 0: - genome_name = os.path.splitext(os.path.basename(genome_node["path"]))[0] - if len(genome_name.split()) > 1: - genome_name = genome_name.split()[0] - # spaces and cruft break scripts when substituted - if not primaryGenome: - primaryGenome = genome_name - if genome_name not in self.genome_names: - self.genome_names.append(genome_name) - fapath = genome_node["path"] - if not useuri: - fapath = os.path.realpath(fapath) - assem, first_contig = self.make_assembly(fapath, genome_name, useuri) - assembly.append(assem) - self.ass_first_contigs.append(first_contig) - if genome_name == primaryGenome: # first one - this_genome["genome_name"] 
```diff
 
-    def getNrow(self, url):
-        useuri = url.startswith("https://") or url.startswith("http://")
-        if not useuri:
-            fl = open(url, "r").readlines()
-            nrow = len(fl)
-        else:
-            try:
-                scontext = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
-                scontext.check_hostname = False
-                scontext.verify_mode = ssl.VerifyMode.CERT_NONE
-                with urllib.request.urlopen(url, context=scontext) as f:
-                    fl = f.readlines()
-                nrow = len(fl)
-            except Exception:
-                nrow = 0
-        logging.debug("getNrow %s returning %d" % (url, nrow))
-        return nrow
+    def _prepare_format_details(self, xml_conf):
+        formatDetails = {}
+
+        if "feature" in xml_conf["formatdetails"]:
+            feat_jexl = xml_conf["formatdetails"]["feature"]
+            for key, value in mapped_chars.items():
+                feat_jexl = feat_jexl.replace(value, key)
+            formatDetails["feature"] = feat_jexl
+
+        if "subfeature" in xml_conf["formatdetails"]:
+            sfeat_jexl = xml_conf["formatdetails"]["subfeature"]
+            for key, value in mapped_chars.items():
+                sfeat_jexl = sfeat_jexl.replace(value, key)
+            formatDetails["subfeatures"] = sfeat_jexl
+
+        if "depth" in xml_conf["formatdetails"]:
+            formatDetails["depth"] = int(xml_conf["formatdetails"]["depth"])
+
+        return {"formatDetails": formatDetails}
+
+    def _prepare_track_metadata(self, xml_conf):
+        metadata = {}
+
+        metadata = xml_conf["metadata"]
+
+        return {"metadata": metadata}
+
+    def check_existing(self, destination):
+        existing = os.path.join(destination, "config.json")
+        if os.path.exists(existing):
+            with open(existing, "r") as existing_conf:
+                conf = json.load(existing_conf)
+                if "assemblies" in conf:
+                    for assembly in conf["assemblies"]:
+                        if "name" in assembly:
+
+                            # Look for a default scaffold
+                            default_seq = None
+                            if 'defaultSession' in conf and 'views' in conf['defaultSession']:
+                                for view in conf['defaultSession']['views']:
+                                    if 'init' in view and 'assembly' in view['init'] and 'loc' in view['init']:
+                                        if view['init']['assembly'] == assembly["name"]:
+                                            default_seq = view['init']['loc'].split(":")[0]
+
+                                    if "views" in view:
+                                        subviews = view["views"]
+                                        for subview in subviews:
+                                            if 'init' in subview and 'assembly' in subview['init'] and 'loc' in subview['init']:
+                                                if subview['init']['assembly'] == assembly["name"]:
+                                                    default_seq = subview['init']['loc'].split(":")[0]
+
+                            self.assembly_ids[assembly["name"]] = default_seq
+
+    def _load_old_genome_views(self):
+
+        views = {}
+
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
 
-    def process_genomes(self, genomes):
-        assembly = []
-        assmeta = []
-        useuri = False
-        primaryGenome = None
-        for i, genome_node in enumerate(genomes):
-            this_genome = {}
-            if genome_node["useuri"] == "yes":
-                useuri = True
-            genome_name = genome_node["label"].strip()
-            if len(genome_name) == 0:
-                genome_name = os.path.splitext(os.path.basename(genome_node["path"]))[0]
-            if len(genome_name.split()) > 1:
-                genome_name = genome_name.split()[0]
-                # spaces and cruft break scripts when substituted
-            if not primaryGenome:
-                primaryGenome = genome_name
-            if genome_name not in self.genome_names:
-                self.genome_names.append(genome_name)
-                fapath = genome_node["path"]
-                if not useuri:
-                    fapath = os.path.realpath(fapath)
-                assem, first_contig = self.make_assembly(fapath, genome_name, useuri)
-                assembly.append(assem)
-                self.ass_first_contigs.append(first_contig)
-                if genome_name == primaryGenome:  # first one
-                    this_genome["genome_name"] = genome_name  # first one for all tracks
-                    this_genome["genome_sequence_adapter"] = assem["sequence"][
-                        "adapter"
-                    ]
-                    this_genome["genome_firstcontig"] = first_contig
-                assmeta.append(this_genome)
-        self.assemblies += assembly
-        self.assmeta[primaryGenome] = assmeta
-        self.tracksToAdd[primaryGenome] = []
-        return primaryGenome
+        # Find default synteny views existing from a previous jbrowse dataset
+        if 'defaultSession' in config_json and 'views' in config_json['defaultSession']:
+            for view in config_json['defaultSession']['views']:
+                if view['type'] != "LinearSyntenyView":
+                    if 'init' in view and 'assembly' in view['init']:
+                        views[view['init']['assembly']] = view
+
+        return views
+
+    def _load_old_synteny_views(self):
+
+        views = []
+
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
+
+        # Find default synteny views existing from a previous jbrowse dataset
+        if 'defaultSession' in config_json and 'views' in config_json['defaultSession']:
+            for view in config_json['defaultSession']['views']:
+                if view['type'] == "LinearSyntenyView":
+                    views.append(view)
+
+        return views
+
+    def add_assembly(self, path, label, is_remote=False, cytobands=None, ref_name_aliases=None):
+
+        if not is_remote:
+            # Find a non-existing filename for the new genome
+            # (to avoid collision when upgrading an existing instance)
+            rel_seq_path = os.path.join("data", label)
+            seq_path = os.path.join(self.outdir, rel_seq_path)
+            fn_try = 1
+            while (
+                os.path.exists(seq_path + ".fasta")
+                or os.path.exists(seq_path + ".fasta.gz")
+                or os.path.exists(seq_path + ".fasta.gz.fai")
+                or os.path.exists(seq_path + ".fasta.gz.gzi")
+            ):
+                rel_seq_path = os.path.join("data", f"{label}{fn_try}")
+                seq_path = os.path.join(self.outdir, rel_seq_path)
+                fn_try += 1
+
+        # Check if the assembly already exists from a previous run (--update mode)
+        if self.update:
+
+            config_path = os.path.join(self.outdir, "config.json")
+            with open(config_path, "r") as config_file:
+                config_json = json.load(config_file)
+
+            for asby in config_json['assemblies']:
+                if asby['name'] == label:
+
+                    # Find default views existing for this assembly
+                    if 'defaultSession' in config_json and 'views' in config_json['defaultSession']:
+                        for view in config_json['defaultSession']['views']:
+                            if 'init' in view and 'assembly' in view['init']:
+                                if view['init']['assembly'] == label:
+
+                                    log.info("Found existing assembly from existing JBrowse2 instance, preserving it")
+
+                                    self.default_views[view['init']['assembly']] = view
 
-    def make_assembly(self, fapath, gname, useuri):
-        if useuri:
-            faname = fapath
-            scontext = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
-            scontext.check_hostname = False
-            scontext.verify_mode = ssl.VerifyMode.CERT_NONE
-            with urllib.request.urlopen(url=faname + ".fai", context=scontext) as f:
-                fl = f.readline()
-            contig = fl.decode("utf8").strip()
-            # Merlin 172788 8 60 61
+                    return label
+
+        # Copy ref alias file if any
+        if ref_name_aliases:
+            copied_ref_name_aliases = seq_path + ".aliases"
+            shutil.copy(ref_name_aliases, copied_ref_name_aliases)
+            copied_ref_name_aliases = rel_seq_path + ".aliases"
+
+        # Copy cytobands file if any
+        if cytobands:
+            copied_cytobands = seq_path + ".cytobands"
+            shutil.copy(cytobands, copied_cytobands)
+            copied_cytobands = rel_seq_path + ".cytobands"
+
+        # Find a non-existing label for the new genome
+        # (to avoid collision when upgrading an existing instance)
+        lab_try = 1
+        uniq_label = label
+        while uniq_label in self.assembly_ids:
+            uniq_label = label + str(lab_try)
+            lab_try += 1
+
+        if is_remote:
+
+            # Find a default scaffold to display
+            with requests.get(path + ".fai", stream=True) as response:
+                response.raise_for_status()
+                first_seq = next(response.iter_lines())
+                first_seq = first_seq.decode("utf-8").split('\t')[0]
+
+            self.assembly_ids[uniq_label] = first_seq
+
+            # We assume we just need to suffix url with .fai and .gzi for indexes.
+            cmd_jb = [
+                "jbrowse",
+                "add-assembly",
+                "--name",
+                uniq_label,
+                "--type",
+                "bgzipFasta",
+                "--out",
+                self.outdir,
+                "--skipCheck",
+            ]
+
+            if ref_name_aliases:
+                cmd_jb.extend([
+                    "--refNameAliases",
+                    copied_ref_name_aliases,
+                ])
+
+            cmd_jb.append(path)  # Path is an url in remote mode
+
+            self.subprocess_check_call(cmd_jb)
         else:
-            faname = gname + ".fa.gz"
-            fadest = os.path.realpath(os.path.join(self.outdir, faname))
-            cmd = "bgzip -k -i -c -I '%s.gzi' '%s' > '%s'" % (fadest, fapath, fadest)
-            subprocess.run(cmd, shell=True)
-            cmd = ["samtools", "faidx", fadest]
+            # Find a default scaffold to display
+            with open(path, "r") as fa_handle:
+                fa_header = fa_handle.readline()[1:].strip().split(" ")[0]
+
+            self.assembly_ids[uniq_label] = fa_header
+
+            copied_genome = seq_path + ".fasta"
+            shutil.copy(path, copied_genome)
+
+            # Compress with bgzip
+            cmd = ["bgzip", copied_genome]
+            self.subprocess_check_call(cmd)
+
+            # FAI Index
+            cmd = ["samtools", "faidx", copied_genome + ".gz"]
             self.subprocess_check_call(cmd)
-            contig = open(fadest + ".fai", "r").readline().strip()
-        adapter = {
-            "type": "BgzipFastaAdapter",
-            "fastaLocation": {
-                "uri": faname,
-            },
-            "faiLocation": {
-                "uri": faname + ".fai",
-            },
-            "gziLocation": {
-                "uri": faname + ".gzi",
-            },
+
+            cmd_jb = [
+                "jbrowse",
+                "add-assembly",
+                "--load",
+                "inPlace",
+                "--name",
+                uniq_label,
+                "--type",
+                "bgzipFasta",
+                "--out",
+                self.outdir,
+                "--skipCheck",
+            ]
+
+            if ref_name_aliases:
+                cmd_jb.extend([
+                    "--refNameAliases",
+                    copied_ref_name_aliases,
+                ])
+
+            cmd_jb.append(rel_seq_path + ".fasta.gz")
+
+            self.subprocess_check_call(cmd_jb)
+
+        if cytobands:
+            self.add_cytobands(uniq_label, copied_cytobands)
+
+        return uniq_label
+
+    def add_cytobands(self, assembly_name, cytobands_path):
+
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
+
+        config_data = {}
+
+        config_data["cytobands"] = {
+            "adapter": {
+                "type": "CytobandAdapter",
+                "cytobandLocation": {
+                    "uri": cytobands_path
+                }
+            }
         }
-        first_contig = contig.split()[:2]
-        first_contig.insert(0, gname)
-        trackDict = {
-            "name": gname,
-            "sequence": {
-                "type": "ReferenceSequenceTrack",
-                "trackId": gname,
-                "adapter": adapter,
-            },
-            "displays": [
-                {
-                    "type": "LinearReferenceSequenceDisplay",
-                    "displayId": "%s-LinearReferenceSequenceDisplay" % gname,
-                },
-                {
-                    "type": "LinearGCContentDisplay",
-                    "displayId": "%s-LinearGCContentDisplay" % gname,
-                },
-            ],
-        }
-        return (trackDict, first_contig)
 
-    def add_default_view(self):
-        cmd = [
-            "jbrowse",
-            "set-default-session",
-            "-s",
-            self.config_json_file,
-            "-t",
-            ",".join(self.trackIdlist),
-            "-n",
-            "JBrowse2 in Galaxy",
-            "--target",
-            self.config_json_file,
-            "-v",
-            " LinearGenomeView",
-        ]
-        self.subprocess_check_call(cmd)
+        filled_assemblies = []
+        for assembly in config_json["assemblies"]:
+            if assembly["name"] == assembly_name:
+                assembly.update(config_data)
+            filled_assemblies.append(assembly)
+        config_json["assemblies"] = filled_assemblies
 
-    def write_config(self):
-        with open(self.config_json_file, "w") as fp:
-            json.dump(self.config_json, fp, indent=2)
+        with open(config_path, "w") as config_file:
+            json.dump(config_json, config_file, indent=2)
 
     def text_index(self):
-        # Index tracks
-        e = os.environ
-        e["SHELL"] = "/bin/sh"
-        cmd = ["jbrowse", "text-index"]
-        subprocess.run(cmd, env=e, shell=True)
+
+        for ass in self.tracksToIndex:
+            tracks = self.tracksToIndex[ass]
+            args = [
+                "jbrowse",
+                "text-index",
+                "--target",
+                self.outdir,
+                "--assemblies",
+                ass,
+            ]
+
+            tracks = ",".join(tracks)
+            if tracks:
+                args += ["--tracks", tracks]
+
+            log.info(f"-----> Running text-index on assembly {ass} and tracks {tracks}")
+
+            # Only run index if we want to index at least one
+            # If --tracks is not specified, it will index everything
+            self.subprocess_check_call(args)
```
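A minimal driving sketch (not part of the diff) of how the connector methods above chain together; the install path, output directory, and labels are all hypothetical:

```python
# Hypothetical usage of the connector API defined above; paths are invented.
jc = JbrowseConnector(jbrowse="/opt/jbrowse2", outdir="output", update=False)

# Copies and bgzips the FASTA, indexes it with `samtools faidx`, then
# registers it through `jbrowse add-assembly --load inPlace`.
uniq_id = jc.add_assembly("genome.fasta", "mygenome", is_remote=False)

# ... add tracks here; GFF/BED tracks flagged for indexing accumulate
# in jc.tracksToIndex, keyed by assembly ...

# Runs one `jbrowse text-index --target output --assemblies <name>
# [--tracks a,b]` call per assembly that has tracks queued.
jc.text_index()
```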
```diff
 
+    def add_gc_content(self, parent, trackData, **kwargs):
+
+        adapter = {}
+        existing = os.path.join(self.outdir, "config.json")
+        if os.path.exists(existing):
+            with open(existing, "r") as existing_conf:
+                conf = json.load(existing_conf)
+                if "assemblies" in conf:
+                    for assembly in conf["assemblies"]:
+                        if assembly.get('name', "") == parent['uniq_id']:
+                            adapter = assembly.get('sequence', {}).get('adapter', {})
+
+        json_track_data = {
+            "type": "GCContentTrack",
+            "trackId": trackData["label"],
+            "name": trackData["key"],
+            "adapter": adapter,
+            "category": [trackData["category"]],
+            "assemblyNames": [parent['uniq_id']],
+        }
+
+        style_json = self._prepare_track_style(trackData)
+
+        json_track_data.update(style_json)
+
+        self.subprocess_check_call(
+            [
+                "jbrowse",
+                "add-track-json",
+                "--target",
+                self.outdir,
+                json.dumps(json_track_data),
+            ]
+        )
+
+    def add_bigwig(self, parent, data, trackData, wiggleOpts, **kwargs):
+
+        if trackData['remote']:
+            rel_dest = data
+        else:
+            rel_dest = os.path.join("data", trackData["label"] + ".bw")
+            dest = os.path.join(self.outdir, rel_dest)
+            self.symlink_or_copy(os.path.realpath(data), dest)
+
+        style_json = self._prepare_track_style(trackData)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
 
-    def add_hic(self, data, trackData):
-        """
-        HiC adapter.
-        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
-        for testing locally, these work:
-        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
-        using hg19 reference track as a
-        'BgzipFastaAdapter'
-            fastaLocation:
-            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
-            faiLocation:
-            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
-            gziLocation:
-            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
-        Cool will not be likely to be a good fit - see discussion at
-        https://github.com/GMOD/jbrowse-components/issues/2438
+    def add_bigwig_multi(self, parent, data_files, trackData, wiggleOpts, **kwargs):
+
+        subadapters = []
+
+        sub_num = 0
+        for data in data_files:
+            if trackData['remote']:
+                rel_dest = data[1]
+            else:
+                rel_dest = os.path.join("data", f"{trackData['label']}_sub{sub_num}.bw")
+                dest = os.path.join(self.outdir, rel_dest)
+                self.symlink_or_copy(os.path.realpath(data[1]), dest)
+
+            subadapters.append({
+                "type": "BigWigAdapter",
+                "name": data[0],
+                "bigWigLocation": {
+                    "uri": rel_dest,
+                    "locationType": "UriLocation"
+                }
+            })
+            sub_num += 1
+
+        json_track_data = {
+            "type": "MultiQuantitativeTrack",
+            "trackId": trackData["label"],
+            "name": trackData["key"],
+            "adapter": {
+                "type": "MultiWiggleAdapter",
+                "subadapters": subadapters
+            },
+            "category": [trackData["category"]],
+            "assemblyNames": [parent['uniq_id']],
+        }
+
+        style_json = self._prepare_track_style(trackData)
+
+        json_track_data.update(style_json)
+
+        track_metadata = self._prepare_track_metadata(trackData)
 
-        """
-        tId = trackData["label"]
-        wasCool = trackData["wasCool"]
-        # can be served - if public.
-        # dsId = trackData["metadata"]["dataset_id"]
-        # url = "%s/api/datasets/%s/display?to_ext=hic " % (self.giURL, dsId)
-        useuri = trackData["useuri"].lower() == "yes"
-        logging.debug("wasCool=%s, data=%s, tId=%s" % (wasCool, data, tId))
-        if useuri:
-            uri = data
+        json_track_data.update(track_metadata)
+
+        self.subprocess_check_call(
+            [
+                "jbrowse",
+                "add-track-json",
+                "--target",
+                self.outdir,
+                json.dumps(json_track_data),
+            ]
+        )
+
+    # Anything ending in "am" (Bam or Cram)
+    def add_xam(self, parent, data, trackData, xamOpts, index=None, ext="bam", **kwargs):
+        index_ext = "bai"
+        if ext == "cram":
+            index_ext = "crai"
+
+        if trackData['remote']:
+            rel_dest = data
+            # Index will be set automatically as xam url + xai .suffix by add-track cmd
         else:
-            uri = tId + ".hic"
-            if not wasCool:
-                dest = os.path.join(self.outdir, uri)
-                if not os.path.exists(dest):
-                    cmd = ["cp", data, dest]
-                    self.subprocess_check_call(cmd)
+            rel_dest = os.path.join("data", trackData["label"] + f".{ext}")
+            dest = os.path.join(self.outdir, rel_dest)
+            self.symlink_or_copy(os.path.realpath(data), dest)
+
+            if index is not None and os.path.exists(os.path.realpath(index)):
+                # xai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
+                self.subprocess_check_call(
+                    ["cp", os.path.realpath(index), dest + f".{index_ext}"]
+                )
+            else:
+                # Can happen in exotic condition
+                # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
+                #      => no index generated by galaxy, but there might be one next to the symlink target
+                #      this trick allows to skip the bam sorting made by galaxy if already done outside
+                if os.path.exists(os.path.realpath(data) + f".{index_ext}"):
+                    self.symlink_or_copy(
+                        os.path.realpath(data) + f".{index_ext}", dest + f".{index_ext}"
+                    )
                 else:
-                    logging.error("not wasCool but %s exists" % dest)
-        categ = trackData["category"]
-        trackDict = {
-            "type": "HicTrack",
-            "trackId": tId,
-            "name": trackData["name"],
-            "assemblyNames": [trackData["assemblyNames"]],
-            "displays": [
-                {
-                    "type": "LinearHicDisplay",
-                    "displayId": "%s-LinearHicDisplay" % tId,
-                }
-            ],
-            "category": [
-                categ,
-            ],
-            "adapter": {"type": "HicAdapter", "hicLocation": {"uri": uri}},
-        }
-        self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict))
-        self.trackIdlist.append(tId)
+                    log.warn(
+                        f"Could not find a bam index (.{index_ext} file) for {data}"
+                    )
+
+        style_json = self._prepare_track_style(trackData)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
+
+    def add_vcf(self, parent, data, trackData, vcfOpts={}, zipped=False, **kwargs):
+        if trackData['remote']:
+            rel_dest = data
+        else:
+            if zipped:
+                rel_dest = os.path.join("data", trackData["label"] + ".vcf.gz")
+                dest = os.path.join(self.outdir, rel_dest)
+                shutil.copy(os.path.realpath(data), dest)
+            else:
+                rel_dest = os.path.join("data", trackData["label"] + ".vcf")
+                dest = os.path.join(self.outdir, rel_dest)
+                shutil.copy(os.path.realpath(data), dest)
+
+                cmd = ["bgzip", dest]
+                self.subprocess_check_call(cmd)
+                cmd = ["tabix", dest + ".gz"]
+                self.subprocess_check_call(cmd)
+
+                rel_dest = os.path.join("data", trackData["label"] + ".vcf.gz")
+
+        style_json = self._prepare_track_style(trackData)
+
+        formatdetails = self._prepare_format_details(trackData)
+
+        style_json.update(formatdetails)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
+
+    def add_gff(self, parent, data, format, trackData, gffOpts, **kwargs):
+        if trackData['remote']:
+            rel_dest = data
+        else:
+            rel_dest = os.path.join("data", trackData["label"] + ".gff")
+            dest = os.path.join(self.outdir, rel_dest)
+            rel_dest = rel_dest + ".gz"
+
+            self._sort_gff(data, dest)
+
+        style_json = self._prepare_track_style(trackData)
+
+        formatdetails = self._prepare_format_details(trackData)
+
+        style_json.update(formatdetails)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        if gffOpts.get('index', 'false') in ("yes", "true", "True"):
+            if parent['uniq_id'] not in self.tracksToIndex:
+                self.tracksToIndex[parent['uniq_id']] = []
+            self.tracksToIndex[parent['uniq_id']].append(trackData["label"])
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
+
+    def add_bed(self, parent, data, format, trackData, gffOpts, **kwargs):
+        if trackData['remote']:
+            rel_dest = data
+        else:
+            rel_dest = os.path.join("data", trackData["label"] + ".bed")
+            dest = os.path.join(self.outdir, rel_dest)
+            rel_dest = rel_dest + ".gz"
+
+            self._sort_bed(data, dest)
+
+        style_json = self._prepare_track_style(trackData)
+
+        formatdetails = self._prepare_format_details(trackData)
+
+        style_json.update(formatdetails)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        if gffOpts.get('index', 'false') in ("yes", "true", "True"):
+            if parent['uniq_id'] not in self.tracksToIndex:
+                self.tracksToIndex[parent['uniq_id']] = []
+            self.tracksToIndex[parent['uniq_id']].append(trackData["label"])
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
+
+    def add_paf(self, parent, data, trackData, pafOpts, **kwargs):
+
+        if trackData['remote']:
+            rel_dest = data
 
-    def add_maf(self, data, trackData):
-        """
-        from https://github.com/cmdcolin/maf2bed
-        Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name
-        e.g. hg38.chr1 in the sequence identifiers.
-        need the reference id - eg hg18, for maf2bed.pl as the first parameter
-        """
-        tId = trackData["label"]
-        mafPlugin = {
-            "plugins": [
-                {
-                    "name": "MafViewer",
-                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
-                }
-            ]
-        }
-        categ = trackData["category"]
-        fname = tId
-        dest = os.path.join(self.outdir, fname)
-        gname = trackData["assemblyNames"]
+            if rel_dest.endswith('pif') or rel_dest.endswith('pif.gz'):
+                adapter = "pif"
+            else:
+                adapter = "paf"
+        else:
+            rel_dest = os.path.join("data", trackData["label"] + ".pif.gz")
+            dest = os.path.join(self.outdir, rel_dest)
+
+            cmd = ["jbrowse", "make-pif", "--out", dest, os.path.realpath(data)]
+            self.subprocess_check_call(cmd)
+
+            adapter = "pif"
+
+        if trackData["style"]["display"] == "LinearBasicDisplay":
+            # Normal style track
+
+            json_track_data = {
+                "type": "SyntenyTrack",
+                "trackId": trackData["label"],
+                "name": trackData["key"],
+                "adapter": {
+                    "type": "PairwiseIndexedPAFAdapter",
+                    "pifGzLocation": {
+                        "uri": rel_dest,
+                    },
+                    "index": {
+                        "location": {
+                            "uri": rel_dest + ".tbi",
+                        }
+                    },
+                },
+                "category": [trackData["category"]],
+                "assemblyNames": [parent['uniq_id']],
+            }
+        else:
+            # Synteny viewer
+
+            json_track_data = {
+                "type": "SyntenyTrack",
+                "trackId": trackData["label"],
+                "name": trackData["key"],
+                "adapter": {
+                    "assemblyNames": [
+                        parent['uniq_id'],
+                        "",  # Placeholder until we know the next genome id
+                    ],
+                },
+                "category": [trackData["category"]],
+                "assemblyNames": [
+                    parent['uniq_id'],
+                    "",  # Placeholder until we know the next genome id
+                ]
+            }
+
+            if adapter == "pif":
+                json_track_data["adapter"].update({
+                    "type": "PairwiseIndexedPAFAdapter",
+                    "pifGzLocation": {
+                        "uri": rel_dest,
+                    },
+                    "index": {
+                        "location": {
+                            "uri": rel_dest + ".tbi",
+                        }
+                    },
+                })
+            else:
+                json_track_data["adapter"].update({
+                    "type": "PAFAdapter",
+                    "pafLocation": {
+                        "uri": rel_dest,
+                    },
+                })
+
+        style_json = self._prepare_track_style(trackData)
+
+        json_track_data.update(style_json)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        json_track_data.update(track_metadata)
+
+        if trackData["style"]["display"] == "LinearBasicDisplay":
+            self.subprocess_check_call(
+                [
+                    "jbrowse",
+                    "add-track-json",
+                    "--target",
+                    self.outdir,
+                    json.dumps(json_track_data),
+                ]
+            )
+        else:
+            self.synteny_tracks.append(json_track_data)
+
+    def add_hic(self, parent, data, trackData, hicOpts, **kwargs):
+        if trackData['remote']:
+            rel_dest = data
+        else:
+            rel_dest = os.path.join("data", trackData["label"] + ".hic")
+            dest = os.path.join(self.outdir, rel_dest)
+            self.symlink_or_copy(os.path.realpath(data), dest)
+
+        style_json = self._prepare_track_style(trackData)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        style_json.update(track_metadata)
+
+        self._add_track(
+            trackData["label"],
+            trackData["key"],
+            trackData["category"],
+            rel_dest,
+            parent,
+            config=style_json,
+            remote=trackData['remote']
+        )
+
+    def add_maf(self, parent, data, trackData, mafOpts, **kwargs):
+
+        # Add needed plugin
+        plugin_def = {
+            "name": "MafViewer",
+            "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js"
+        }
+        self.plugins.append(plugin_def)
+
+        rel_dest = os.path.join("data", trackData["label"] + ".maf")
+        dest = os.path.join(self.outdir, rel_dest)
 
-        cmd = [
-            "bash",
-            os.path.join(INSTALLED_TO, "convertMAF.sh"),
-            data,
-            gname,
-            INSTALLED_TO,
-            dest,
-        ]
-        self.subprocess_check_call(cmd)
+        assembly_name = mafOpts.get("assembly_name", "")
+        if not assembly_name:
+            # Guess from assembly
+            assembly_name = parent['uniq_id']
+
+        self._convert_maf(data, dest, assembly_name)
+
+        # Extract samples list
         mafs = open(data, "r").readlines()
         mafss = [x for x in mafs if (x.startswith("s\t") or x.startswith("s "))]
         samp = [x.split()[1] for x in mafss if len(x.split()) > 0]
         sampu = list(dict.fromkeys(samp))
         samples = [x.split(".")[0] for x in sampu]
         samples.sort()
-        if logCommands:
-            logging.debug(
-                "$$$$ cmd=%s, mafss=%s samp=%s samples=%s"
-                % (" ".join(cmd), mafss, samp, samples)
-            )
-        trackDict = {
+
+        json_track_data = {
             "type": "MafTrack",
-            "trackId": tId,
-            "name": trackData["name"],
-            "category": [
-                categ,
-            ],
+            "trackId": trackData["label"],
+            "name": trackData["key"],
             "adapter": {
                 "type": "MafTabixAdapter",
                 "samples": samples,
                 "bedGzLocation": {
-                    "uri": fname + ".sorted.bed.gz",
+                    "uri": rel_dest + ".gz",
                 },
                 "index": {
                     "location": {
-                        "uri": fname + ".sorted.bed.gz.tbi",
+                        "uri": rel_dest + ".gz.tbi",
                     },
                 },
             },
-            "assemblyNames": [trackData["assemblyNames"]],
-            "displays": [
-                {
-                    "type": "LinearBasicDisplay",
-                    "displayId": "%s-LinearBasicDisplay" % tId,
-                },
-                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
-            ],
+            "category": [trackData["category"]],
+            "assemblyNames": [parent['uniq_id']],
         }
-        style_json = self._prepare_track_style(trackDict)
-        trackDict["style"] = style_json
-        self.tracksToAdd[gname].append(copy.copy(trackDict))
-        self.trackIdlist.append(tId)
-        if self.config_json.get("plugins", None):
-            self.config_json["plugins"].append(mafPlugin["plugins"][0])
-        else:
-            self.config_json.update(mafPlugin)
+
+        style_json = self._prepare_track_style(trackData)
+
+        json_track_data.update(style_json)
+
+        track_metadata = self._prepare_track_metadata(trackData)
+
+        json_track_data.update(track_metadata)
+
+        self.subprocess_check_call(
+            [
+                "jbrowse",
+                "add-track-json",
+                "--target",
+                self.outdir,
+                json.dumps(json_track_data),
+            ]
+        )
```
json_track_data["adapter"]["refNamesQueryTemplate"]: query_refnames + + # TODO handle metadata somehow for sparql too + + self.subprocess_check_call( + [ + "jbrowse", + "add-track-json", + "--target", + self.outdir, + json.dumps(json_track_data), + ] + ) + + def _add_track(self, track_id, label, category, path, assembly, config=None, trackType=None, load_action="inPlace", assemblies=None, remote=False): + """ + Adds a track to config.json using Jbrowse add-track cli + + By default, using `--load inPlace`: the file is supposed to be already placed at the `path` relative to + the outdir, `jbrowse add-track` will not touch it and trust us that the file is there and ready to use. + + With `load_action` parameter, you can ask `jbrowse add-track` to copy/move/symlink the file for you. + Not done by default because we often need more control on file copying/symlink for specific cases (indexes, symlinks of symlinks, ...) + """ + + cmd = [ + "jbrowse", + "add-track", + "--name", + label, + "--category", + category, + "--target", + self.outdir, + "--trackId", + track_id, + "--assemblyNames", + assemblies if assemblies else assembly['uniq_id'], + ] + + if not remote: + cmd.append("--load") + cmd.append(load_action) + + if config: + cmd.append("--config") + cmd.append(json.dumps(config)) + + if trackType: + cmd.append("--trackType") + cmd.append(trackType) + + cmd.append(path) + + self.subprocess_check_call(cmd) def _sort_gff(self, data, dest): # Only index if not already done if not os.path.exists(dest): - e = os.environ - e["SHELL"] = "/bin/sh" - cmd = "jbrowse sort-gff %s | bgzip -c > %s" % (data, dest) - subprocess.run(cmd, env=e, shell=True) - self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest]) - - def add_gff(self, data, trackData): - tId = trackData["label"] - useuri = trackData["useuri"].lower() == "yes" - if useuri: - url = trackData["path"] - else: - url = tId + ".gz" - dest = os.path.join(self.outdir, url) - self._sort_gff(data, dest) - categ = trackData["category"] - trackDict = { - "type": "FeatureTrack", - "trackId": tId, - "name": trackData["name"], - "assemblyNames": [trackData["assemblyNames"]], - "category": [ - categ, - ], - "adapter": { - "type": "Gff3TabixAdapter", - "gffGzLocation": { - "uri": url, - }, - "index": { - "location": { - "uri": url + ".tbi", - } - }, - }, - "displays": [ - { - "type": "LinearBasicDisplay", - "displayId": "%s-LinearBasicDisplay" % tId, - }, - { - "type": "LinearArcDisplay", - "displayId": "%s-LinearArcDisplay" % tId, - }, - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json - self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) - - def add_bigwig(self, data, trackData): - tId = trackData["label"] - useuri = trackData["useuri"].lower() == "yes" - if useuri: - url = data - else: - url = tId - # slashes in names cause path trouble - dest = os.path.join(self.outdir, url) - cmd = ["cp", data, dest] - self.subprocess_check_call(cmd) - bwloc = {"uri": url} - categ = trackData["category"] - trackDict = { - "type": "QuantitativeTrack", - "trackId": tId, - "name": trackData["name"], - "category": [ - categ, - ], - "assemblyNames": [trackData["assemblyNames"]], - "adapter": { - "type": "BigWigAdapter", - "bigWigLocation": bwloc, - }, - "displays": [ - { - "type": "LinearWiggleDisplay", - "displayId": "%s-LinearWiggleDisplay" % tId, - } - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json - 
self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) + # Not using jbrowse sort-gff because it uses sort and has the problem exposed on https://github.com/tao-bioinfo/gff3sort + cmd = f"gff3sort.pl --precise '{data}' | grep -v \"^$\" > '{dest}'" + self.subprocess_popen(cmd, cwd=False) - def add_bam(self, data, trackData, bam_indexes=None, **kwargs): - tId = trackData["label"] - realFName = trackData["path"] - useuri = trackData["useuri"].lower() == "yes" - categ = trackData["category"] - if useuri: - url = data - else: - fname = tId - dest = "%s/%s" % (self.outdir, fname) - self.subprocess_check_call(["cp", data, dest]) - url = fname - bindex = fname + ".bai" - bi = bam_indexes.split(",") - bam_index = [ - x.split("~~~")[1].strip() - for x in bi - if "~~~" in x and x.split("~~~")[0].strip() == realFName - ] - logging.debug( - "===realFName=%s got %s as bam_indexes %s as bi, %s for bam_index" - % (realFName, bam_indexes, bi, bam_index) - ) - if len(bam_index) > 0 and os.path.exists(os.path.realpath(bam_index[0])): - self.subprocess_check_call(["cp", bam_index[0], bindex]) - else: - cmd = ["samtools", "index", "-b", "-o", bindex, data] - self.subprocess_check_call(cmd) - trackDict = { - "type": "AlignmentsTrack", - "trackId": tId, - "name": trackData["name"], - "category": [ - categ, - ], - "assemblyNames": [trackData["assemblyNames"]], - "adapter": { - "type": "BamAdapter", - "bamLocation": {"uri": url}, - "index": { - "location": { - "uri": bindex, - } - }, - }, - "displays": [ - { - "type": "LinearAlignmentsDisplay", - "displayId": "%s-LinearAlignmentsDisplay" % tId, - }, - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json - self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) - - def add_cram(self, data, trackData, cram_indexes=None, **kwargs): - tId = trackData["label"] - realFName = trackData["path"] - categ = trackData["category"] - useuri = trackData["useuri"].lower() == "yes" - gsa = self.assmeta.get(trackData["assemblyNames"], None) - if gsa: - genseqad = gsa[0]["genome_sequence_adapter"] - else: - genseqad = "Not found" - logging.warning("No adapter found for cram %s in gsa=%s" % (tId, gsa)) - if useuri: - url = data - else: - fname = tId - dest = os.path.join(self.outdir, fname) - url = fname - self.subprocess_check_call(["cp", data, dest]) - ci = cram_indexes.split(",") - cram_index = [ - x.split("~~~")[1].strip() - for x in ci - if "~~~" in x and x.split("~~~")[0].strip() == realFName - ] - logging.debug( - "===realFName=%s got %s as cram_indexes %s as ci, %s for cram_index" - % (realFName, cram_indexes, ci, cram_index) - ) - if len(cram_index) > 0 and os.path.exists(cram_index[0]): - if not os.path.exists(dest + ".crai"): - # most probably made by galaxy and stored in galaxy dirs, need to copy it to dest - self.subprocess_check_call( - ["cp", os.path.realpath(cram_index[0]), dest + ".crai"] - ) - else: - cpath = os.path.realpath(dest) + ".crai" - cmd = ["samtools", "index", "-c", "-o", cpath, os.path.realpath(dest)] - self.subprocess_check_call(cmd) - trackDict = { - "type": "AlignmentsTrack", - "trackId": tId, - "name": trackData["name"], - "category": [ - categ, - ], - "assemblyNames": [trackData["assemblyNames"]], - "adapter": { - "type": "CramAdapter", - "cramLocation": {"uri": url}, - "craiLocation": { - "uri": url + ".crai", - }, - "sequenceAdapter": genseqad, - }, - "displays": [ - { - "type": "LinearAlignmentsDisplay", - 
"displayId": "%s-LinearAlignmentsDisplay" % tId, - }, - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json - self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) - - def add_vcf(self, data, trackData): - tId = trackData["label"] - categ = trackData["category"] - useuri = trackData["useuri"].lower() == "yes" - if useuri: - url = data - else: - url = tId - dest = os.path.join(self.outdir, url) - cmd = ["bgzip", "-c", data] - with open(dest, "wb") as fout: - subprocess.run(cmd, stdout=fout) - cmd = ["tabix", "-f", "-p", "vcf", dest] - self.subprocess_check_call(cmd) - trackDict = { - "type": "VariantTrack", - "trackId": tId, - "name": trackData["name"], - "assemblyNames": [trackData["assemblyNames"]], - "category": [ - categ, - ], - "adapter": { - "type": "VcfTabixAdapter", - "vcfGzLocation": {"uri": url}, - "index": { - "location": { - "uri": url + ".tbi", - } - }, - }, - "displays": [ - { - "type": "LinearVariantDisplay", - "displayId": "%s-LinearVariantDisplay" % tId, - }, - { - "type": "ChordVariantDisplay", - "displayId": "%s-ChordVariantDisplay" % tId, - }, - { - "type": "LinearPairedArcDisplay", - "displayId": "%s-LinearPairedArcDisplay" % tId, - }, - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json - self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) + self.subprocess_check_call(["bgzip", "-f", dest], cwd=False) + self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"], cwd=False) def _sort_bed(self, data, dest): # Only index if not already done if not os.path.exists(dest): cmd = ["sort", "-k1,1", "-k2,2n", data] - ps = subprocess.run(cmd, check=True, capture_output=True) - cmd = ["bgzip", "-c"] - with open(dest, "wb") as fout: - subprocess.run(cmd, input=ps.stdout, stdout=fout) - cmd = ["tabix", "-f", "-p", "bed", dest] - self.subprocess_check_call(cmd) + with open(dest, "w") as handle: + self.subprocess_check_call(cmd, output=handle) + + self.subprocess_check_call(["bgzip", "-f", dest]) + self.subprocess_check_call(["tabix", "-f", "-p", "bed", dest + ".gz"]) - def add_bed(self, data, ext, trackData): - bedPlugin = {"name": "BedScorePlugin", "umdLoc": {"uri": "bedscoreplugin.js"}} - tId = trackData["label"] - categ = trackData["category"] - useuri = trackData["useuri"].lower() == "yes" - if useuri: - url = data - else: - url = tId + ".gz" - dest = os.path.join(self.outdir, url) - self._sort_bed(data, dest) - if True or trackData.get("usebedscore", None): - bedgzlocation = { - "uri": url, - "columnNames": ["chr", "start", "end", "name", "score"], - "scoreColumn": "score", - } - else: - bedgzlocation = { - "uri": url, - } - trackDict = { - "type": "FeatureTrack", - "trackId": tId, - "name": trackData["name"], - "assemblyNames": [trackData["assemblyNames"]], - "adapter": { - "category": [ - categ, - ], - "type": "BedTabixAdapter", - "bedGzLocation": bedgzlocation, - "index": { - "location": { - "uri": url + ".tbi", - }, - }, - }, - "displays": [ - { - "type": "LinearBasicDisplay", - "displayId": "%s-LinearBasicDisplay" % tId, - "renderer": { - "type": "SvgFeatureRenderer", - "color1": "jexl:customColor(feature)", - }, - }, - { - "type": "LinearPileupDisplay", - "displayId": "%s-LinearPileupDisplay" % tId, - }, - { - "type": "LinearArcDisplay", - "displayId": "%s-LinearArcDisplay" % tId, - }, - ], - } - style_json = self._prepare_track_style(trackDict) - trackDict["style"] = style_json 
- if self.config_json.get("plugins", None): - self.config_json["plugins"].append(bedPlugin) - else: - self.config_json["plugins"] = [ - bedPlugin, - ] - self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) - self.trackIdlist.append(tId) + def _convert_maf(self, data, dest, assembly_name): + # Only convert if not already done + if not os.path.exists(dest): + + dest_bed = dest + ".bed" + cmd = ["python", os.path.join(SELF_LOCATION, "maf2bed.py"), assembly_name, data, dest_bed] + self.subprocess_check_call(cmd, cwd=False) - def add_paf(self, data, trackData, pafOpts, **kwargs): - canPIF = True - tname = trackData["name"] - tId = trackData["label"] - url = tId - usePIF = False # much faster if indexed remotely or locally but broken in biocontainer. - useuri = data.startswith("http://") or data.startswith("https://") - if not useuri: - if canPIF: - fakeName = os.path.join(self.outdir, "%s.paf" % tId) - url = "%s.pif.gz" % tId - cmd = ["cp", data, fakeName] - self.subprocess_check_call(cmd) - e = os.environ - e["SHELL"] = "/bin/sh" - cmd = [ - "jbrowse", - "make-pif", - fakeName, - ] - subprocess.run(cmd, env=e, shell=True) - usePIF = True - else: - dest = os.path.join(self.outdir, url) - self.symlink_or_copy(os.path.realpath(data), dest) - else: - url = data - if data.endswith(".pif.gz") or data.endswith(".paf.gz"): # is tabix - usePIF = True - categ = trackData["category"] - pg = pafOpts["genome"].split(",") - pgc = [x.strip() for x in pg if x.strip() > ""] - gnomes = [x.split("~~~") for x in pgc] - logging.debug("pg=%s, gnomes=%s" % (pg, gnomes)) - passnames = [trackData["assemblyNames"]] # always first - for i, (gpath, gname) in enumerate(gnomes): - # may have been forgotten by user for uri - if len(gname) == 0: - gn = os.path.basename(gpath) - gname = os.path.splitext(gn)[0] - # trouble from spacey names in command lines avoidance - if len(gname.split()) > 1: - gname = gname.split()[0] - if gname not in passnames: - passnames.append(gname) - useuri = pafOpts["useuri"] == "true" - if gname not in self.genome_names: - # ignore if already there - eg for duplicates among pafs. 
-                asstrack, first_contig = self.make_assembly(gpath, gname, useuri)
-                self.genome_names.append(gname)
-                self.tracksToAdd[gname] = []
-                self.assemblies.append(copy.copy(asstrack))
-                self.ass_first_contigs.append(copy.copy(first_contig))
-        trackDict = {
-            "type": "SyntenyTrack",
-            "trackId": tId,
-            "assemblyNames": passnames,
-            "category": [
-                categ,
-            ],
-            "name": tname,
-            "displays": [
-                {
-                    "type": "LGVSyntenyDisplay",
-                    "displayId": "%s-LGVSyntenyDisplay" % tId,
-                },
-                {
-                    "type": "DotplotDisplay",
-                    "displayId": "%s-DotplotDisplay" % tId,
-                },
-                {
-                    "type": "LinearComparativeDisplay",
-                    "displayId": "%s-LinearComparativeDisplay" % tId,
-                },
-                {
-                    "type": "LinearBasicDisplay",
-                    "displayId": "%s-LinearSyntenyDisplay" % tId,
-                },
-            ],
-        }
-        if usePIF:
-            trackDict["adapter"] = {
-                "type": "PairwiseIndexedPAFAdapter",
-                "pifGzLocation": {"uri": url},
-                "assemblyNames": passnames,
-                "index": {
-                    "location": {
-                        "uri": url + ".tbi",
-                    }
-                },
-            }
-        else:
-            trackDict["adapter"] = {
-                "type": "PAFAdapter",
-                "pafLocation": {"uri": url},
-                "assemblyNames": passnames,
-            }
-        if not usePIF:
-            style_json = {
-                "type": "LGVSyntenyDisplay",
-                "displayId": "%s-LGVSyntenyDisplay" % tId,
-            }
-        else:
-            style_json = {
-                "type": "LinearBasicDisplay",
-                "displayId": "%s-LinearBasicDisplay" % tId,
-            }
-        trackDict["style"] = style_json
-        self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict))
-        self.trackIdlist.append(tId)
+            cmd = ["sort", "-k1,1", "-k2,2n", dest_bed]
+            with open(dest, "w") as handle:
+                self.subprocess_check_call(cmd, output=handle)
+
+            self.subprocess_check_call(["bgzip", "-f", dest], cwd=False)
+            self.subprocess_check_call(["tabix", "-f", "-p", "bed", dest + ".gz"], cwd=False)

-    def process_annotations(self, track):
+    def process_annotations(self, track, parent):
        category = track["category"].replace("__pd__date__pd__", TODAY)
-        tt1 = ",/ :;\\"
-        tt2 = "______"
-        labttab = str.maketrans(tt1, tt2)
-        for trackIndex, (
+
+        track_labels = []
+
+        for i, (
            dataset_path,
            dataset_ext,
-            useuri,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
-            if not dataset_path.strip().startswith("http"):
-                # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
-                for key, value in mapped_chars.items():
-                    track_human_label = track_human_label.replace(value, key)
-                track_human_label = track_human_label.translate(labttab)
+            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
+            for key, value in mapped_chars.items():
+                track_human_label = track_human_label.replace(value, key)
+
+            is_multi = type(dataset_path) is list
+
+            log.info(
+                f"-----> Processing track {category} / {track_human_label} ({dataset_ext}, {len(dataset_path) if is_multi else 1} files)"
+            )
+
            outputTrackConfig = {
                "category": category,
-                "style": {},
            }

-            outputTrackConfig["assemblyNames"] = track["assemblyNames"]
            outputTrackConfig["key"] = track_human_label
-            outputTrackConfig["useuri"] = useuri
-            outputTrackConfig["path"] = dataset_path
-            outputTrackConfig["ext"] = dataset_ext
-            outputTrackConfig["trackset"] = track.get("trackset", {})
-            outputTrackConfig["label"] = track["label"]
+            # We add extra data to hash for the case of non-file tracks
+            if (
+                "conf" in track
+                and "options" in track["conf"]
+                and "url" in track["conf"]["options"]
+            ):
+                non_file_info = track["conf"]["options"]["url"]
+            else:
+                non_file_info = ""
+
+            # I chose to use track['category'] instead of 'category' here. This
+            # is intentional. This way re-running the tool on a different date
+            # will not generate different hashes and make comparison of outputs
+            # much simpler.
+            hashData = [
+                str(dataset_path),
+                track_human_label,
+                track["category"],
+                non_file_info,
+                parent["uniq_id"],
+            ]
+            hashData = "|".join(hashData).encode("utf-8")
+            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + f"_{track['track_num']}_{i}"

            outputTrackConfig["metadata"] = extra_metadata
-            outputTrackConfig["name"] = track_human_label
-
-            if track["label"] in self.trackIdlist:
-                logging.error(
-                    "### not adding %s already in %s"
-                    % (track["label"], self.trackIdlist)
-                )
-                yield None
+
+            outputTrackConfig["style"] = track["style"]
+
+            outputTrackConfig["formatdetails"] = track["formatdetails"]
+
+            outputTrackConfig["remote"] = track["remote"]
+
+            # Guess extension for remote data
+            if dataset_ext == "gff,gff3,bed":
+                if dataset_path.endswith(".bed") or dataset_path.endswith(".bed.gz"):
+                    dataset_ext = "bed"
+                else:
+                    dataset_ext = "gff"
+            elif dataset_ext == "vcf,vcf_bgzip":
+                if dataset_path.endswith(".vcf.gz"):
+                    dataset_ext = "vcf_bgzip"
+                else:
+                    dataset_ext = "vcf"
+
            if dataset_ext in ("gff", "gff3"):
                self.add_gff(
+                    parent,
                    dataset_path,
+                    dataset_ext,
                    outputTrackConfig,
-                )
-            elif dataset_ext in ("hic", "juicebox_hic"):
-                outputTrackConfig["wasCool"] = False
-                self.add_hic(
-                    dataset_path,
-                    outputTrackConfig,
+                    track["conf"]["options"]["gff"],
                )
-            elif dataset_ext in ("cool", "mcool", "scool"):
-                hic_url = outputTrackConfig["label"]
-                hic_path = os.path.join(self.outdir, hic_url) + ".hic"
-                outputTrackConfig["wasCool"] = True
-                self.subprocess_check_call(
-                    [
-                        "hictk",
-                        "convert",
-                        "-f",
-                        "--output-fmt",
-                        "hic",
-                        dataset_path,
-                        hic_path,
-                    ]
-                )
-                self.add_hic(
-                    hic_path,
-                    outputTrackConfig,
-                )
-            elif dataset_ext in ("bed",):
+            elif dataset_ext == "bed":
                self.add_bed(
+                    parent,
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
-                )
-            elif dataset_ext in ("maf",):
-                self.add_maf(
-                    dataset_path,
-                    outputTrackConfig,
+                    track["conf"]["options"]["gff"],
                )
            elif dataset_ext == "bigwig":
-                self.add_bigwig(
-                    dataset_path,
-                    outputTrackConfig,
+                if is_multi:
+                    self.add_bigwig_multi(
+                        parent,
+                        dataset_path, outputTrackConfig, track["conf"]["options"]["wiggle"]
+                    )
+                else:
+                    self.add_bigwig(
+                        parent,
+                        dataset_path, outputTrackConfig, track["conf"]["options"]["wiggle"]
+                    )
+            elif dataset_ext == "maf":
+                self.add_maf(
+                    parent,
+                    dataset_path, outputTrackConfig, track["conf"]["options"]["maf"]
                )
            elif dataset_ext == "bam":
-                real_indexes = track["conf"]["options"]["bam"]["bam_index"]
-                self.add_bam(
+
+                if track["remote"]:
+                    bam_index = dataset_path + '.bai'
+                else:
+                    real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
+                        "bam_index"
+                    ]
+                    if not isinstance(real_indexes, list):
+                        # <bam_indices>
+                        #    <bam_index>/path/to/a.bam.bai</bam_index>
+                        # </bam_indices>
+                        #
+                        # The above will result in the 'bam_index' key containing a
+                        # string. If there are two or more indices, the container
+                        # becomes a list. Fun!
+                        real_indexes = [real_indexes]
+
+                    bam_index = real_indexes[i]
+
+                self.add_xam(
+                    parent,
                    dataset_path,
                    outputTrackConfig,
-                    bam_indexes=real_indexes,
+                    track["conf"]["options"]["pileup"],
+                    index=bam_index,
+                    ext="bam",
                )
            elif dataset_ext == "cram":
-                real_indexes = track["conf"]["options"]["cram"]["cram_index"]
-                self.add_cram(
+
+                if track["remote"]:
+                    cram_index = dataset_path + '.crai'
+                else:
+                    real_indexes = track["conf"]["options"]["cram"]["cram_indices"][
+                        "cram_index"
+                    ]
+                    if not isinstance(real_indexes, list):
+                        # <bam_indices>
+                        #    <bam_index>/path/to/a.bam.bai</bam_index>
+                        # </bam_indices>
+                        #
+                        # The above will result in the 'bam_index' key containing a
+                        # string. If there are two or more indices, the container
+                        # becomes a list. Fun!
+                        real_indexes = [real_indexes]
+
+                    cram_index = real_indexes[i]
+
+                self.add_xam(
+                    parent,
                    dataset_path,
                    outputTrackConfig,
-                    cram_indexes=real_indexes,
+                    track["conf"]["options"]["cram"],
+                    index=cram_index,
+                    ext="cram",
                )
            elif dataset_ext == "vcf":
-                self.add_vcf(dataset_path, outputTrackConfig)
-            elif dataset_ext == "paf":
+                self.add_vcf(
+                    parent,
+                    dataset_path,
+                    outputTrackConfig
+                )
+            elif dataset_ext == "vcf_bgzip":
+                self.add_vcf(
+                    parent,
+                    dataset_path,
+                    outputTrackConfig,
+                    zipped=True
+                )
+            elif dataset_ext == "paf":  # https://fr.wikipedia.org/wiki/Paf_le_chien
                self.add_paf(
+                    parent,
                    dataset_path,
                    outputTrackConfig,
-                    track["conf"]["options"]["paf"],
+                    track["conf"]["options"]["synteny"]
+                )
+            elif dataset_ext in ("hic",):
+                self.add_hic(
+                    parent,
+                    dataset_path,
+                    outputTrackConfig,
+                    track["conf"]["options"]["hic"]
+                )
+            elif dataset_ext == "sparql":
+                sparql_query = track["conf"]["options"]["sparql"]["query"]
+                for key, value in mapped_chars.items():
+                    sparql_query = sparql_query.replace(value, key)
+                sparql_query_refnames = track["conf"]["options"]["sparql"].get("query_refnames", "")
+                if sparql_query_refnames:
+                    for key, value in mapped_chars.items():
+                        sparql_query_refnames = sparql_query_refnames.replace(value, key)
+                self.add_sparql(
+                    parent,
+                    track["conf"]["options"]["sparql"]["url"],
+                    sparql_query,
+                    sparql_query_refnames,
+                    outputTrackConfig,
+                )
+            elif dataset_ext == "gc":
+                self.add_gc_content(
+                    parent,
+                    outputTrackConfig,
                )
            else:
-                logging.warning("Do not know how to handle %s", dataset_ext)
-            # Return non-human label for use in other fields
-            yield outputTrackConfig["label"]
+                log.error(f"Do not know how to handle {dataset_ext}")
+
+            track_labels.append(outputTrackConfig["label"])
+
+        # Return non-human label for use in other fields
+        return track_labels
+
+    def add_default_view_genome(self, genome, default_loc, tracks_on):

-    def add_default_session(self, default_data):
-        """
-        default session settings are hard and fragile.
-        .add_default_view() and other configuration code adapted from
-        https://github.com/abretaud/tools-iuc/blob/jbrowse2/tools/jbrowse2/jbrowse2.py
-        """
-        # TODO using the default session for now, but check out session specs in the future https://github.com/GMOD/jbrowse-components/issues/2708
-        bpPerPx = (
-            self.bpPerPx
-        )  # Browser window width is unknown and default session cannot be used to figure it out in JB2 code so could be 200-2000+ pixels.
-        track_types = {}
-        with open(self.config_json_file, "r") as config_file:
-            config_json = json.load(config_file)
-        if self.config_json:
-            config_json.update(self.config_json)
-        if "defaultSession" in config_json:
-            session_json = config_json["defaultSession"]
-            session_views = []
+        refName = ""
+        start = end = None
+        if default_loc:
+            loc_match = re.search(r"^(\w+):(\d+)\.+(\d+)$", default_loc)
+            if loc_match:
+                refName = loc_match.group(1)
+                start = int(loc_match.group(2))
+                end = int(loc_match.group(3))
+
+        if not refName and self.assembly_ids[genome['uniq_id']]:
+            refName = self.assembly_ids[genome['uniq_id']]
+
+        if start and end:
+            loc_str = f"{refName}:{start}-{end}"
        else:
-            session_json = {}
-            session_views = []
-        for gnome in self.assmeta.keys():  # assemblies have their own tracks
-            tracks_data = []
-            for track_conf in self.tracksToAdd[gnome]:
-                tId = track_conf["trackId"]
-                if tId in default_data[gnome]["visibility"]["default_on"]:
-                    track_types[tId] = track_conf["type"]
-                    style_data = default_data[gnome]["style"].get(tId, {})
-                    if not style_data:
-                        logging.debug(
-                            "No style data for %s in available default data %s"
-                            % (tId, default_data)
-                        )
-                    else:
-                        logging.debug("style data for %s = %s" % (tId, style_data))
-                    if style_data.get("type", None) is None:
-                        style_data["type"] = "LinearBasicDisplay"
-                    if "displays" in track_conf:
-                        disp = track_conf["displays"][0]["type"]
-                        style_data["type"] = disp
-                    if track_conf.get("displays", None):
-                        style_data["configuration"] = track_conf["displays"][0][
-                            "displayId"
+            loc_str = refName
+
+        # Updating an existing jbrowse instance, merge with pre-existing view
+        view_specs = None
+        if self.update:
+            for existing in self.default_views.values():
+                if len(existing) and existing["type"] == "LinearGenomeView":
+                    if existing['init']['assembly'] == genome['uniq_id']:
+                        view_specs = existing
+                        if loc_str:
+                            view_specs['init']['loc'] = loc_str
+                        view_specs['init']['tracks'].extend(tracks_on)
+
+        if view_specs is None:  # Not updating, or updating from synteny
+            view_specs = {
+                "type": "LinearGenomeView",
+                "init": {
+                    "assembly": genome['uniq_id'],
+                    "loc": loc_str,
+                    "tracks": tracks_on
+                }
+            }
+
+        return view_specs
+
+    def add_default_view_synteny(self, genome_views, synteny_tracks):
+
+        # Add json for cached synteny tracks
+        # We cache them because we need to know the target genome uniq_id
+        for strack in synteny_tracks:
+
+            # Target assembly is the next genome, find its uniq_id
+            query_assembly = strack["assemblyNames"][0]
+            ass_uniq_ids = list(self.assembly_ids.keys())
+            query_index = ass_uniq_ids.index(query_assembly)
+            target_assembly = ass_uniq_ids[query_index + 1]
+
+            strack["assemblyNames"][1] = target_assembly
+            strack["adapter"]["assemblyNames"][1] = target_assembly
+
+            self.subprocess_check_call(
+                [
+                    "jbrowse",
+                    "add-track-json",
+                    "--target",
+                    self.outdir,
+                    json.dumps(strack),
+                ]
+            )
+
+        # Configure the synteny view
+        levels = []
+
+        for strack in synteny_tracks:
+            lev = {
+                "type": "LinearSyntenyViewHelper",
+                "tracks": [
+                    {
+                        "type": "SyntenyTrack",
+                        "configuration": strack["trackId"],
+                        "displays": [
+                            {
+                                "type": "LinearSyntenyDisplay",
+                                "configuration": strack["trackId"] + "_LinearSyntenyDisplay"
+                            }
                        ]
-                    else:
-                        logging.debug("no display in track_conf for %s" % tId)
-                    if track_conf.get("style_labels", None):
-                        # TODO fix this: it should probably go in a renderer block (SvgFeatureRenderer) but still does not work
-                        # TODO move this to per track displays?
- style_data["labels"] = track_conf["style_labels"] - tracks_data.append( - { - "type": track_types[tId], - "configuration": tId, - "displays": [style_data], - } - ) - first = [x for x in self.ass_first_contigs if x[0] == gnome] - drdict = { - "reversed": False, - "assemblyName": gnome, + } + ], + "height": 100, + "level": len(levels) } - if len(first) > 0: - [gnome, refName, end] = first[0] - drdict["refName"] = refName - drdict["start"] = 0 - end = int(end) - drdict["end"] = end - else: - ddl = default_data.get("defaultLocation", None) - if ddl: - loc_match = re.search(r"^([^:]+):([\d,]*)\.*([\d,]*)$", ddl) - # allow commas like 100,000 but ignore as integer - if loc_match: - refName = loc_match.group(1) - drdict["refName"] = refName - if loc_match.group(2) > "": - drdict["start"] = int(loc_match.group(2).replace(",", "")) - if loc_match.group(3) > "": - drdict["end"] = int(loc_match.group(3).replace(",", "")) - else: - logging.info( - "@@@ regexp could not match contig:start..end in the supplied location %s - please fix" - % ddl - ) - view_json = { - "type": "LinearGenomeView", - "offsetPx": 0, - "bpPerPx": bpPerPx, - "minimized": False, - "tracks": tracks_data, - } - if drdict.get("refName", None): - # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome - view_json["displayedRegions"] = [ - drdict, - ] - logging.info("@@@ defaultlocation %s for default session" % drdict) - else: - logging.info( - "@@@ no track location for default session - please add one!" - ) - session_views.append(view_json) - session_name = default_data.get("session_name", "New session") - session_json["name"] = session_name + levels.append(lev) - if "views" not in session_json: - session_json["views"] = session_views - else: - session_json["views"] += session_views + view_specs = { + "type": "LinearSyntenyView", + "views": genome_views, + "levels": levels + } - pp = json.dumps(session_views, indent=2) - config_json["defaultSession"] = session_json - self.config_json.update(config_json) - logging.debug("defaultSession=%s" % (pp)) - with open(self.config_json_file, "w") as config_file: - json.dump(self.config_json, config_file, indent=2) + return view_specs - def add_defsess_to_index(self, data): + def add_default_session(self, default_views): """ - ---------------------------------------------------------- Add some default session settings: set some assemblies/tracks on/off This allows to select a default view: @@ -1447,8 +1423,11 @@ - default tracks - ... 
-        Different methods to do that were tested/discussed:
-        - using a defaultSession item in config.json: this proved to be difficult:
+        Now using this method:
+        https://github.com/GMOD/jbrowse-components/pull/4907
+
+        Different methods that were tested/discussed earlier:
+        - using a defaultSession item in config.json before PR 4970: this proved to be difficult:
          forced to write a full session block, including hard-coded/hard-to-guess items,
          no good way to let Jbrowse2 display a scaffold without knowing its size
        - using JBrowse2 as an embedded React component in a tool-generated html file:
@@ -1456,57 +1435,41 @@
        - writing a session-spec inside the config.json file:
          this is not yet supported as of 2.10.2 (see PR 4148 below)
          a session-spec is a kind of simplified defaultSession where
          you don't need to specify every aspect of the session
        - passing a session-spec through URL params by embedding the JBrowse2 index.html
          inside an iframe
        - we selected this option

        Xrefs to understand the choices:
        https://github.com/GMOD/jbrowse-components/issues/2708
        https://github.com/GMOD/jbrowse-components/discussions/3568
        https://github.com/GMOD/jbrowse-components/pull/4148
        """
-        new_index = "Nothing written"
-        session_spec = {"views": []}
-        logging.debug("def ass_first=%s\ndata=%s" % (self.ass_first_contigs, data))
-        for first_contig in self.ass_first_contigs:
-            logging.debug("first contig=%s" % self.ass_first_contigs)
-            [gnome, refName, end] = first_contig
-            start = 0
-            aview = {
-                "assembly": gnome,
-                "loc": "{}:{}..{}".format(refName, start, end),
-                "type": "LinearGenomeView",
-                "tracks": data[gnome]["tracks"],
-            }
-            session_spec["views"].append(aview)
-        sess = json.dumps(session_spec, sort_keys=True, indent=2)
-        new_index = INDEX_TEMPLATE.replace(
-            "__SESSION_SPEC__", "&session=spec-{}".format(sess)
-        )
+
+        if self.use_synteny_viewer:
+            session_name = "Synteny"
+        else:
+            session_name = ', '.join(x['init']['assembly'] for x in default_views)

-        os.rename(
-            os.path.join(self.outdir, "index.html"),
-            os.path.join(self.outdir, "index_noview.html"),
-        )
+        session_spec = {
+            "name": session_name,
+            "views": default_views
+        }

-        with open(os.path.join(self.outdir, "index.html"), "w") as nind:
-            nind.write(new_index)
-        logging.debug(
-            "#### add_defsession gnome=%s refname=%s\nsession_spec=%s\nnew_index=%s"
-            % (gnome, refName, sess, new_index)
-        )
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
+
+        config_json["defaultSession"].update(session_spec)
+
+        with open(config_path, "w") as config_file:
+            json.dump(config_json, config_file, indent=2)

    def add_general_configuration(self, data):
        """
        Add some general configuration to the config.json file
        """

-        config_path = self.config_json_file
-        if os.path.exists(config_path):
-            with open(config_path, "r") as config_file:
-                config_json = json.load(config_file)
-        else:
-            config_json = {}
-        if self.config_json:
-            config_json.update(self.config_json)
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
+
        config_data = {}

        config_data["disableAnalytics"] = data.get("analytics", "false") == "true"
@@ -1520,240 +1483,268 @@
            },
            "typography": {"fontSize": int(data.get("font_size", 10))},
        }
-        if not config_json.get("configuration", None):
-            config_json["configuration"] = {}
+
        config_json["configuration"].update(config_data)
-        self.config_json.update(config_json)
+
        with open(config_path, "w") as config_file:
-            json.dump(self.config_json, config_file, indent=2)
+            json.dump(config_json, config_file, indent=2)
+
+    def add_plugins(self, data):
+        """
+        Add plugins to the config.json file
+        """
-    def clone_jbrowse(self, realclone=False):
-        """
-        Clone a JBrowse directory into a destination directory.
+        config_path = os.path.join(self.outdir, "config.json")
+        with open(config_path, "r") as config_file:
+            config_json = json.load(config_file)
+
+        if "plugins" not in config_json:
+            config_json["plugins"] = []
-        `realclone=true` will use the `jbrowse create` command.
-        To allow running on internet-less compute and for reproducibility
-        use frozen code with `realclone=false
+        config_json["plugins"].extend(data)
+        with open(config_path, "w") as config_file:
+            json.dump(config_json, config_file, indent=2)
+
+    def clone_jbrowse(self, jbrowse_dir, destination):
        """
-        dest = self.outdir
-        if (not os.path.exists(self.jbrowse2path)) or realclone:
-            e = os.environ
-            e["SHELL"] = "/bin/sh"
-            cmd = ["jbrowse", "create", dest, "-f", "--tag", f"{JB2VER}"]
-            subprocess.run(cmd, env=e, shell=True)
+        Clone a JBrowse directory into a destination directory.
+
+        Not using `jbrowse create` command to allow running on internet-less compute
+        to make sure code is frozen
+        """
+
+        copytree(jbrowse_dir, destination)
+        try:
+            shutil.rmtree(os.path.join(destination, "test_data"))
+        except OSError as e:
+            log.error(f"Error: {e.filename} - {e.strerror}.")
+
+        if not os.path.exists(os.path.join(destination, "data")):
+            # It can already exist if upgrading an instance
+            os.makedirs(os.path.join(destination, "data"))
+            log.info(f"makedir {os.path.join(destination, 'data')}")
+
+        os.symlink("./data/config.json", os.path.join(destination, "config.json"))
+
+
+def copytree(src, dst, symlinks=False, ignore=None):
+    for item in os.listdir(src):
+        s = os.path.join(src, item)
+        d = os.path.join(dst, item)
+        if os.path.isdir(s):
+            shutil.copytree(s, d, symlinks, ignore)
        else:
-            shutil.copytree(self.jbrowse2path, dest, dirs_exist_ok=True)
-        for fn in [
-            "asset-manifest.json",
-            "favicon.ico",
-            "robots.txt",
-            "umd_plugin.js",
-            "version.txt",
-            "test_data",
-        ]:
-            try:
-                path = os.path.join(dest, fn)
-                if os.path.isdir(path):
-                    shutil.rmtree(path)
-                else:
-                    os.remove(path)
-            except OSError as e:
-                log.error("Error: %s - %s." % (e.filename, e.strerror))
-        for neededfile in ["jb2_webserver.py", "bedscoreplugin.js"]:
-            shutil.copyfile(
-                os.path.join(INSTALLED_TO, neededfile), os.path.join(dest, neededfile)
-            )
+            shutil.copy2(s, d)


def parse_style_conf(item):
-    if item.text.lower() in ["false", "true", "yes", "no"]:
-        return item.text.lower in ("yes", "true")
-    elif item.text.isdigit():
-        return int(item.text)
-    return item.text
+    if "type" in item.attrib and item.attrib["type"] in ["boolean", "integer"]:
+        if item.attrib["type"] == "boolean":
+            return item.text in ("yes", "true", "True")
+        elif item.attrib["type"] == "integer":
+            return int(item.text)
+    else:
+        return item.text
+
+
+def validate_synteny(real_root):
+
+    if len(real_root.findall('assembly/tracks/track[@format="synteny"]')) == 0:
+        # No synteny data, all good
+        return False
+
+    assemblies = real_root.findall("assembly")
+
+    if len(assemblies[-1].findall('tracks/track[@format="synteny"]')) > 0 and \
+       assemblies[-1].find('tracks/track[@format="synteny"]/options/style/display').text == "LinearSyntenyDisplay":
+        raise RuntimeError("You should not set a synteny track on the last genome.")
+
+    for assembly in assemblies[:-1]:
+        if len(assembly.findall('tracks/track[@format="synteny"]')) != 1 and \
+           assembly.find('tracks/track[@format="synteny"]/options/style/display').text == "LinearSyntenyDisplay":
+            raise RuntimeError("To use the synteny viewer, you should add a synteny track to each assembly, except the last one.")
+
+    return True


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="", epilog="")
-    parser.add_argument("--xml", help="Track Configuration")
-    parser.add_argument(
-        "--jbrowse2path", help="Path to JBrowse2 directory in BioContainer or Conda"
-    )
+    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")
+
+    parser.add_argument('--jbrowse', help='Folder containing a jbrowse release')
+    parser.add_argument("--update", help="Update an existing JBrowse2 instance", action="store_true")
    parser.add_argument("--outdir", help="Output directory", default="out")
-    parser.add_argument("--version", "-V", action="version", version=JB2VER)
    args = parser.parse_args()
-    tree = ET.parse(args.xml)
-    root = tree.getroot()
-    removeMe = string.punctuation.replace(".", " ").replace("/", "").replace("-", "")
-    # first is a space because space needs to be added here for removal from labels as paths.
-    nopunct = str.maketrans(dict.fromkeys(removeMe))
+
+    tree = ET.parse(args.xml.name)
+    real_root = tree.getroot()
+
    # This should be done ASAP
-    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol
+    GALAXY_INFRASTRUCTURE_URL = real_root.find("metadata/galaxyUrl").text
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL

-    jc = JbrowseConnector(outdir=args.outdir, jbrowse2path=args.jbrowse2path)
+    jc = JbrowseConnector(
+        jbrowse=args.jbrowse,
+        outdir=args.outdir,
+        update=args.update,
+    )
+
+    # Synteny options are special, check them first
+    jc.use_synteny_viewer = validate_synteny(real_root)
+
+    for assembly in real_root.findall("assembly"):
+        genome_el = assembly.find('genome')
+
+        is_remote = genome_el.attrib.get("remote", "false") == "true"

-    default_session_data = {}
-    trackI = 0
-    for ass in root.findall("assembly"):
-        genomes = [
-            {
-                "path": x.attrib["path"],
-                "label": x.attrib["label"].split(" ")[0].translate(nopunct),
-                "useuri": x.attrib["useuri"],
-                "meta": metadata_from_node(x.find("metadata")),
-            }
-            for x in ass.findall("metadata/genomes/genome")
-        ]
-        primaryGenome = jc.process_genomes(genomes)
-        if not default_session_data.get(primaryGenome, None):
-            default_session_data[primaryGenome] = {
-                "tracks": [],
-                "style": {},
-                "style_labels": {},
-                "visibility": {
-                    "default_on": [],
-                    "default_off": [],
-                },
-            }
-        for track in ass.find("tracks"):
+        genome = {
+            "path": genome_el.attrib["path"] if is_remote else os.path.realpath(genome_el.attrib["path"]),
+            "meta": metadata_from_node(genome_el.find("metadata")),
+            "label": genome_el.attrib["label"],
+        }
+
+        cytobands = None
+        cytobands_el = genome_el.find("cytobands")
+        if cytobands_el is not None and "path" in cytobands_el.attrib:
+            cytobands = cytobands_el.attrib["path"]
+
+        ref_name_aliases = None
+        ref_name_aliases_el = genome_el.find("ref_name_aliases")
+        if ref_name_aliases_el is not None and "path" in ref_name_aliases_el.attrib:
+            ref_name_aliases = ref_name_aliases_el.attrib["path"]
+
+        log.debug("Processing genome %s", genome)
+        genome["uniq_id"] = jc.add_assembly(genome["path"], genome["label"], is_remote, cytobands, ref_name_aliases)
+
+        default_tracks_on = []
+
+        track_num = 0
+        for track in assembly.findall("tracks/track"):
            track_conf = {}
            track_conf["trackfiles"] = []
-            track_conf["assemblyNames"] = primaryGenome
-            is_multi_bigwig = False
+            track_conf["track_num"] = track_num
+
+            trackfiles = track.findall("files/trackFile") or []
+
+            is_multi = False
+            multi_paths = []
+            multi_type = None
+            multi_metadata = {}
            try:
-                if track.find("options/wiggle/multibigwig") and (
-                    track.find("options/wiggle/multibigwig").text == "True"
-                ):
-                    is_multi_bigwig = True
-                    multi_bigwig_paths = []
+                multi_in_xml = track.find("options/multitrack")
+                if multi_in_xml is not None and parse_style_conf(multi_in_xml):
+                    is_multi = True
+                    multi_paths = []
+                    multi_type = trackfiles[0].attrib["ext"]
            except KeyError:
                pass

-            trackfiles = track.findall("files/trackFile")
+            is_remote = False
            if trackfiles:
                for x in trackfiles:
-                    isBed = False
-                    if x.attrib["ext"] == "bed":
-                        isBed = True
-                    track_conf["label"] = "%s_%d" % (
-                        x.attrib["label"].translate(nopunct),
-                        trackI,
-                    )
-                    trackI += 1
-                    track_conf["useuri"] = x.attrib["useuri"]
                    if is_multi:
-                        multi_bigwig_paths.append(
-                            (
-                                track_conf["label"].translate(nopunct),
-                                track_conf["useuri"],
-                                os.path.realpath(x.attrib["path"]),
-                            )
+                        is_remote = x.attrib.get("remote", "false") == "true"
+                        multi_paths.append(
+                            (x.attrib["label"], x.attrib["path"] if is_remote else os.path.realpath(x.attrib["path"]))
                        )
+                        multi_metadata.update(metadata_from_node(x.find("metadata")))
                    else:
                        metadata = metadata_from_node(x.find("metadata"))
-                        track_conf["dataset_id"] = metadata.get("dataset_id", "None")
metadata.get("dataset_id", "None") - if x.attrib["useuri"].lower() == "yes": - tfa = ( - x.attrib["path"], + is_remote = x.attrib.get("remote", "false") == "true" + track_conf["trackfiles"].append( + ( + x.attrib["path"] if is_remote else os.path.realpath(x.attrib["path"]), x.attrib["ext"], - x.attrib["useuri"], - track_conf["label"], + x.attrib["label"], metadata, ) - else: - tfa = ( - os.path.realpath(x.attrib["path"]), - x.attrib["ext"], - x.attrib["useuri"], - track_conf["label"], - metadata, - ) - track_conf["trackfiles"].append(tfa) + ) + else: + # For tracks without files (sparql, gc) + track_conf["trackfiles"].append( + ( + "", # N/A, no path for sparql or gc + track.attrib["format"], + track.find("options/label").text, + {}, + ) + ) - if is_multi_bigwig: - metadata = metadata_from_node(x.find("metadata")) + if is_multi: + etal_tracks_nb = len(multi_paths[1:]) + multi_label = f"{multi_paths[0][0]} + {etal_tracks_nb} other track{'s' if etal_tracks_nb > 1 else ''}" - track_conf["trackfiles"].append( - ( - multi_bigwig_paths, # Passing an array of paths to represent as one track - "bigwig_multiple", - "MultiBigWig", # Giving an hardcoded name for now - {}, # No metadata for multiple bigwig - ) + track_conf["trackfiles"].append( + ( + multi_paths, # Passing an array of paths to represent as one track + multi_type, # First file type + multi_label, # First file label + multi_metadata, # Mix of all metadata for multiple bigwig => only last file metadata coming from galaxy + custom oness ) + ) track_conf["category"] = track.attrib["cat"] track_conf["format"] = track.attrib["format"] + track_conf["style"] = { + item.tag: parse_style_conf(item) for item in (track.find("options/style") or []) + } + + track_conf["style"] = { + item.tag: parse_style_conf(item) for item in (track.find("options/style") or []) + } + + track_conf["style_labels"] = { + item.tag: parse_style_conf(item) + for item in (track.find("options/style_labels") or []) + } + track_conf["formatdetails"] = { + item.tag: parse_style_conf(item) for item in (track.find("options/formatdetails") or []) + } + track_conf["conf"] = etree_to_dict(track.find("options")) - keys = jc.process_annotations(track_conf) - if keys: - for key in keys: - vis = track.attrib.get("visibility", "default_off") - if not vis: - vis = "default_off" - default_session_data[primaryGenome]["visibility"][vis].append(key) - trakdat = jc.tracksToAdd[primaryGenome] - stile = {} - for trak in trakdat: - if trak["trackId"] == key: - stile = trak.get("style", {}) - if len(track.find("options/style")) > 0: - for item in track.find("options/style"): - if item.text: - stile[item.tag] = parse_style_conf(item) - logging.debug("stile=%s" % stile) - default_session_data[primaryGenome]["style"][key] = stile - default_session_data[primaryGenome]["tracks"].append(key) - default_session_data["defaultLocation"] = root.find( - "metadata/general/defaultLocation" - ).text - default_session_data["session_name"] = root.find( - "metadata/general/session_name" - ).text - logging.debug("default_session=%s" % (json.dumps(default_session_data, indent=2))) - jc.zipOut = root.find("metadata/general/zipOut").text == "true" - jc.bpPerPx = int(root.find("metadata/general/bpPerPx").text) + + track_conf["remote"] = is_remote + + track_labels = jc.process_annotations(track_conf, genome) + + if track.attrib["visibility"] == "default_on": + for tlabel in track_labels: + default_tracks_on.append(tlabel) + + track_num += 1 + + default_loc = assembly.find("defaultLocation").text + + 
+        jc.default_views[genome['uniq_id']] = jc.add_default_view_genome(genome, default_loc, default_tracks_on)
+
+    if jc.use_synteny_viewer:
+        synteny_view = jc.add_default_view_synteny(list(jc.default_views.values()), jc.synteny_tracks)
+
+        views_for_session = jc._load_old_synteny_views()
+
+        views_for_session.append(synteny_view)
+    else:
+        old_views = jc._load_old_genome_views()
+
+        for old_view in old_views:
+            if old_view not in jc.default_views:
+                jc.default_views[old_view] = old_views[old_view]
+
+        views_for_session = list(jc.default_views.values())
+
    general_data = {
-        "analytics": root.find("metadata/general/analytics").text,
-        "primary_color": root.find("metadata/general/primary_color").text,
-        "secondary_color": root.find("metadata/general/secondary_color").text,
-        "tertiary_color": root.find("metadata/general/tertiary_color").text,
-        "quaternary_color": root.find("metadata/general/quaternary_color").text,
-        "font_size": root.find("metadata/general/font_size").text,
+        "analytics": real_root.find("metadata/general/analytics").text,
+        "primary_color": real_root.find("metadata/general/primary_color").text,
+        "secondary_color": real_root.find("metadata/general/secondary_color").text,
+        "tertiary_color": real_root.find("metadata/general/tertiary_color").text,
+        "quaternary_color": real_root.find("metadata/general/quaternary_color").text,
+        "font_size": real_root.find("metadata/general/font_size").text,
    }
+
+    jc.add_default_session(views_for_session)
    jc.add_general_configuration(general_data)
-    jc.add_default_session(default_session_data)
-    trackconf = jc.config_json.get("tracks", [])
-    for gnome in jc.genome_names:
-        gtracks = jc.tracksToAdd[gnome]
-        if len(gtracks) > 0:
-            logging.debug(
-                "for genome %s adding gtracks %s"
-                % (gnome, json.dumps(gtracks, indent=2))
-            )
-            trackconf += gtracks
-    jc.config_json["tracks"] = trackconf
-    assconf = jc.config_json.get("assemblies", [])
-    assconf += jc.assemblies
-    jc.config_json["assemblies"] = assconf
-    logging.debug(
-        "assmeta=%s, first_contigs=%s, assemblies=%s, gnames=%s, trackidlist=%s, tracks=%s"
-        % (
-            jc.assmeta,
-            jc.ass_first_contigs,
-            json.dumps(assconf, indent=2),
-            jc.genome_names,
-            jc.trackIdlist,
-            json.dumps(trackconf, indent=2),
-        )
-    )
-    jc.write_config()
-    # note that this can be left in the config.json but has NO EFFECT if add_defsess_to_index is called.
-    # jc.add_defsess_to_index(default_session_data)
-    # this command line tool appears currently broken - or at least not working here.
-    # jc.text_index()
+    jc.add_plugins(jc.plugins)
+    jc.text_index()
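Editor's note: two mechanisms introduced by this revision are easiest to see with small standalone sketches. These are not part of the diff; every concrete value below is invented for illustration.

Track labels: the old global trackI counter is replaced by a deterministic md5 over the dataset path, the unsanitized human label, the raw track['category'] (taken before the __pd__date__pd__ date substitution, so reruns on different dates hash identically), any non-file URL, and the parent assembly's uniq_id, suffixed with the track and file indices. A minimal sketch, assuming only the Python standard library:

    import hashlib

    # Invented stand-ins for the variables used in process_annotations() above.
    dataset_path = "/galaxy/files/dataset_42.gff3"  # str(dataset_path); a list for multitrack data
    track_human_label = "My annotations"
    raw_category = "Annotations"                    # track["category"], pre date substitution
    non_file_info = ""                              # options/url for non-file tracks, else ""
    parent_uniq_id = "genome_abc123"                # parent["uniq_id"] of the assembly
    track_num, i = 0, 0                             # track index, file index within the track

    hashData = "|".join(
        [dataset_path, track_human_label, raw_category, non_file_info, parent_uniq_id]
    ).encode("utf-8")
    label = hashlib.md5(hashData).hexdigest() + f"_{track_num}_{i}"
    # Identical inputs always yield the identical trackId, so rerun outputs diff cleanly.

Default sessions: instead of a fully materialised defaultSession block, add_default_view_genome() now returns a small per-assembly spec and add_default_session() merges it into config.json, relying on JBrowse2 (GMOD/jbrowse-components PR 4907) to expand it at load time; this is also why the removed code's bpPerPx guesswork is no longer needed. For a single assembly, the merged object looks roughly like this (values invented):

    session_spec = {
        "name": "genome_abc123",
        "views": [
            {
                "type": "LinearGenomeView",
                "init": {
                    "assembly": "genome_abc123",
                    "loc": "chr1:1000-2000",       # from <defaultLocation>, else a bare refName
                    "tracks": ["<md5label>_0_0"],  # labels of visibility=default_on tracks
                },
            }
        ],
    }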