# Mercurial > repos > fubar > jbrowse2

#!/usr/bin/env python
# change to accumulating all configuration for config.json based on the default from the clone
import argparse
import binascii
import datetime
import json
import logging
import os
import re
import shlex
import shutil
import struct
import subprocess
import tempfile
import xml.etree.ElementTree as ET
from collections import defaultdict

# Configure root logging at INFO and create the logger used throughout this file.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")

JB2VER = "v2.10.2"
# version pinned for cloning
# NOTE(review): JB2REL below carries the same comment but a different version
# (v2.10.1) - confirm which constant the clone step actually uses.

# Date stamp (YYYY-MM-DD) captured once, at import time.
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
# Placeholder at import time; it is read by metadata_from_node and by
# JbrowseConnector.__init__.
GALAXY_INFRASTRUCTURE_URL = None
JB2REL = "v2.10.1"
# version pinned for cloning

# Literal character -> "__xx__" token table.
# NOTE(review): presumably these are Galaxy's parameter-sanitisation tokens and
# the table is used to map tokens back to characters - verify against the caller.
# NOTE(review): the empty-string key means "__cn__" would map back to nothing;
# confirm that stripping it is the intent.
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    "": "__cn__",
}


class ColorScaling(object):
    """Build colour/style configuration (including JavaScript colour callbacks) for tracks.

    Colours are either taken from the track definition or handed out
    round-robin from ``BREWER_COLOUR_SCHEMES``; the position in that palette
    is tracked per instance in ``self.brewer_colour_idx``.
    """

    # JavaScript callback template for a fixed RGB colour whose opacity is
    # computed from a score expression. Doubled braces are literal braces
    # after str.format().
    COLOR_FUNCTION_TEMPLATE = """
    function(feature, variableName, glyphObject, track) {{
        var score = {score};
        {opacity}
        return 'rgba({red}, {green}, {blue}, ' + opacity + ')';
    }}
    """

    # JavaScript callback template that looks up 'color' and 'score' on the
    # feature, its ancestors or its descendants before falling back to the
    # user-specified / auto-generated colour.
    COLOR_FUNCTION_TEMPLATE_QUAL = r"""
    function(feature, variableName, glyphObject, track) {{
        var search_up = function self(sf, attr){{
            if(sf.get(attr) !== undefined){{
                return sf.get(attr);
            }}
            if(sf.parent() === undefined) {{
                return;
            }}else{{
                return self(sf.parent(), attr);
            }}
        }};

        var search_down = function self(sf, attr){{
            if(sf.get(attr) !== undefined){{
                return sf.get(attr);
            }}
            if(sf.children() === undefined) {{
                return;
            }}else{{
                var kids = sf.children();
                for(var child_idx in kids){{
                    var x = self(kids[child_idx], attr);
                    if(x !== undefined){{
                        return x;
                    }}
                }}
                return;
            }}
        }};

        var color = ({user_spec_color} || search_up(feature, 'color') || search_down(feature, 'color') || {auto_gen_color});
        var score = (search_up(feature, 'score') || search_down(feature, 'score'));
        {opacity}
        if(score === undefined){{ opacity = 1; }}
        var result = /^#?([a-f\d]{{2}})([a-f\d]{{2}})([a-f\d]{{2}})$/i.exec(color);
        var red = parseInt(result[1], 16);
        var green = parseInt(result[2], 16);
        var blue = parseInt(result[3], 16);
        if(isNaN(opacity) || opacity < 0){{ opacity = 0; }}
        return 'rgba(' + red + ',' + green + ',' + blue + ',' + opacity + ')';
    }}
    """

    # JavaScript snippets computing ``opacity`` from ``score``; the "linear"
    # and "logarithmic" variants take {min}/{max} via str.format().
    OPACITY_MATH = {
        "linear": """
            var opacity = (score - ({min})) / (({max}) - ({min}));
        """,
        "logarithmic": """
            var opacity = Math.log10(score - ({min})) / Math.log10(({max}) - ({min}));
        """,
        "blast": """
            var opacity = 0;
            if(score == 0.0) {{
                opacity = 1;
            }} else {{
                opacity = (20 - Math.log10(score)) / 180;
            }}
        """,
    }

    BREWER_COLOUR_IDX = 0
    # RGB triples handed out in order by _get_colours().
    BREWER_COLOUR_SCHEMES = [
        (166, 206, 227),
        (31, 120, 180),
        (178, 223, 138),
        (51, 160, 44),
        (251, 154, 153),
        (227, 26, 28),
        (253, 191, 111),
        (255, 127, 0),
        (202, 178, 214),
        (106, 61, 154),
        (255, 255, 153),
        (177, 89, 40),
        (228, 26, 28),
        (55, 126, 184),
        (77, 175, 74),
        (152, 78, 163),
        (255, 127, 0),
    ]

    # Named pairs of hex colours (the two ends of each diverging palette).
    BREWER_DIVERGING_PALLETES = {
        "BrBg": ("#543005", "#003c30"),
        "PiYg": ("#8e0152", "#276419"),
        "PRGn": ("#40004b", "#00441b"),
        "PuOr": ("#7f3b08", "#2d004b"),
        "RdBu": ("#67001f", "#053061"),
        "RdGy": ("#67001f", "#1a1a1a"),
        "RdYlBu": ("#a50026", "#313695"),
        "RdYlGn": ("#a50026", "#006837"),
        "Spectral": ("#9e0142", "#5e4fa2"),
    }

    def __init__(self):
        self.brewer_colour_idx = 0

    def rgb_from_hex(self, hexstr):
        """Return an ``(r, g, b)`` tuple for a 6-digit hex string without the leading '#'."""
        # http://stackoverflow.com/questions/4296249/how-do-i-convert-a-hex-triplet-to-an-rgb-tuple-and-back
        return struct.unpack("BBB", binascii.unhexlify(hexstr))

    def min_max_gff(self, gff_file):
        """Return ``(min, max)`` of the numeric values in column 6 of a tab-separated file.

        Lines with fewer than six columns or a non-numeric sixth column are
        skipped. Returns ``(None, None)`` when no line yields a number.
        """
        min_val = None
        max_val = None
        with open(gff_file, "r") as handle:
            for line in handle:
                try:
                    value = float(line.split("\t")[5])
                except (IndexError, ValueError):
                    # Short line or non-numeric score column: nothing to record.
                    continue
                # Compare against None explicitly: a score of 0 is a valid
                # running min/max and must not be treated as "unset".
                if min_val is None or value < min_val:
                    min_val = value
                if max_val is None or value > max_val:
                    max_val = value
        return min_val, max_val

    def hex_from_rgb(self, r, g, b):
        """Return the ``#rrggbb`` string for three integer colour components."""
        return "#%02x%02x%02x" % (r, g, b)

    def _get_colours(self):
        """Return the next palette colour as ``(r, g, b)``, wrapping around at the end."""
        r, g, b = self.BREWER_COLOUR_SCHEMES[
            self.brewer_colour_idx % len(self.BREWER_COLOUR_SCHEMES)
        ]
        self.brewer_colour_idx += 1
        return r, g, b

    def parse_menus(self, track):
        """Return ``{"menuTemplate": [...]}`` built from ``track["menus"]["menu"]``.

        The list always starts with four empty entries; one entry is appended
        per menu definition (a single dict or a list of dicts). ``url``,
        ``content`` and ``title`` are copied only when present.
        """
        trackConfig = {"menuTemplate": [{}, {}, {}, {}]}

        if "menu" in track["menus"]:
            menu_list = track["menus"]["menu"]
            if not isinstance(menu_list, list):
                menu_list = [menu_list]

            for m in menu_list:
                tpl = {
                    "action": m["action"],
                    "label": m.get("label", "{name}"),
                    "iconClass": m.get("iconClass", "dijitIconBookmark"),
                }
                for optional_key in ("url", "content", "title"):
                    if optional_key in m:
                        tpl[optional_key] = m[optional_key]

                trackConfig["menuTemplate"].append(tpl)

        return trackConfig

    def parse_colours(self, track, trackFormat, gff3=None):
        """Return a ``{"style": {...}}`` config (plus ``bicolor_pivot`` for wiggle tracks).

        :param track: track definition dict; ``track["wiggle"]`` is read for
            wiggle tracks, ``track["scaling"]`` otherwise (when present).
        :param trackFormat: "wiggle", "blast" or "gene_calls" select specific
            handling; other values only honour the "ignore" scaling method.
        :param gff3: path scanned for min/max scores when gene_calls scaling
            is automatic.
        """
        # Wiggle tracks have a bicolor pallete
        trackConfig = {"style": {}}
        if trackFormat == "wiggle":

            trackConfig["style"]["pos_color"] = track["wiggle"]["color_pos"]
            trackConfig["style"]["neg_color"] = track["wiggle"]["color_neg"]

            if trackConfig["style"]["pos_color"] == "__auto__":
                # Both colours are replaced from the palette (negative first).
                trackConfig["style"]["neg_color"] = self.hex_from_rgb(
                    *self._get_colours()
                )
                trackConfig["style"]["pos_color"] = self.hex_from_rgb(
                    *self._get_colours()
                )

            # Wiggle tracks can change colour at a specified place
            bc_pivot = track["wiggle"]["bicolor_pivot"]
            if bc_pivot not in ("mean", "zero"):
                # The values are either one of those two strings
                # or a number
                bc_pivot = float(bc_pivot)
            trackConfig["bicolor_pivot"] = bc_pivot
        elif "scaling" in track:
            if track["scaling"]["method"] == "ignore":
                if track["scaling"]["scheme"]["color"] != "__auto__":
                    trackConfig["style"]["color"] = track["scaling"]["scheme"]["color"]
                else:
                    trackConfig["style"]["color"] = self.hex_from_rgb(
                        *self._get_colours()
                    )
            else:
                # Scored method
                algo = track["scaling"]["algo"]
                # linear, logarithmic, blast
                scales = track["scaling"]["scales"]
                # type __auto__, manual (min, max)
                scheme = track["scaling"]["scheme"]
                # scheme -> (type (opacity), color)
                # ==================================
                # GENE CALLS OR BLAST
                # ==================================
                if trackFormat == "blast":
                    red, green, blue = self._get_colours()
                    color_function = self.COLOR_FUNCTION_TEMPLATE.format(
                        score="feature._parent.get('score')",
                        opacity=self.OPACITY_MATH["blast"],
                        red=red,
                        green=green,
                        blue=blue,
                    )
                    trackConfig["style"]["color"] = color_function.replace("\n", "")
                elif trackFormat == "gene_calls":
                    # Default values, based on GFF3 spec
                    min_val = 0
                    max_val = 1000
                    # Get min/max and build a scoring function since JBrowse doesn't
                    if scales["type"] in ("automatic", "__auto__"):
                        min_val, max_val = self.min_max_gff(gff3)
                        if min_val is None or max_val is None:
                            # No numeric score found: keep the defaults rather
                            # than emitting the text "None" into the JavaScript.
                            min_val, max_val = 0, 1000
                    else:
                        min_val = scales.get("min", 0)
                        max_val = scales.get("max", 1000)

                    if scheme["color"].startswith("#"):
                        # Round-trip through RGB to normalise the hex string.
                        user_color = "'%s'" % self.hex_from_rgb(
                            *self.rgb_from_hex(scheme["color"][1:])
                        )
                        auto_color = "undefined"
                    else:
                        # "__auto__" or anything that is not a '#' colour.
                        user_color = "undefined"
                        auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())

                    color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format(
                        opacity=self.OPACITY_MATH[algo].format(
                            max=max_val, min=min_val
                        ),
                        user_spec_color=user_color,
                        auto_gen_color=auto_color,
                    )

                    trackConfig["style"]["color"] = color_function.replace("\n", "")
        return trackConfig


def etree_to_dict(t):
    """Convert an ElementTree element into nested dicts keyed by tag name.

    Attributes are stored under ``"@name"`` keys, element text under
    ``"#text"`` (or as the plain value when the element has neither children
    nor attributes); repeated child tags are collected into a list.
    ``None`` yields an empty dict.
    """
    if t is None:
        return {}

    tag = t.tag
    attrs = t.attrib
    kids = list(t)

    if kids:
        # Group converted children by tag so duplicates become lists.
        grouped = defaultdict(list)
        for kid in kids:
            for name, converted in etree_to_dict(kid).items():
                grouped[name].append(converted)
        node = {
            name: (vals[0] if len(vals) == 1 else vals)
            for name, vals in grouped.items()
        }
    elif attrs:
        node = {}
    else:
        node = None

    result = {tag: node}

    if attrs:
        for name, val in attrs.items():
            node["@" + name] = val

    if t.text:
        stripped = t.text.strip()
        if not (kids or attrs):
            # Leaf without attributes: the text is the value itself.
            result[tag] = stripped
        elif stripped:
            node["#text"] = stripped
    return result


# Directory containing this script (symlinks resolved); helper scripts such as
# convertMAF.sh and blastxml_to_gapped_gff3.py are looked up relative to it.
INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))


def metadata_from_node(node):
    """Flatten the attributes of a track's metadata XML node into one dict.

    Attributes of the ``dataset``, ``history``, ``metadata`` and ``tool``
    child elements are copied under ``dataset_*``, ``history_*``,
    ``metadata_*`` and ``tool_*`` keys. Some values are then rewritten as
    HTML links built on ``GALAXY_INFRASTRUCTURE_URL``.

    :param node: element exposing ``findall``; anything else yields ``{}``.
    :returns: the flattened dict, or ``{}`` unless exactly one ``dataset``
        child is present.
    :raises KeyError: if a ``metadata`` child with attributes is present but
        the dataset/history attributes needed for the links are missing.
    """
    metadata = {}
    try:
        if len(node.findall("dataset")) != 1:
            # exit early
            return metadata
    except Exception:
        return {}

    for key, value in node.findall("dataset")[0].attrib.items():
        metadata["dataset_%s" % key] = value

    history_nodes = node.findall("history")
    if history_nodes:
        for key, value in history_nodes[0].attrib.items():
            metadata["history_%s" % key] = value

    metadata_nodes = node.findall("metadata")
    if metadata_nodes:
        meta_attrs = metadata_nodes[0].attrib
        for key, value in meta_attrs.items():
            metadata["metadata_%s" % key] = value
        if meta_attrs:
            # Additional Mappings applied:
            # These must run exactly once. Running them per attribute wraps
            # the already-wrapped values in further anchors and overwrites
            # hist_name with markup.
            metadata[
                "dataset_edam_format"
            ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
                metadata["dataset_edam_format"], metadata["dataset_file_ext"]
            )
            metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
                metadata["history_user_email"]
            )
            # Keep the plain history name before it is replaced by a link.
            metadata["hist_name"] = metadata["history_display_name"]
            metadata[
                "history_display_name"
            ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
                galaxy=GALAXY_INFRASTRUCTURE_URL,
                encoded_hist_id=metadata["history_id"],
                hist_name=metadata["history_display_name"],
            )

    tool_nodes = node.findall("tool")
    if tool_nodes:
        for key, value in tool_nodes[0].attrib.items():
            metadata["tool_%s" % key] = value
        metadata[
            "tool_tool"
        ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}{tool_version}</a>'.format(
            galaxy=GALAXY_INFRASTRUCTURE_URL,
            encoded_id=metadata.get("dataset_id", ""),
            tool_id=metadata.get("tool_tool_id", ""),
            tool_version=metadata.get("tool_tool_version", ""),
        )
    return metadata


class JbrowseConnector(object):
    def __init__(self, outdir, genomes):
        """Create the output directory, reset all bookkeeping and call ``clone_jbrowse()``.

        :param outdir: directory that receives config.json and track files;
            created if missing.
        :param genomes: genome descriptors, stored as ``genome_paths`` for
            ``process_genomes``.
        """
        self.giURL = GALAXY_INFRASTRUCTURE_URL

        # Output location.
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)

        # Genome bookkeeping: no genome has been processed yet.
        self.genome_paths = genomes
        self.genome_name = None
        self.genome_names = []

        # Tracks collected so far.
        self.trackIdlist = []
        self.tracksToAdd = []

        # In-memory configuration and the file it is written to.
        self.config_json = {}
        self.config_json_file = os.path.join(outdir, "config.json")

        self.clone_jbrowse()

    def subprocess_check_call(self, command, output=None):
        """Run an argv-list ``command`` in the output directory.

        When ``output`` is truthy it is passed as the child's stdout.
        ``subprocess.check_call`` raises ``CalledProcessError`` on a non-zero
        exit status.
        """
        printable = " ".join(command)
        if not output:
            log.debug("cd %s && %s", self.outdir, printable)
            subprocess.check_call(command, cwd=self.outdir)
            return
        log.debug("cd %s && %s >  %s", self.outdir, printable, output)
        subprocess.check_call(command, cwd=self.outdir, stdout=output)

    def subprocess_popen(self, command):
        """Run a shell command string in the output directory.

        stdout and stderr are captured; on a non-zero exit status the command
        and both streams are logged and ``RuntimeError`` is raised.
        """
        log.debug(command)
        proc = subprocess.Popen(
            command,
            cwd=self.outdir,
            shell=True,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        captured_out, captured_err = proc.communicate()
        status = proc.returncode
        if status == 0:
            return
        # Failure: log what was run and everything the child printed.
        for detail in (command, captured_out, captured_err):
            log.error(detail)
        raise RuntimeError("Command failed with exit code %s" % (status))

    def subprocess_check_output(self, command):
        """Run an argv-list ``command`` in the output directory and return its stdout."""
        printable = " ".join(command)
        log.debug(printable)
        captured = subprocess.check_output(command, cwd=self.outdir)
        return captured

    def symlink_or_copy(self, src, dest):
        if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool(
            os.environ["GALAXY_JBROWSE_SYMLINKS"]
        ):
            cmd = ["ln", "-s", src, dest]
        else:
            cmd = ["cp", src, dest]

        return self.subprocess_check_call(cmd)

    def _prepare_track_style(self, trackDict):

        style_data = {
            "type": "LinearBasicDisplay",
            "displayId": "%s-LinearBasicDisplay" % trackDict["trackId"],
        }

        if trackDict.get("displays", None):  # use first if multiple like bed
            style_data["type"] = trackDict["displays"][0]["type"]
            style_data["displayId"] = trackDict["displays"][0]["displayId"]
        return {
            "displays": [
                style_data,
            ]
        }

    def process_genomes(self):
        assemblies = []
        for i, genome_node in enumerate(self.genome_paths):
            genome_name = genome_node["meta"]["dataset_dname"].strip()
            if len(genome_name.split()) > 1:
                genome_name = genome_name.split()[0]
                # spaces and cruft break scripts when substituted
            if genome_name not in self.genome_names:
                # ignore dupes - can have multiple pafs with same references?
                fapath = genome_node["path"]
                assem = self.make_assembly(fapath, genome_name)
                assemblies.append(assem)
                self.genome_names.append(genome_name)
                if self.genome_name is None:
                    self.genome_name = (
                        genome_name  # first one for all tracks - other than paf
                    )
                    self.genome_firstcontig = None
                    fl = open(fapath, "r").readline().strip().split(">")
                    if len(fl) > 1:
                        fl = fl[1]
                        if len(fl.split()) > 1:
                            self.genome_firstcontig = fl.split()[0].strip()
                        else:
                            self.genome_firstcontig = fl
        if self.config_json.get("assemblies", None):
            self.config_json["assemblies"] += assemblies
        else:
            self.config_json["assemblies"] = assemblies

    def make_assembly(self, fapath, gname):

        faname = gname + ".fa.gz"
        fadest = os.path.join(self.outdir, faname)
        cmd = "bgzip -i -c %s -I %s.gzi > %s && samtools faidx %s" % (
            fapath,
            fadest,
            fadest,
            fadest,
        )
        self.subprocess_popen(cmd)
        adapter = {
            "type": "BgzipFastaAdapter",
            "fastaLocation": {
                "uri": faname,
            },
            "faiLocation": {
                "uri": faname + ".fai",
            },
            "gziLocation": {
                "uri": faname + ".gzi",
            },
        }
        self.genome_sequence_adapter = adapter
        trackDict = {
            "name": gname,
            "sequence": {
                "type": "ReferenceSequenceTrack",
                "trackId": gname,
                "adapter": adapter,
            },
            "rendering": {"type": "DivSequenceRenderer"},
        }
        return trackDict

    def add_default_view(self):
        cmd = [
            "jbrowse",
            "set-default-session",
            "-s",
            self.config_json_file,
            "-t",
            ",".join(self.trackIdlist),
            "-n",
            "JBrowse2 in Galaxy",
            "--target",
            self.config_json_file,
            "-v",
            " LinearGenomeView",
        ]
        self.subprocess_check_call(cmd)

    def write_config(self):
        with open(self.config_json_file, "w") as fp:
            json.dump(self.config_json, fp)

    def text_index(self):
        # Index tracks
        args = [
            "jbrowse",
            "text-index",
            "--target",
            os.path.join(self.outdir, "data"),
            "--assemblies",
            self.genome_name,
        ]

        tracks = ",".join(self.trackIdlist)
        if tracks:
            args += ["--tracks", tracks]

            self.subprocess_check_call(args)

    def add_hic(self, data, trackData):
        """
        HiC adapter.
        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
        for testing locally, these work:
        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
        using hg19 reference track as a
        'BgzipFastaAdapter'
            fastaLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
            faiLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
            gziLocation:
            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
        Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
        """
        tId = trackData["label"]
        # can be served - if public.
        # dsId = trackData["metadata"]["dataset_id"]
        # url = "%s/api/datasets/%s/display?to_ext=hic " % (self.giURL, dsId)
        hname = trackData["hic_url"]
        floc = {
            "uri": hname,
        }
        trackDict = {
            "type": "HicTrack",
            "trackId": tId,
            "name": hname,
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "HicAdapter",
                "hicLocation": floc,
            },
            "displays": [
                {
                    "type": "LinearHicDisplay",
                    "displayId": "%s-LinearHicDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_maf(self, data, trackData):
        """
        Convert a MAF file with convertMAF.sh and register it as a MafTrack.

        from https://github.com/cmdcolin/maf2bed
        Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name
        e.g. hg38.chr1 in the sequence identifiers.
        need the reference id - eg hg18, for maf2bed.pl as the first parameter

        Sample names are taken from the MAF "s" lines: the text before the
        first '.' of each distinct sequence identifier. The MafViewer plugin
        is added to the configuration if it is not already listed.
        """
        mafPlugin = {
            "plugins": [
                {
                    "name": "MafViewer",
                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
                }
            ]
        }
        tId = trackData["label"]
        fname = "%s.bed" % tId
        dest = "%s/%s" % (self.outdir, fname)
        gname = self.genome_name
        cmd = [
            "bash",
            os.path.join(INSTALLED_TO, "convertMAF.sh"),
            data,
            gname,
            INSTALLED_TO,
            dest,
        ]
        self.subprocess_check_call(cmd)
        # Construct samples list
        # We could get this from galaxy metadata, not sure how easily.
        ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
        try:
            output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
        finally:
            # Close our end of the pipe and reap grep even if sort fails.
            ps.stdout.close()
            ps.wait()
        outp = output.decode("ascii")
        soutp = outp.split("\n")
        samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
        samples = [x.split(".")[0] for x in samp]
        trackDict = {
            "type": "MafTrack",
            "trackId": tId,
            "name": trackData["name"],
            "adapter": {
                "type": "MafTabixAdapter",
                "samples": samples,
                "bedGzLocation": {
                    "uri": fname + ".sorted.bed.gz",
                },
                "index": {
                    "location": {
                        "uri": fname + ".sorted.bed.gz.tbi",
                    },
                },
            },
            "assemblyNames": [self.genome_name],
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {
                    "type": "LinearArcDisplay",
                    "displayId": "%s-LinearArcDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        existing_plugins = self.config_json.get("plugins", None)
        if existing_plugins:
            # mafPlugin is a dict, so the entry lives under its "plugins" key.
            # Skip it when an earlier MAF track already registered the plugin.
            plugin_entry = mafPlugin["plugins"][0]
            if plugin_entry not in existing_plugins:
                existing_plugins.append(plugin_entry)
        else:
            self.config_json.update(mafPlugin)

    def _blastxml_to_gff3(self, xml, min_gap=10):
        """Convert BLAST XML to GFF3 with blastxml_to_gapped_gff3.py.

        :param xml: path of the BLAST XML input.
        :param min_gap: value passed to the converter's ``--min_gap`` option.
        :returns: path of a temporary file holding the converter's output;
            the caller is responsible for deleting it.
        :raises subprocess.CalledProcessError: if the converter fails.
        """
        cmd = [
            "python",
            os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
            "--trim",
            "--trim_end",
            "--include_seq",
            "--min_gap",
            str(min_gap),
            xml,
        ]
        # delete=False keeps the file after the handle is closed; the context
        # manager guarantees the handle is closed even when the converter fails.
        with tempfile.NamedTemporaryFile(delete=False) as gff3_unrebased:
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
        return gff3_unrebased.name

    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = "%s/%s" % (self.outdir, url)
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {
                    "type": "LinearArcDisplay",
                    "displayId": "%s-LinearArcDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)
        os.unlink(gff3)

    def add_bigwig(self, data, trackData):
        """ "type": "LinearWiggleDisplay",
        "configuration": {},
        "selectedRendering": "",
        "resolution": 1,
        "posColor": "rgb(228, 26, 28)",
        "negColor": "rgb(255, 255, 51)",
        "constraints": {}
        """
        url = "%s.bigwig" % trackData["label"]
        # slashes in names cause path trouble
        dest = os.path.join(self.outdir, url)
        cmd = ["cp", data, dest]
        self.subprocess_check_call(cmd)
        bwloc = {"uri": url}
        tId = trackData["label"]
        trackDict = {
            "type": "QuantitativeTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [
                self.genome_name,
            ],
            "adapter": {
                "type": "BigWigAdapter",
                "bigWigLocation": bwloc,
            },
            "displays": [
                {
                    "type": "LinearWiggleDisplay",
                    "displayId": "%s-LinearWiggleDisplay" % tId,
                }
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
        tId = trackData["label"]
        fname = "%s.bam" % trackData["label"]
        dest = "%s/%s" % (self.outdir, fname)
        url = fname
        self.subprocess_check_call(["cp", data, dest])
        bloc = {"uri": url}
        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
            # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(bam_index), dest + ".bai"]
            )
        else:
            # Can happen in exotic condition
            # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
            #      => no index generated by galaxy, but there might be one next to the symlink target
            #      this trick allows to skip the bam sorting made by galaxy if already done outside
            if os.path.exists(os.path.realpath(data) + ".bai"):
                self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
            else:
                log.warn("Could not find a bam index (.bai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BamAdapter",
                "bamLocation": bloc,
                "index": {
                    "location": {
                        "uri": fname + ".bai",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearAlignmentsDisplay",
                    "displayId": "%s-LinearAlignmentsDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_cram(self, data, trackData, cramOpts, cram_index=None, **kwargs):
        tId = trackData["label"]
        fname = "%s.cram" % trackData["label"]
        dest = "%s/%s" % (self.outdir, fname)
        url = fname
        self.subprocess_check_call(["cp", data, dest])
        bloc = {"uri": url}
        if cram_index is not None and os.path.exists(os.path.realpath(cram_index)):
            # most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(cram_index), dest + ".crai"]
            )
        else:
            # Can happen in exotic condition
            # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
            #      => no index generated by galaxy, but there might be one next to the symlink target
            #      this trick allows to skip the bam sorting made by galaxy if already done outside
            if os.path.exists(os.path.realpath(data) + ".crai"):
                self.symlink_or_copy(os.path.realpath(data) + ".crai", dest + ".crai")
            else:
                log.warn("Could not find a cram index (.crai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "CramAdapter",
                "cramLocation": bloc,
                "craiLocation": {
                    "uri": fname + ".crai",
                },
                "sequenceAdapter": self.genome_sequence_adapter,
            },
            "displays": [
                {
                    "type": "LinearAlignmentsDisplay",
                    "displayId": "%s-LinearAlignmentsDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_vcf(self, data, trackData):
        tId = trackData["label"]
        # url = "%s/api/datasets/%s/display" % (
        # self.giURL,
        # trackData["metadata"]["dataset_id"],
        # )
        url = "%s.vcf.gz" % tId
        dest = "%s/%s" % (self.outdir, url)
        cmd = "bgzip -c %s  > %s" % (data, dest)
        self.subprocess_popen(cmd)
        cmd = ["tabix", "-f", "-p", "vcf", dest]
        self.subprocess_check_call(cmd)
        trackDict = {
            "type": "VariantTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "VcfTabixAdapter",
                "vcfGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearVariantDisplay",
                    "displayId": "%s-LinearVariantDisplay" % tId,
                },
                {
                    "type": "ChordVariantDisplay",
                    "displayId": "%s-ChordVariantDisplay" % tId,
                },
                {
                    "type": "LinearPairedArcDisplay",
                    "displayId": "%s-LinearPairedArcDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def _sort_gff(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest + ".gz"):
            cmd = "jbrowse sort-gff '%s' | bgzip -c > '%s.gz'" % (
                data,
                dest,
            )  # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
            self.subprocess_popen(cmd)
            self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])

    def _sort_bed(self, data, dest):
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = "sort -k1,1 -k2,2n '%s' | bgzip -c > '%s'" % (data, dest)
            self.subprocess_popen(cmd)
            cmd = ["tabix", "-f", "-p", "bed", dest]
            self.subprocess_check_call(cmd)

    def add_gff(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = "%s/%s" % (self.outdir, url)
        self._sort_gff(data, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {
                    "type": "LinearArcDisplay",
                    "displayId": "%s-LinearArcDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_bed(self, data, ext, trackData):
        url = "%s.%s" % (trackData["label"], ext)
        dest = "%s/%s.gz" % (self.outdir, url)
        self._sort_bed(data, dest)
        tId = trackData["label"]
        url = url + ".gz"
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BedTabixAdapter",
                "bedGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {
                    "type": "LinearPileupDisplay",
                    "displayId": "%s-LinearPileupDisplay" % tId,
                },
                {
                    "type": "LinearArcDisplay",
                    "displayId": "%s-LinearArcDisplay" % tId,
                },
            ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def add_paf(self, data, trackData, pafOpts, **kwargs):
        tname = trackData["name"]
        tId = trackData["label"]
        pgnames = [x.strip() for x in pafOpts["genome_label"].split(",")]
        pgpaths = [x.strip() for x in pafOpts["genome"].split(",")]
        passnames = [self.genome_name]  # always first
        for i, gname in enumerate(pgnames):
            if len(gname.split()) > 1:
                gname = gname.split()[0]
                # trouble from spacey names in command lines avoidance
                if gname not in self.genome_names:
                    passnames.append(gname)
                    # ignore if already there - eg for duplicates among pafs.
                    asstrack = self.make_assembly(pgpaths[i], gname)
                    self.genome_names.append(gname)
                    if self.config_json.get("assemblies", None):
                        self.config_json["assemblies"].append(asstrack)
                    else:
                        self.config_json["assemblies"] = [
                            asstrack,
                        ]
        url = "%s.paf" % (trackData["label"])
        dest = "%s/%s" % (self.outdir, url)
        self.symlink_or_copy(os.path.realpath(data), dest)
        trackDict = {
            "type": "SyntenyTrack",
            "trackId": tId,
            "assemblyNames": passnames,
            "name": tname,
            "adapter": {
                "type": "PAFAdapter",
                "pafLocation": {"uri": url},
                "assemblyNames": passnames,
            },
            # "displays": [
            # {
            # "type": "LinearSyntenyDisplay",
            # "displayId": "%s-LinearSyntenyDisplay" % tId,
            # },
            # {
            # "type": "DotPlotDisplay",
            # "displayId": "%s-DotPlotDisplay" % tId,
            # },
            # ],
        }
        style_json = self._prepare_track_style(trackDict)
        trackDict["style"] = style_json
        self.tracksToAdd.append(trackDict)
        self.trackIdlist.append(tId)

    def process_annotations(self, track):
        """Register every file of one track definition and yield the track ids.

        ``track`` is the dict built by the command line entry point. This
        method reads ``track["category"]``, ``track["trackfiles"]`` (an
        iterable of ``(path, ext, label, metadata)`` tuples), the optional
        ``track["trackset"]`` and, for bam/cram/blastxml/paf files, the
        matching entry of ``track["conf"]["options"]``.

        This is a generator: nothing is processed until it is iterated. One
        label is yielded per track file, including files whose extension is
        not supported (those are only reported with a warning).
        """
        category = track["category"].replace("__pd__date__pd__", TODAY)
        for i, (
            dataset_path,
            dataset_ext,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)
            track_human_label = track_human_label.replace(" ", "_")
            outputTrackConfig = {
                "category": category,
                "style": {},
            }

            outputTrackConfig["key"] = track_human_label

            outputTrackConfig["trackset"] = track.get("trackset", {})
            # extension + position + label gives one id per file of the track
            outputTrackConfig["label"] = "%s_%i_%s" % (
                dataset_ext,
                i,
                track_human_label,
            )
            outputTrackConfig["metadata"] = extra_metadata
            outputTrackConfig["name"] = track_human_label

            if dataset_ext in ("gff", "gff3"):
                self.add_gff(dataset_path, dataset_ext, outputTrackConfig)
            elif dataset_ext in ("hic", "juicebox_hic"):
                self.add_hic(dataset_path, outputTrackConfig)
            elif dataset_ext in ("cool", "mcool", "scool"):
                # Cooler files are first converted to .hic with hictk, then
                # handled like any other hic file.
                hic_url = "%s_%d.juicebox_hic" % (track_human_label, i)
                hic_path = os.path.join(self.outdir, hic_url)
                self.subprocess_check_call(
                    [
                        "hictk",
                        "convert",
                        "-f",
                        "--output-fmt",
                        "hic",
                        dataset_path,
                        hic_path,
                    ]
                )
                outputTrackConfig["hic_url"] = hic_url
                self.add_hic(hic_path, outputTrackConfig)
            elif dataset_ext in ("bed",):
                self.add_bed(dataset_path, dataset_ext, outputTrackConfig)
            elif dataset_ext in ("maf",):
                self.add_maf(dataset_path, outputTrackConfig)
            elif dataset_ext == "bigwig":
                self.add_bigwig(dataset_path, outputTrackConfig)
            elif dataset_ext == "bam":
                real_indexes = track["conf"]["options"]["bam"]["bam_indices"][
                    "bam_index"
                ]
                # a single index is not wrapped in a list
                if not isinstance(real_indexes, list):
                    real_indexes = [real_indexes]

                self.add_bam(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["bam"],
                    bam_index=real_indexes[i],
                )
            elif dataset_ext == "cram":
                real_indexes = track["conf"]["options"]["cram"]["cram_indices"][
                    "cram_index"
                ]
                # a single index is not wrapped in a list
                if not isinstance(real_indexes, list):
                    real_indexes = [real_indexes]

                self.add_cram(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["cram"],
                    cram_index=real_indexes[i],
                )
            elif dataset_ext == "blastxml":
                self.add_blastxml(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["blast"],
                )
            elif dataset_ext == "vcf":
                self.add_vcf(dataset_path, outputTrackConfig)
            elif dataset_ext == "paf":
                self.add_paf(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["paf"],
                )
            else:
                # warn() is a deprecated alias of warning(); use the module logger
                log.warning("Do not know how to handle %s", dataset_ext)
            # Return non-human label for use in other fields
            yield outputTrackConfig["label"]

    def add_default_session(self, data):
        """
        Add some default session settings: set some assemblies/tracks on/off

        Builds one ``LinearGenomeView`` containing every track of
        ``self.tracksToAdd`` whose id is listed in
        ``data["visibility"]["default_on"]``, appends it to the
        ``defaultSession`` of the configuration and rewrites
        ``self.config_json_file``.

        :param data: session settings; reads ``visibility.default_on``, the
            optional ``defaultLocation`` (``contig:start..end``, commas allowed
            in the numbers) and the optional ``session_name``.
        """
        tracks_data = []

        # TODO using the default session for now, but check out session specs in the future https://github.com/GMOD/jbrowse-components/issues/2708

        # Start from the config.json generated just before, overlaid with the
        # configuration accumulated in memory.
        with open(self.config_json_file, "r") as config_file:
            config_json = json.load(config_file)
        if self.config_json:
            config_json.update(self.config_json)

        for track_conf in self.tracksToAdd:
            tId = track_conf["trackId"]
            if tId not in data["visibility"]["default_on"]:
                continue
            # Use the first configured display; fall back to the basic one
            # when the track declares none.
            style_data = {"type": "LinearBasicDisplay"}
            if track_conf.get("displays"):
                style_data["type"] = track_conf["displays"][0]["type"]
            if track_conf.get("style_labels", None):
                # TODO fix this: it should probably go in a renderer block (SvgFeatureRenderer) but still does not work
                # TODO move this to per track displays?
                style_data["labels"] = track_conf["style_labels"]
            tracks_data.append(
                {
                    "type": track_conf["type"],
                    "configuration": tId,
                    "displays": [style_data],
                }
            )

        # The view for the assembly we're adding
        view_json = {"type": "LinearGenomeView", "tracks": tracks_data}

        drdict = {
            "reversed": False,
            "assemblyName": self.genome_name,
            "start": 0,
            "end": 100000,
        }

        ddl = data.get("defaultLocation", "")
        if ddl:
            loc_match = re.search(r"^([^:]+):([\d,]*)\.*([\d,]*)$", ddl)
            # allow commas like 100,000 but ignore as integer
            if loc_match:
                drdict["refName"] = loc_match.group(1)
                if loc_match.group(2):
                    drdict["start"] = int(loc_match.group(2).replace(",", ""))
                if loc_match.group(3):
                    drdict["end"] = int(loc_match.group(3).replace(",", ""))
            else:
                logging.warning(
                    "@@@ regexp could not match contig:start..end in the supplied location %s - please fix",
                    ddl,
                )
        else:
            drdict["refName"] = self.genome_firstcontig
        if drdict.get("refName", None):
            # TODO displayedRegions is not just zooming to the region, it hides the rest of the chromosome
            view_json["displayedRegions"] = [
                drdict,
            ]

            logging.info("@@@ defaultlocation %s for default session", drdict)
        else:
            logging.warning(
                "@@@ no contig name found for default session - please add one!"
            )
        # The name may be None when the XML element was empty, so fall back
        # to the default for any falsy value.
        session_name = data.get("session_name") or "New session"
        # Unsanitize the name
        for key, value in mapped_chars.items():
            session_name = session_name.replace(value, key)
        # Merge with possibly existing defaultSession (if upgrading a jbrowse instance)
        session_json = config_json.get("defaultSession", {})

        session_json["name"] = session_name

        if "views" not in session_json:
            session_json["views"] = []

        session_json["views"].append(view_json)

        config_json["defaultSession"] = session_json
        self.config_json.update(config_json)

        with open(self.config_json_file, "w") as config_file:
            json.dump(self.config_json, config_file, indent=2)

    def add_general_configuration(self, data):
        """
        Add some general configuration to the config.json file
        """

        config_path = self.config_json_file
        if os.path.exists(config_path):
            with open(config_path, "r") as config_file:
                config_json = json.load(config_file)
        else:
            config_json = {}
        if self.config_json:
            config_json.update(self.config_json)
        config_data = {}

        config_data["disableAnalytics"] = data.get("analytics", "false") == "true"

        config_data["theme"] = {
            "palette": {
                "primary": {"main": data.get("primary_color", "#0D233F")},
                "secondary": {"main": data.get("secondary_color", "#721E63")},
                "tertiary": {"main": data.get("tertiary_color", "#135560")},
                "quaternary": {"main": data.get("quaternary_color", "#FFB11D")},
            },
            "typography": {"fontSize": int(data.get("font_size", 10))},
        }
        if not config_json.get("configuration", None):
            config_json["configuration"] = {}
        config_json["configuration"].update(config_data)
        self.config_json.update(config_json)
        with open(config_path, "w") as config_file:
            json.dump(self.config_json, config_file, indent=2)

    def clone_jbrowse(self):
        """Clone a JBrowse directory into a destination directory.

        Empties ``self.outdir``, runs ``jbrowse create`` for the pinned
        ``JB2VER`` release, removes files of the release that are not wanted
        in the output, and copies ``jb2_webserver.py`` from ``INSTALLED_TO``.
        """
        dest = self.outdir
        # Empty the destination directory in Python. A "dest/*" glob is only
        # expanded by a shell, so passing it to rm as a list argument would
        # match nothing.
        # NOTE(review): assumes subprocess_check_call does not go through a
        # shell - confirm against its definition.
        if os.path.isdir(dest):
            for entry in os.listdir(dest):
                entry_path = os.path.join(dest, entry)
                if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                    shutil.rmtree(entry_path)
                else:
                    os.remove(entry_path)
        cmd = ["jbrowse", "create", dest, "-t", JB2VER, "-f"]
        self.subprocess_check_call(cmd)
        # Remove parts of the release that are not wanted in the output
        for fn in [
            "asset-manifest.json",
            "favicon.ico",
            "robots.txt",
            "umd_plugin.js",
            "version.txt",
            "test_data",
        ]:
            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
            self.subprocess_check_call(cmd)
        cmd = [
            "cp",
            os.path.join(INSTALLED_TO, "jb2_webserver.py"),
            self.outdir,
        ]
        self.subprocess_check_call(cmd)


def parse_style_conf(item):
    """Convert the text of a style XML element according to its ``type``.

    ``type="boolean"`` gives True for the texts "yes", "true" and "True" and
    False otherwise; ``type="integer"`` gives ``int(text)``; any other or
    missing type returns the text unchanged.
    """
    declared = item.attrib.get("type")
    if declared == "boolean":
        return item.text in ("yes", "true", "True")
    if declared == "integer":
        return int(item.text)
    return item.text


if __name__ == "__main__":
    # Command line entry point: read the track configuration XML, build the
    # JBrowse2 instance in --outdir and write its config.json.
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument("--xml", help="Track Configuration")
    parser.add_argument("--outdir", help="Output directory", default="out")
    parser.add_argument("--version", "-V", action="version", version="%(prog)s 2.0.1")
    args = parser.parse_args()
    tree = ET.parse(args.xml)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL
    jc = JbrowseConnector(
        outdir=args.outdir,
        genomes=[
            {
                "path": os.path.realpath(x.attrib["path"]),
                "meta": metadata_from_node(x.find("metadata")),
            }
            for x in root.findall("metadata/genomes/genome")
        ],
    )
    jc.process_genomes()

    # .add_default_view() replace from https://github.com/abretaud/tools-iuc/blob/jbrowse2/tools/jbrowse2/jbrowse2.py
    default_session_data = {
        "visibility": {
            "default_on": [],
            "default_off": [],
        },
        "style": {},
        "style_labels": {},
    }

    for track in root.findall("tracks/track"):
        track_conf = {"trackfiles": []}

        # NOTE(review): this reproduces the original truth test on the
        # element, which is only true for an element WITH children. For a
        # leaf <multibigwig> node the flag therefore stays False and the files
        # are added as individual bigwig tracks. Kept on purpose:
        # process_annotations() has no handler for "bigwig_multiple", so
        # switching to a plain `is not None` test would drop these tracks.
        mbw_node = track.find("options/wiggle/multibigwig")
        is_multi_bigwig = (
            mbw_node is not None and len(mbw_node) > 0 and mbw_node.text == "True"
        )
        multi_bigwig_paths = []

        for x in track.findall("files/trackFile"):
            if is_multi_bigwig:
                multi_bigwig_paths.append(
                    (
                        x.attrib["label"],
                        os.path.realpath(x.attrib["path"]),
                    )
                )
            else:
                metadata = metadata_from_node(x.find("metadata"))
                track_conf["dataset_id"] = metadata["dataset_id"]
                track_conf["trackfiles"].append(
                    (
                        os.path.realpath(x.attrib["path"]),
                        x.attrib["ext"],
                        x.attrib["label"],
                        metadata,
                    )
                )

        if is_multi_bigwig:
            track_conf["trackfiles"].append(
                (
                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
                    "bigwig_multiple",
                    "MultiBigWig",  # Giving an hardcoded name for now
                    {},  # No metadata for multiple bigwig
                )
            )
        track_conf["category"] = track.attrib["cat"]
        track_conf["format"] = track.attrib["format"]

        # Only pertains to gff3 + blastxml. TODO?
        # Style values are kept as raw text: in the original code this
        # assignment overwrote an earlier typed version of the same dict.
        style_node = track.find("options/style")
        if style_node is None:
            track_conf["style"] = {}
        else:
            track_conf["style"] = {t.tag: t.text for t in style_node}

        # Compare with None: testing an element's truth value is deprecated.
        labels_node = track.find("options/style_labels")
        if labels_node is not None:
            track_conf["style_labels"] = {
                item.tag: parse_style_conf(item) for item in labels_node
            }

        track_conf["conf"] = etree_to_dict(track.find("options"))

        # process_annotations is a generator; iterating it registers the tracks
        visibility = track.attrib.get("visibility", "default_off")
        for key in jc.process_annotations(track_conf):
            default_session_data["visibility"][visibility].append(key)
            if track_conf.get("style", None):
                default_session_data["style"][key] = track_conf[
                    "style"
                ]  # TODO do we need this anymore?
            if track_conf.get("style_labels", None):
                default_session_data["style_labels"][key] = track_conf["style_labels"]
    default_session_data["defaultLocation"] = root.find(
        "metadata/general/defaultLocation"
    ).text
    default_session_data["session_name"] = root.find(
        "metadata/general/session_name"
    ).text
    jc.zipOut = root.find("metadata/general/zipOut").text == "true"
    general_data = {
        "analytics": root.find("metadata/general/analytics").text,
        "primary_color": root.find("metadata/general/primary_color").text,
        "secondary_color": root.find("metadata/general/secondary_color").text,
        "tertiary_color": root.find("metadata/general/tertiary_color").text,
        "quaternary_color": root.find("metadata/general/quaternary_color").text,
        "font_size": root.find("metadata/general/font_size").text,
    }
    jc.add_general_configuration(general_data)
    # "tracks" is a list of track definitions, so new tracks are appended
    # with extend(); lists have no update() method.
    if jc.config_json.get("tracks", None):
        jc.config_json["tracks"].extend(jc.tracksToAdd)
    else:
        jc.config_json["tracks"] = jc.tracksToAdd
    jc.write_config()
    jc.add_default_session(default_session_data)
    # jc.text_index() not sure what broke here.
author	fubar
date	Thu, 29 Feb 2024 00:47:58 +0000
parents	15da358c3108
children	7adde511daa1