Mercurial > repos > fubar > jbrowse2dev

diff jbrowse2/jbrowse2_jbrowsenotjson.py @ 6:88b9b105c09b draft
Uploaded
author: fubar
date: Fri, 05 Jan 2024 01:58:02 +0000
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/jbrowse2/jbrowse2_jbrowsenotjson.py	Fri Jan 05 01:58:02 2024 +0000
@@ -0,0 +1,1034 @@
+#!/usr/bin/env python
+# change to accumulating all configuration for config.json based on the default from the clone
+import argparse
+import datetime
+import hashlib
+import json
+import logging
+import os
+import shutil
+import subprocess
+import tempfile
+import xml.etree.ElementTree as ET
+from collections import defaultdict
+
+# Log at INFO level; "log" is the logger used throughout this module.
+logging.basicConfig(level=logging.INFO)
+log = logging.getLogger("jbrowse")
+# Date stamp that process_annotations() substitutes for the
+# "__pd__date__pd__" placeholder in track categories.
+TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
+# Placeholder only: the __main__ section assigns the real URL read from the
+# track configuration XML.
+GALAXY_INFRASTRUCTURE_URL = None
+# Literal character -> escaped token as found in sanitized labels.
+# process_annotations() replaces every token (value) by its character (key);
+# the empty-string key means "__cn__" tokens are simply removed.
+mapped_chars = {
+    ">": "__gt__",
+    "<": "__lt__",
+    "'": "__sq__",
+    '"': "__dq__",
+    "[": "__ob__",
+    "]": "__cb__",
+    "{": "__oc__",
+    "}": "__cc__",
+    "@": "__at__",
+    "#": "__pd__",
+    "": "__cn__",
+}
+
+
+def etree_to_dict(t):
+    """
+    Recursively convert an ElementTree element into nested dicts.
+
+    The result has a single key, the element tag. Child elements are grouped
+    by tag (a tag occurring once maps to its value, a repeated tag to a list),
+    attributes are stored under "@"-prefixed keys and, when the element also
+    has children or attributes, non-blank text is stored under "#text".
+    A leaf without attributes maps straight to its stripped text (None when
+    it has no text at all). Passing None yields an empty dict.
+    """
+    if t is None:
+        return {}
+
+    tag = t.tag
+    attributes = t.attrib
+    child_nodes = list(t)
+    result = {tag: {} if attributes else None}
+    if child_nodes:
+        grouped = defaultdict(list)
+        for child in child_nodes:
+            for child_tag, child_value in etree_to_dict(child).items():
+                grouped[child_tag].append(child_value)
+        collapsed = {}
+        for child_tag, values in grouped.items():
+            # A tag seen once is stored bare, a repeated tag as a list.
+            collapsed[child_tag] = values[0] if len(values) == 1 else values
+        result = {tag: collapsed}
+    if attributes:
+        for attr_name, attr_value in attributes.items():
+            result[tag]["@" + attr_name] = attr_value
+    if t.text:
+        text = t.text.strip()
+        if not (child_nodes or attributes):
+            result[tag] = text
+        elif text:
+            result[tag]["#text"] = text
+    return result
+
+
+# Directory containing this script; the helper scripts it shells out to
+# (convertMAF.sh, blastxml_to_gapped_gff3.py, gff3_rebase.py) are looked up
+# relative to it.
+INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
+
+
+def metadata_from_node(node):
+    """
+    Flatten the provenance attributes of a track node into one dict.
+
+    An empty dict is returned unless ``node`` has exactly one ``dataset``
+    child (or when ``node`` cannot be searched at all). Otherwise the
+    attributes of the first ``dataset``, ``history``, ``metadata`` and
+    ``tool`` children are copied in under ``<section>_<attribute>`` keys, and
+    a few entries are then rewritten as HTML links that use
+    GALAXY_INFRASTRUCTURE_URL.
+    """
+    try:
+        if len(node.findall("dataset")) != 1:
+            # Zero or several datasets: exit early.
+            return {}
+    except Exception:
+        return {}
+
+    metadata = {}
+    for section in ("dataset", "history", "metadata", "tool"):
+        attributes = node.findall(section)[0].attrib
+        for key, value in attributes.items():
+            metadata["%s_%s" % (section, key)] = value
+
+    # Additional mappings: selected raw values become HTML anchors.
+    edam_anchor = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'
+    metadata["dataset_edam_format"] = edam_anchor.format(
+        metadata["dataset_edam_format"], metadata["dataset_file_ext"]
+    )
+    mail_anchor = '<a href="mailto:{0}">{0}</a>'
+    metadata["history_user_email"] = mail_anchor.format(
+        metadata["history_user_email"]
+    )
+    # Keep the plain history name before the entry is replaced by a link.
+    plain_history_name = metadata["history_display_name"]
+    metadata["hist_name"] = plain_history_name
+    history_anchor = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'
+    metadata["history_display_name"] = history_anchor.format(
+        galaxy=GALAXY_INFRASTRUCTURE_URL,
+        encoded_hist_id=metadata["history_id"],
+        hist_name=plain_history_name,
+    )
+    tool_anchor = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'
+    metadata["tool_tool"] = tool_anchor.format(
+        galaxy=GALAXY_INFRASTRUCTURE_URL,
+        encoded_id=metadata["dataset_id"],
+        tool_id=metadata["tool_tool_id"],
+    )
+    return metadata
+
+
+class JbrowseConnector(object):
+    def __init__(self, jbrowse, outdir, genomes, standalone=None):
+        """
+        Record the run settings, create ``outdir`` and, for the "complete"
+        and "minimal" standalone modes, populate it via ``clone_jbrowse``.
+        """
+        # Run settings.
+        self.jbrowse = jbrowse
+        self.outdir = outdir
+        self.genome_paths = genomes
+        self.standalone = standalone
+        self.giURL = GALAXY_INFRASTRUCTURE_URL
+        # Behaviour switches.
+        self.debug = False
+        self.usejson = True
+        # State accumulated while genomes and tracks are processed.
+        self.trackIdlist = []
+        self.tracksToAdd = []
+        self.config_json = {}
+        os.makedirs(self.outdir, exist_ok=True)
+        self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json"))
+        if standalone == "minimal":
+            self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True)
+        elif standalone == "complete":
+            self.clone_jbrowse(self.jbrowse, self.outdir)
+
+    def subprocess_check_call(self, command, output=None):
+        """
+        Run ``command`` (an argument list) with the output directory as the
+        working directory, optionally sending stdout to ``output``.
+
+        Raises subprocess.CalledProcessError on a non-zero exit status.
+        """
+        if not output:
+            log.debug("cd %s && %s", self.outdir, " ".join(command))
+            subprocess.check_call(command, cwd=self.outdir)
+            return
+        if self.debug:
+            log.debug("cd %s && %s >  %s", self.outdir, " ".join(command), output)
+        subprocess.check_call(command, cwd=self.outdir, stdout=output)
+
+    def subprocess_popen(self, command):
+        """
+        Run the shell command string ``command`` and wait for it to finish.
+
+        stdout and stderr are captured; on a non-zero exit status they are
+        logged and RuntimeError is raised. Note that, unlike the log message
+        suggests, the working directory is NOT changed to the output directory.
+        """
+        if self.debug:
+            log.debug("cd %s && %s", self.outdir, command)
+        process = subprocess.Popen(
+            command,
+            shell=True,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        captured_out, captured_err = process.communicate()
+        exit_status = process.returncode
+        if exit_status == 0:
+            return
+        log.error("cd %s && %s", self.outdir, command)
+        log.error(captured_out)
+        log.error(captured_err)
+        raise RuntimeError("Command failed with exit code %s" % (exit_status))
+
+    def subprocess_check_output(self, command):
+        """
+        Run ``command`` (an argument list) in the output directory and
+        return its captured stdout as bytes.
+        """
+        if self.debug:
+            log.debug("cd %s && %s", self.outdir, " ".join(command))
+        captured = subprocess.check_output(command, cwd=self.outdir)
+        return captured
+
+    def _jbrowse_bin(self, command):
+        """Return the resolved path of ``command`` in the JBrowse ``bin`` folder."""
+        bin_entry = os.path.join(self.jbrowse, "bin", command)
+        return os.path.realpath(bin_entry)
+
+    def symlink_or_copy(self, src, dest):
+        """
+        Symlink or copy ``src`` to ``dest`` via ``subprocess_check_call``.
+
+        A symlink is made when the GALAXY_JBROWSE_SYMLINKS environment
+        variable holds an enabling value. Unset, empty, or an explicit "off"
+        spelling ("0", "false", "no", "off", in any case) means copy.
+        Returns whatever ``subprocess_check_call`` returns.
+        """
+        # bool() of any non-empty string is True, so "0" or "false" used to
+        # switch symlinking ON; interpret the usual "off" spellings explicitly.
+        setting = os.environ.get("GALAXY_JBROWSE_SYMLINKS", "").strip().lower()
+        if setting not in ("", "0", "false", "no", "off"):
+            cmd = ["ln", "-s", src, dest]
+        else:
+            cmd = ["cp", src, dest]
+
+        return self.subprocess_check_call(cmd)
+
+    def process_genomes(self):
+        """
+        Index each reference genome and register it as a JBrowse2 assembly.
+
+        In "complete" standalone mode the FASTA is bgzip-compressed into the
+        output directory and indexed there. In every other mode the sequence
+        is referenced through the Galaxy dataset display URL and only a
+        ``.fai`` index is written to the output directory.
+
+        The assemblies are stored in ``config_json["assemblies"]`` (or, when
+        ``usejson`` is off, added with ``jbrowse add-assembly``) and the name
+        of the last genome is remembered as ``self.genome_name``.
+        """
+        assemblies = []
+        # We only expect one input genome per run. This for loop is just
+        # easier to write than the alternative / catches any possible issues.
+        for genome_node in self.genome_paths:
+            log.info("genome_node=%s" % str(genome_node))
+            genome_name = genome_node["meta"]["dataset_dname"]
+            dsId = genome_node["meta"]["dataset_id"]
+            fapath = genome_node["path"]
+            faname = genome_name + ".fa.gz"
+            if self.standalone == "complete":
+                fadest = os.path.realpath(os.path.join(self.outdir, faname))
+                cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
+                    fapath,
+                    fadest,
+                    fadest,
+                )
+                self.subprocess_popen(cmd)
+                adapter = {
+                    "type": "BgzipFastaAdapter",
+                    "fastaLocation": {
+                        "uri": faname,
+                    },
+                    "faiLocation": {
+                        "uri": faname + ".fai",
+                    },
+                    "gziLocation": {
+                        "uri": faname + ".gzi",
+                    },
+                }
+            else:
+                faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId)
+                adapter = {
+                    "type": "IndexedFastaAdapter",
+                    "fastaLocation": {
+                        "uri": faurl,
+                    },
+                    "faiLocation": {
+                        "uri": faname + ".fai",
+                    },
+                }
+                # The adapter above points at "<name>.fa.gz.fai", so the index
+                # has to be written under exactly that name. It used to be
+                # written to a ".gzi" path that nothing in the config refers to.
+                faind = os.path.realpath(os.path.join(self.outdir, faname + ".fai"))
+                cmd = ["samtools", "faidx", fapath, "--fai-idx", faind]
+                self.subprocess_check_call(cmd)
+            trackDict = {
+                "name": genome_name,
+                "sequence": {
+                    "type": "ReferenceSequenceTrack",
+                    "trackId": genome_name,
+                    "adapter": adapter,
+                },
+                "rendering": {"type": "DivSequenceRenderer"},
+            }
+            assemblies.append(trackDict)
+        # Every track added later is attached to this (last) assembly.
+        self.genome_name = genome_name
+        if self.usejson:
+            self.config_json["assemblies"] = assemblies
+        else:
+            cmd = [
+                "jbrowse",
+                "add-assembly",
+                faname,
+                "-t",
+                "bgzipFasta",
+                "-n",
+                genome_name,
+                "--load",
+                "inPlace",
+                "--faiLocation",
+                faname + ".fai",
+                "--gziLocation",
+                faname + ".gzi",
+                "--target",
+                self.outdir,
+            ]
+            self.subprocess_check_call(cmd)
+
+    def add_default_view(self):
+        """
+        Run ``jbrowse set-default-session`` against the written config.json,
+        listing every track id collected so far in a LinearGenomeView session
+        named "JBrowse2 in Galaxy".
+        """
+        # NOTE(review): config.json is passed both as "-s" and as "--target";
+        # confirm against the JBrowse CLI that this is the intended use of "-s".
+        cmd = [
+            "jbrowse",
+            "set-default-session",
+            "-s",
+            self.config_json_file,
+            "-t",
+            ",".join(self.trackIdlist),
+            "-n",
+            "JBrowse2 in Galaxy",
+            "--target",
+            self.config_json_file,
+            "-v",
+            # Was " LinearGenomeView": the leading space made the view type
+            # an unknown name.
+            "LinearGenomeView",
+        ]
+        log.info("### calling set-default-session with cmd=%s" % "  ".join(cmd))
+        self.subprocess_check_call(cmd)
+
+    def write_config(self):
+        """Serialize the accumulated ``config_json`` dict to ``config_json_file``."""
+        serialized = json.dumps(self.config_json)
+        with open(self.config_json_file, "w") as handle:
+            handle.write(serialized)
+
+    def add_hic(self, data, trackData):
+        """
+        Add a HiC contact-matrix track (HicTrack / HicAdapter).
+
+        In "complete" standalone mode ``data`` is copied into the output
+        directory under the track name; otherwise the track points at the
+        Galaxy dataset display URL.
+
+        Format: https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
+        For testing locally, these work:
+        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
+        using hg19 reference track as a
+        'BgzipFastaAdapter'
+            fastaLocation:
+            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
+            faiLocation:
+            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
+            gziLocation:
+            uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
+        Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
+        """
+        log.info("#### trackData=%s" % trackData)
+        tId = trackData["label"]
+        dsId = trackData["metadata"]["dataset_id"]
+        hname = trackData["name"]
+        # The URL used to be computed up front as well (with a stray trailing
+        # space) and then overwritten in both branches; build it only once.
+        if self.standalone == "complete":
+            # Ship a copy of the file next to config.json.
+            dest = os.path.realpath(os.path.join(self.outdir, hname))
+            self.subprocess_check_call(["cp", data, dest])
+            url = hname
+        else:
+            url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId)
+        floc = {
+            "uri": url,
+        }
+        trackDict = {
+            "type": "HicTrack",
+            "trackId": tId,
+            "name": hname,
+            "assemblyNames": [self.genome_name],
+            "adapter": {
+                "type": "HicAdapter",
+                "hicLocation": floc,
+            },
+        }
+        if self.usejson:
+            self.tracksToAdd.append(trackDict)
+            self.trackIdlist.append(tId)
+        else:
+            cmd = [
+                "jbrowse",
+                "add-track",
+                url,
+                "-t",
+                "HicTrack",
+                "-a",
+                self.genome_name,
+                "-n",
+                hname,
+                "--load",
+                "inPlace",
+                "--target",
+                self.outdir,
+            ]
+            self.subprocess_check_call(cmd)
+
+    def add_maf(self, data, trackData):
+        """
+        Add a MAF alignment as a MafTrack and register the MafViewer plugin.
+
+        from https://github.com/cmdcolin/maf2bed
+        Note: the MAF file should contain the species name and chromosome
+        name, e.g. hg38.chr1, in the sequence identifiers. The reference id
+        (eg hg18) is needed by the converter; ``self.genome_name`` is passed
+        for it. The species parts of the identifiers become the sample list.
+        """
+        mafPlugin = {
+            "plugins": [
+                {
+                    "name": "MafViewer",
+                    "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
+                }
+            ]
+        }
+        tId = trackData["label"]
+        fname = "%s.bed" % tId
+        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
+        # Process MAF to bed-like. Need build to munge chromosomes
+        # NOTE(review): presumably convertMAF.sh writes "<dest>.sorted.bed.gz"
+        # plus its .tbi, which the adapter below refers to - confirm against
+        # the script.
+        gname = self.genome_name
+        cmd = [
+            "bash",
+            os.path.join(INSTALLED_TO, "convertMAF.sh"),
+            data,
+            gname,
+            INSTALLED_TO,
+            dest,
+        ]
+        self.subprocess_check_call(cmd)
+        log.info("### convertMAF.sh called as %s" % " ".join(cmd))
+        # Construct samples list
+        # We could get this from galaxy metadata, not sure how easily.
+        # The context manager closes the pipe and reaps grep when done.
+        with subprocess.Popen(
+            ["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE
+        ) as ps:
+            output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
+        soutp = output.decode("ascii").split("\n")
+        samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
+        # "sort -u" only de-duplicates whole "species.chrom" identifiers, so a
+        # species seen on several chromosomes would be listed repeatedly;
+        # keep the first occurrence of each species.
+        samples = list(dict.fromkeys(x.split(".")[0] for x in samp))
+        if self.debug:
+            log.info("### got samples = %s " % (samples))
+        trackDict = {
+            "type": "MafTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "adapter": {
+                "type": "MafTabixAdapter",
+                "samples": samples,
+                "bedGzLocation": {
+                    "uri": fname + ".sorted.bed.gz",
+                },
+                "index": {
+                    "location": {
+                        "uri": fname + ".sorted.bed.gz.tbi",
+                    },
+                },
+            },
+            "assemblyNames": [self.genome_name],
+        }
+        self.tracksToAdd.append(trackDict)
+        self.trackIdlist.append(tId)
+        # Register the plugin exactly once. The previous "mafPlugin[0]"
+        # raised KeyError (mafPlugin is a dict) as soon as a plugin list
+        # already existed, e.g. for a second MAF track.
+        plugin = mafPlugin["plugins"][0]
+        plugins = self.config_json.get("plugins") or []
+        if plugin not in plugins:
+            plugins.append(plugin)
+        self.config_json["plugins"] = plugins
+    def _blastxml_to_gff3(self, xml, min_gap=10):
+        """
+        Convert a BLAST XML file to GFF3 with blastxml_to_gapped_gff3.py and
+        return the path of the temporary file holding the result. The caller
+        is responsible for deleting that file.
+        """
+        converted = tempfile.NamedTemporaryFile(delete=False)
+        converter = os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py")
+        options = ["--trim", "--trim_end", "--include_seq", "--min_gap", str(min_gap)]
+        subprocess.check_call(
+            ["python", converter] + options + [xml],
+            cwd=self.outdir,
+            stdout=converted,
+        )
+        converted.close()
+        return converted.name
+
+    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
+        """
+        Add BLAST XML hits as a tabix-indexed GFF3 FeatureTrack.
+
+        The XML is converted to GFF3, optionally rebased onto the parent
+        features named by ``blastOpts["parent"]`` (with ``--protein2dna`` when
+        ``blastOpts["protein"]`` is "true"), then sorted, compressed and
+        indexed in the output directory. The temporary GFF3 is removed at the
+        end.
+        """
+        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])
+
+        if blastOpts.get("parent", "None") != "None":
+            rebased = tempfile.NamedTemporaryFile(delete=False)
+            rebase_cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
+            if blastOpts.get("protein", "false") == "true":
+                rebase_cmd.append("--protein2dna")
+            rebase_cmd += [os.path.realpath(blastOpts["parent"]), gff3]
+            subprocess.check_call(rebase_cmd, cwd=self.outdir, stdout=rebased)
+            rebased.close()
+
+            # The rebased output takes the place of the original gff3 file.
+            shutil.copy(rebased.name, gff3)
+            os.unlink(rebased.name)
+        plain_name = "%s.gff3" % trackData["label"]
+        self._sort_gff(gff3, os.path.realpath("%s/%s" % (self.outdir, plain_name)))
+        url = plain_name + ".gz"
+        tId = trackData["label"]
+        adapter = {
+            "type": "Gff3TabixAdapter",
+            "gffGzLocation": {"uri": url},
+            "index": {"location": {"uri": url + ".tbi"}},
+        }
+        displays = [
+            {
+                "type": display_type,
+                "displayId": "%s-%s" % (tId, display_type),
+            }
+            for display_type in ("LinearBasicDisplay", "LinearArcDisplay")
+        ]
+        trackDict = {
+            "type": "FeatureTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "assemblyNames": [self.genome_name],
+            "adapter": adapter,
+            "displays": displays,
+        }
+        if not self.usejson:
+            self.subprocess_check_call(
+                [
+                    "jbrowse",
+                    "add-track",
+                    url,
+                    "-t",
+                    "FeatureTrack",
+                    "-a",
+                    self.genome_name,
+                    "--indexFile",
+                    url + ".tbi",
+                    "-n",
+                    trackData["name"],
+                    "--load",
+                    "inPlace",
+                    "--target",
+                    self.outdir,
+                ]
+            )
+        else:
+            self.tracksToAdd.append(trackDict)
+            self.trackIdlist.append(tId)
+        os.unlink(gff3)
+
+    def add_bigwig(self, data, trackData):
+        """
+        Add a bigWig file as a QuantitativeTrack (BigWigAdapter).
+
+        In "complete" standalone mode ``data`` is copied into the output
+        directory as "<track name>.bw"; otherwise the track points at the
+        Galaxy dataset display URL.
+        """
+        tId = trackData["label"]
+        if self.standalone == "complete":
+            url = "%s.bw" % trackData["name"]
+            dest = os.path.realpath(os.path.join(self.outdir, url))
+            self.subprocess_check_call(["cp", data, dest])
+        else:
+            dsId = trackData["metadata"]["dataset_id"]
+            # Was "to_ext=fasta", a copy/paste slip: the dataset is a bigwig
+            # (the hic and bam tracks pass their own extension the same way).
+            url = "%s/api/datasets/%s/display?to_ext=bigwig" % (self.giURL, dsId)
+        bwloc = {"uri": url}
+        trackDict = {
+            "type": "QuantitativeTrack",
+            "trackId": tId,
+            # Show the human-readable label like every other track type does;
+            # this used to be the file name / Galaxy URL.
+            "name": trackData["name"],
+            "assemblyNames": [
+                self.genome_name,
+            ],
+            "adapter": {
+                "type": "BigWigAdapter",
+                "bigWigLocation": bwloc,
+            },
+            "displays": [
+                {
+                    "type": "LinearWiggleDisplay",
+                    "displayId": "%s-LinearWiggleDisplay" % tId,
+                }
+            ],
+        }
+        if self.usejson:
+            self.tracksToAdd.append(trackDict)
+            self.trackIdlist.append(tId)
+        else:
+            cmd = [
+                "jbrowse",
+                "add-track",
+                url,
+                "-t",
+                "QuantitativeTrack",
+                "-a",
+                self.genome_name,
+                "-n",
+                trackData["name"],
+                "--load",
+                "inPlace",
+                "--target",
+                self.outdir,
+            ]
+            self.subprocess_check_call(cmd)
+
+    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
+        """
+        Add a BAM file as an AlignmentsTrack (BamAdapter).
+
+        In "complete" standalone mode the BAM is copied into the output
+        directory; otherwise it is referenced through the Galaxy dataset
+        display URL. The ``.bai`` index is always placed in the output
+        directory: copied from ``bam_index`` when that file exists, else
+        taken from a ``.bai`` next to ``data`` if there is one. A missing
+        index is only reported with a warning. ``bamOpts`` is accepted but
+        not used here.
+        """
+        tId = trackData["label"]
+        fname = "%s.bam" % trackData["label"]
+        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
+        if self.standalone == "complete":
+            url = fname
+            self.subprocess_check_call(["cp", data, dest])
+            log.info("### copied %s to %s" % (data, dest))
+        else:
+            dsId = trackData["metadata"]["dataset_id"]
+            url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId)
+        bloc = {"uri": url}
+        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
+            # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
+            self.subprocess_check_call(
+                ["cp", os.path.realpath(bam_index), dest + ".bai"]
+            )
+        else:
+            # Can happen in exotic condition
+            # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
+            #      => no index generated by galaxy, but there might be one next to the symlink target
+            #      this trick allows to skip the bam sorting made by galaxy if already done outside
+            sibling_index = os.path.realpath(data) + ".bai"
+            if os.path.exists(sibling_index):
+                self.symlink_or_copy(sibling_index, dest + ".bai")
+            else:
+                # Logger.warn is a deprecated alias; use warning().
+                log.warning("Could not find a bam index (.bai file) for %s", data)
+        trackDict = {
+            "type": "AlignmentsTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "assemblyNames": [self.genome_name],
+            "adapter": {
+                "type": "BamAdapter",
+                "bamLocation": bloc,
+                "index": {
+                    "location": {
+                        "uri": fname + ".bai",
+                    }
+                },
+            },
+        }
+        if self.usejson:
+            self.tracksToAdd.append(trackDict)
+            self.trackIdlist.append(tId)
+        else:
+            cmd = [
+                "jbrowse",
+                "add-track",
+                fname,
+                "-t",
+                "AlignmentsTrack",
+                "-l",
+                "inPlace",
+                "-a",
+                self.genome_name,
+                "--indexFile",
+                fname + ".bai",
+                "-n",
+                trackData["name"],
+                "--target",
+                self.outdir,
+            ]
+            self.subprocess_check_call(cmd)
+
+    def add_vcf(self, data, trackData):
+        """
+        Add a VCF file as a VariantTrack (VcfTabixAdapter).
+
+        The VCF is always bgzip-compressed into the output directory as
+        "<label>.vcf.gz" and tabix-indexed there, whatever the standalone
+        mode.
+        """
+        tId = trackData["label"]
+        # A Galaxy display URL used to be built here and then immediately
+        # overwritten; its metadata lookup could raise KeyError for no reason.
+        url = "%s.vcf.gz" % tId
+        dest = os.path.realpath("%s/%s" % (self.outdir, url))
+        cmd = "bgzip -c %s  > %s" % (data, dest)
+        self.subprocess_popen(cmd)
+        cmd = ["tabix", "-p", "vcf", dest]
+        self.subprocess_check_call(cmd)
+        trackDict = {
+            "type": "VariantTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "assemblyNames": [self.genome_name],
+            "adapter": {
+                "type": "VcfTabixAdapter",
+                "vcfGzLocation": {
+                    "uri": url,
+                },
+                "index": {
+                    "location": {
+                        "uri": url + ".tbi",
+                    }
+                },
+            },
+            "displays": [
+                {
+                    "type": "LinearVariantDisplay",
+                    "displayId": "%s-LinearVariantDisplay" % tId,
+                },
+                {
+                    "type": "ChordVariantDisplay",
+                    "displayId": "%s-ChordVariantDisplay" % tId,
+                },
+                {
+                    "type": "LinearPairedArcDisplay",
+                    "displayId": "%s-LinearPairedArcDisplay" % tId,
+                },
+            ],
+        }
+        if self.usejson:
+            self.tracksToAdd.append(trackDict)
+            self.trackIdlist.append(tId)
+        else:
+            cmd = [
+                "jbrowse",
+                "add-track",
+                url,
+                "-t",
+                "VariantTrack",
+                "-a",
+                self.genome_name,
+                "--indexFile",
+                url + ".tbi",
+                "-n",
+                trackData["name"],
+                "--load",
+                "inPlace",
+                "--target",
+                self.outdir,
+            ]
+            self.subprocess_check_call(cmd)
+
+    def _sort_gff(self, data, dest):
+        """
+        Sort ``data`` with ``jbrowse sort-gff``, bgzip it to ``dest + ".gz"``
+        and tabix-index the result. Nothing is done when the compressed file
+        already exists.
+        """
+        compressed = dest + ".gz"
+        if os.path.exists(compressed):
+            # Only index if not already done
+            return
+        sort_and_zip = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (data, dest)
+        self.subprocess_popen(sort_and_zip)
+        self.subprocess_check_call(["tabix", "-f", "-p", "gff", compressed])
+
+    def _sort_bed(self, data, dest):
+        """
+        Sort ``data`` by chromosome and start, bgzip it to ``dest`` and
+        tabix-index the result. Nothing is done when ``dest`` already exists.
+        """
+        if os.path.exists(dest):
+            # Only index if not already done
+            return
+        sort_and_zip = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest)
+        self.subprocess_popen(sort_and_zip)
+        self.subprocess_check_call(["tabix", "-f", "-p", "bed", dest])
+
+    def add_gff(self, data, ext, trackData):
+        """
+        Add a GFF/GFF3 file as a tabix-indexed FeatureTrack.
+
+        The file is sorted, compressed and indexed in the output directory as
+        "<label>.<ext>.gz" and then registered with basic and arc displays.
+        """
+        tId = trackData["label"]
+        plain_name = "%s.%s" % (tId, ext)
+        self._sort_gff(data, os.path.realpath("%s/%s" % (self.outdir, plain_name)))
+        url = plain_name + ".gz"
+        adapter = {
+            "type": "Gff3TabixAdapter",
+            "gffGzLocation": {"uri": url},
+            "index": {"location": {"uri": url + ".tbi"}},
+        }
+        displays = [
+            {
+                "type": display_type,
+                "displayId": "%s-%s" % (tId, display_type),
+            }
+            for display_type in ("LinearBasicDisplay", "LinearArcDisplay")
+        ]
+        trackDict = {
+            "type": "FeatureTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "assemblyNames": [self.genome_name],
+            "adapter": adapter,
+            "displays": displays,
+        }
+        if not self.usejson:
+            self.subprocess_check_call(
+                [
+                    "jbrowse",
+                    "add-track",
+                    url,
+                    "-t",
+                    "FeatureTrack",
+                    "-a",
+                    self.genome_name,
+                    "-n",
+                    trackData["name"],
+                    "--load",
+                    "inPlace",
+                    "--target",
+                    self.outdir,
+                ]
+            )
+            return
+        self.tracksToAdd.append(trackDict)
+        self.trackIdlist.append(tId)
+
+    def add_bed(self, data, ext, trackData):
+        """
+        Add a BED file as a tabix-indexed FeatureTrack.
+
+        The file is sorted, compressed and indexed in the output directory as
+        "<label>.<ext>.gz" and then registered with basic and arc displays.
+        """
+        tId = trackData["label"]
+        url = "%s.%s" % (tId, ext) + ".gz"
+        self._sort_bed(data, os.path.realpath("%s/%s" % (self.outdir, url)))
+        adapter = {
+            "type": "BedTabixAdapter",
+            "bedGzLocation": {"uri": url},
+            "index": {"location": {"uri": url + ".tbi"}},
+        }
+        displays = [
+            {
+                "type": display_type,
+                "displayId": "%s-%s" % (tId, display_type),
+            }
+            for display_type in ("LinearBasicDisplay", "LinearArcDisplay")
+        ]
+        trackDict = {
+            "type": "FeatureTrack",
+            "trackId": tId,
+            "name": trackData["name"],
+            "assemblyNames": [self.genome_name],
+            "adapter": adapter,
+            "displays": displays,
+        }
+        if not self.usejson:
+            self.subprocess_check_call(
+                [
+                    "jbrowse",
+                    "add-track",
+                    url,
+                    "-t",
+                    "FeatureTrack",
+                    "-a",
+                    self.genome_name,
+                    "--indexFile",
+                    url + ".tbi",
+                    "-n",
+                    trackData["name"],
+                    "--load",
+                    "inPlace",
+                    "--target",
+                    self.outdir,
+                ]
+            )
+            return
+        self.tracksToAdd.append(trackDict)
+        self.trackIdlist.append(tId)
+
+    def process_annotations(self, track):
+        """
+        Add every file of one track definition to the configuration.
+
+        ``track["trackfiles"]`` holds (path, extension, label, metadata)
+        tuples. For each one a stable track label is derived from an MD5 of
+        the path, human label, category and optional REST url, and the file is
+        handed to the ``add_*`` method matching its extension. Unknown
+        extensions are reported with a warning and skipped.
+        """
+        category = track["category"].replace("__pd__date__pd__", TODAY)
+        for i, (
+            dataset_path,
+            dataset_ext,
+            track_human_label,
+            extra_metadata,
+        ) in enumerate(track["trackfiles"]):
+            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
+            for key, value in mapped_chars.items():
+                track_human_label = track_human_label.replace(value, key)
+            outputTrackConfig = {
+                "category": category,
+            }
+            if self.debug:
+                log.info(
+                    "Processing category = %s, track_human_label = %s",
+                    category,
+                    track_human_label,
+                )
+            # We add extra data to hash for the case of REST + SPARQL.
+            if (
+                "conf" in track
+                and "options" in track["conf"]
+                and "url" in track["conf"]["options"]
+            ):
+                rest_url = track["conf"]["options"]["url"]
+            else:
+                rest_url = ""
+
+            # I chose to use track['category'] instead of 'category' here. This
+            # is intentional. This way re-running the tool on a different date
+            # will not generate different hashes and make comparison of outputs
+            # much simpler.
+            hashData = [
+                str(dataset_path),
+                track_human_label,
+                track["category"],
+                rest_url,
+            ]
+            hashData = "|".join(hashData).encode("utf-8")
+            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
+            outputTrackConfig["metadata"] = extra_metadata
+            outputTrackConfig["name"] = track_human_label
+
+            if dataset_ext in ("gff", "gff3"):
+                self.add_gff(
+                    dataset_path,
+                    dataset_ext,
+                    outputTrackConfig,
+                )
+            elif dataset_ext in ("hic",):
+                self.add_hic(
+                    dataset_path,
+                    outputTrackConfig,
+                )
+            elif dataset_ext in ("bed",):
+                self.add_bed(
+                    dataset_path,
+                    dataset_ext,
+                    outputTrackConfig,
+                )
+            elif dataset_ext in ("maf",):
+                self.add_maf(
+                    dataset_path,
+                    outputTrackConfig,
+                )
+            elif dataset_ext == "bigwig":
+                self.add_bigwig(
+                    dataset_path,
+                    outputTrackConfig,
+                )
+            elif dataset_ext == "bam":
+                real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
+                    "bam_index"
+                ]
+                if not isinstance(real_indexes, list):
+                    # <bam_indices>
+                    #  <bam_index>/path/to/a.bam.bai</bam_index>
+                    # </bam_indices>
+                    #
+                    # The above will result in the 'bam_index' key containing a
+                    # string. If there are two or more indices, the container
+                    # becomes a list. Fun!
+                    real_indexes = [real_indexes]
+
+                self.add_bam(
+                    dataset_path,
+                    outputTrackConfig,
+                    track["conf"]["options"]["pileup"],
+                    bam_index=real_indexes[i],
+                )
+            elif dataset_ext == "blastxml":
+                self.add_blastxml(
+                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
+                )
+            elif dataset_ext == "vcf":
+                self.add_vcf(dataset_path, outputTrackConfig)
+            else:
+                # Logger.warn is a deprecated alias; use warning().
+                log.warning("Do not know how to handle %s", dataset_ext)
+
+    def clone_jbrowse(self, jbrowse_dir, destination, minimal=False):
+        """Clone a JBrowse directory into a destination directory."""
+        cmd = ["jbrowse", "create", "-f", self.outdir]
+        self.subprocess_check_call(cmd)
+        for fn in [
+            "asset-manifest.json",
+            "favicon.ico",
+            "robots.txt",
+            "umd_plugin.js",
+            "version.txt",
+            "test_data",
+        ]:
+            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
+            self.subprocess_check_call(cmd)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="", epilog="")
+    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")
+
+    parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
+    parser.add_argument("--outdir", help="Output directory", default="out")
+    parser.add_argument(
+        "--standalone",
+        choices=["complete", "minimal", "data"],
+        help="Standalone mode includes a copy of JBrowse",
+    )
+    parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
+    args = parser.parse_args()
+
+    tree = ET.parse(args.xml.name)
+    root = tree.getroot()
+
+    # This should be done ASAP
+    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
+    # Sometimes this comes as `localhost` without a protocol
+    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
+        # so we'll prepend `http://` and hope for the best. Requests *should*
+        # be GET and not POST so it should redirect OK
+        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL
+
+    jc = JbrowseConnector(
+        jbrowse=args.jbrowse,
+        outdir=args.outdir,
+        genomes=[
+            {
+                "path": os.path.realpath(x.attrib["path"]),
+                "meta": metadata_from_node(x.find("metadata")),
+            }
+            for x in root.findall("metadata/genomes/genome")
+        ],
+        standalone=args.standalone,
+    )
+    jc.process_genomes()
+
+    for track in root.findall("tracks/track"):
+        track_conf = {}
+        track_conf["trackfiles"] = []
+
+        is_multi_bigwig = False
+        try:
+            if track.find("options/wiggle/multibigwig") and (
+                track.find("options/wiggle/multibigwig").text == "True"
+            ):
+                is_multi_bigwig = True
+                multi_bigwig_paths = []
+        except KeyError:
+            pass
+
+        trackfiles = track.findall("files/trackFile")
+        if trackfiles:
+            for x in track.findall("files/trackFile"):
+                if is_multi_bigwig:
+                    multi_bigwig_paths.append(
+                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
+                    )
+                else:
+                    if trackfiles:
+                        metadata = metadata_from_node(x.find("metadata"))
+                        track_conf["dataset_id"] = metadata["dataset_id"]
+                        track_conf["trackfiles"].append(
+                            (
+                                os.path.realpath(x.attrib["path"]),
+                                x.attrib["ext"],
+                                x.attrib["label"],
+                                metadata,
+                            )
+                        )
+        else:
+            # For tracks without files (rest, sparql)
+            track_conf["trackfiles"].append(
+                (
+                    "",  # N/A, no path for rest or sparql
+                    track.attrib["format"],
+                    track.find("options/label").text,
+                    {},
+                )
+            )
+
+        if is_multi_bigwig:
+            metadata = metadata_from_node(x.find("metadata"))
+
+            track_conf["trackfiles"].append(
+                (
+                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
+                    "bigwig_multiple",
+                    "MultiBigWig",  # Giving an hardcoded name for now
+                    {},  # No metadata for multiple bigwig
+                )
+            )
+
+        track_conf["category"] = track.attrib["cat"]
+        track_conf["format"] = track.attrib["format"]
+        try:
+            # Only pertains to gff3 + blastxml. TODO?
+            track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
+        except TypeError:
+            track_conf["style"] = {}
+            pass
+        track_conf["conf"] = etree_to_dict(track.find("options"))
+        jc.process_annotations(track_conf)
+        print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
+    print(
+        "###done processing, trackIdlist=",
+        jc.trackIdlist,
+        "config=",
+        str(jc.config_json),
+    )
+    jc.config_json["tracks"] = jc.tracksToAdd
+    if jc.usejson:
+        jc.write_config()
+    jc.add_default_view()
author	fubar
date	Fri, 05 Jan 2024 01:58:02 +0000
parents
children