Mercurial > repos > fubar > jbrowse2dev
diff jbrowse2/jbrowse2_jbrowsenotjson.py @ 6:88b9b105c09b draft
Uploaded
author | fubar |
---|---|
date | Fri, 05 Jan 2024 01:58:02 +0000 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jbrowse2/jbrowse2_jbrowsenotjson.py Fri Jan 05 01:58:02 2024 +0000 @@ -0,0 +1,1034 @@ +#!/usr/bin/env python +# change to accumulating all configuration for config.json based on the default from the clone +import argparse +import datetime +import hashlib +import json +import logging +import os +import shutil +import subprocess +import tempfile +import xml.etree.ElementTree as ET +from collections import defaultdict + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("jbrowse") +TODAY = datetime.datetime.now().strftime("%Y-%m-%d") +GALAXY_INFRASTRUCTURE_URL = None +mapped_chars = { + ">": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "#": "__pd__", + "": "__cn__", +} + + +def etree_to_dict(t): + if t is None: + return {} + + d = {t.tag: {} if t.attrib else None} + children = list(t) + if children: + dd = defaultdict(list) + for dc in map(etree_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}} + if t.attrib: + d[t.tag].update(("@" + k, v) for k, v in t.attrib.items()) + if t.text: + text = t.text.strip() + if children or t.attrib: + if text: + d[t.tag]["#text"] = text + else: + d[t.tag] = text + return d + + +INSTALLED_TO = os.path.dirname(os.path.realpath(__file__)) + + +def metadata_from_node(node): + metadata = {} + try: + if len(node.findall("dataset")) != 1: + # exit early + return metadata + except Exception: + return {} + + for (key, value) in node.findall("dataset")[0].attrib.items(): + metadata["dataset_%s" % key] = value + + for (key, value) in node.findall("history")[0].attrib.items(): + metadata["history_%s" % key] = value + + for (key, value) in node.findall("metadata")[0].attrib.items(): + metadata["metadata_%s" % key] = value + + for (key, value) in node.findall("tool")[0].attrib.items(): + metadata["tool_%s" % key] = value + + # Additional Mappings applied: + metadata[ + "dataset_edam_format" + ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format( + metadata["dataset_edam_format"], metadata["dataset_file_ext"] + ) + metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format( + metadata["history_user_email"] + ) + metadata["hist_name"] = metadata["history_display_name"] + metadata[ + "history_display_name" + ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_hist_id=metadata["history_id"], + hist_name=metadata["history_display_name"], + ) + metadata[ + "tool_tool" + ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata["dataset_id"], + tool_id=metadata["tool_tool_id"], + # tool_version=metadata['tool_tool_version'], + ) + return metadata + + +class JbrowseConnector(object): + def __init__(self, jbrowse, outdir, genomes, standalone=None): + self.debug = False + self.usejson = True + self.giURL = GALAXY_INFRASTRUCTURE_URL + self.jbrowse = jbrowse + self.outdir = outdir + os.makedirs(self.outdir, exist_ok=True) + self.genome_paths = genomes + self.standalone = standalone + self.trackIdlist = [] + self.tracksToAdd = [] + self.config_json = {} + self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json")) + if standalone == "complete": + self.clone_jbrowse(self.jbrowse, self.outdir) + elif standalone == "minimal": + self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True) + + def subprocess_check_call(self, command, output=None): + if output: + if self.debug: + log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output) + subprocess.check_call(command, cwd=self.outdir, stdout=output) + else: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + subprocess.check_call(command, cwd=self.outdir) + + def subprocess_popen(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, command) + p = subprocess.Popen( + command, + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output, err = p.communicate() + retcode = p.returncode + if retcode != 0: + log.error("cd %s && %s", self.outdir, command) + log.error(output) + log.error(err) + raise RuntimeError("Command failed with exit code %s" % (retcode)) + + def subprocess_check_output(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + return subprocess.check_output(command, cwd=self.outdir) + + def _jbrowse_bin(self, command): + return os.path.realpath(os.path.join(self.jbrowse, "bin", command)) + + def symlink_or_copy(self, src, dest): + if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool( + os.environ["GALAXY_JBROWSE_SYMLINKS"] + ): + cmd = ["ln", "-s", src, dest] + else: + cmd = ["cp", src, dest] + + return self.subprocess_check_call(cmd) + + def process_genomes(self): + assemblies = [] + for i, genome_node in enumerate(self.genome_paths): + log.info("genome_node=%s" % str(genome_node)) + # We only expect one input genome per run. This for loop is just + # easier to write than the alternative / catches any possible + # issues. + genome_name = genome_node["meta"]["dataset_dname"] + dsId = genome_node["meta"]["dataset_id"] + fapath = genome_node["path"] + faname = genome_name + ".fa.gz" + faind = os.path.realpath(os.path.join(self.outdir, faname + ".gzi")) + if self.standalone == "complete": + fadest = os.path.realpath(os.path.join(self.outdir, faname)) + cmd = "bgzip -i -c %s > %s && samtools faidx %s" % ( + fapath, + fadest, + fadest, + ) + self.subprocess_popen(cmd) + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + }, + "faiLocation": { + "uri": faname + ".fai", + }, + "gziLocation": { + "uri": faname + ".gzi", + }, + } + else: + faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId) + fastalocation = { + "uri": faurl, + } + failocation = { + "uri": faname + ".fai", + } + adapter = { + "type": "IndexedFastaAdapter", + "fastaLocation": fastalocation, + "faiLocation": failocation, + } + + cmd = ["samtools", "faidx", fapath, "--fai-idx", faind] + self.subprocess_check_call(cmd) + trackDict = { + "name": genome_name, + "sequence": { + "type": "ReferenceSequenceTrack", + "trackId": genome_name, + "adapter": adapter, + }, + "rendering": {"type": "DivSequenceRenderer"}, + } + assemblies.append(trackDict) + self.genome_name = genome_name + if self.usejson: + self.config_json["assemblies"] = assemblies + else: + cmd = [ + "jbrowse", + "add-assembly", + faname, + "-t", + "bgzipFasta", + "-n", + genome_name, + "--load", + "inPlace", + "--faiLocation", + faname + ".fai", + "--gziLocation", + faname + ".gzi", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_default_view(self): + cmd = [ + "jbrowse", + "set-default-session", + "-s", + self.config_json_file, + "-t", + ",".join(self.trackIdlist), + "-n", + "JBrowse2 in Galaxy", + "--target", + self.config_json_file, + "-v", + " LinearGenomeView", + ] + if True or self.debug: + log.info("### calling set-default-session with cmd=%s" % " ".join(cmd)) + self.subprocess_check_call(cmd) + + def write_config(self): + with open(self.config_json_file, "w") as fp: + json.dump(self.config_json, fp) + + def add_hic(self, data, trackData): + """ + HiC adapter. + https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md + for testing locally, these work: + HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic + using hg19 reference track as a + 'BgzipFastaAdapter' + fastaLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz', + faiLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai', + gziLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi', + Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438 + """ + log.info("#### trackData=%s" % trackData) + tId = trackData["label"] + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=hic " % ( + self.giURL, + dsId, + ) + hname = trackData["name"] + if self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, hname)) + url = hname + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + floc = { + "uri": hname, + } + else: + url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId) + floc = { + "uri": url, + } + trackDict = { + "type": "HicTrack", + "trackId": tId, + "name": hname, + "assemblyNames": [self.genome_name], + "adapter": { + "type": "HicAdapter", + "hicLocation": floc, + }, + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "HicTrack", + "-a", + self.genome_name, + "-n", + hname, + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_maf(self, data, trackData): + """ + from https://github.com/cmdcolin/maf2bed + Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name + e.g. hg38.chr1 in the sequence identifiers. + need the reference id - eg hg18, for maf2bed.pl as the first parameter + """ + mafPlugin = { + "plugins": [ + { + "name": "MafViewer", + "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js", + } + ] + } + tId = trackData["label"] + fname = "%s.bed" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + # self.symlink_or_copy(data, dest) + # Process MAF to bed-like. Need build to munge chromosomes + gname = self.genome_name + cmd = [ + "bash", + os.path.join(INSTALLED_TO, "convertMAF.sh"), + data, + gname, + INSTALLED_TO, + dest, + ] + self.subprocess_check_call(cmd) + if True or self.debug: + log.info("### convertMAF.sh called as %s" % " ".join(cmd)) + # Construct samples list + # We could get this from galaxy metadata, not sure how easily. + ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE) + output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout) + ps.wait() + outp = output.decode("ascii") + soutp = outp.split("\n") + samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")] + samples = [x.split(".")[0] for x in samp] + if self.debug: + log.info("### got samples = %s " % (samples)) + trackDict = { + "type": "MafTrack", + "trackId": tId, + "name": trackData["name"], + "adapter": { + "type": "MafTabixAdapter", + "samples": samples, + "bedGzLocation": { + "uri": fname + ".sorted.bed.gz", + }, + "index": { + "location": { + "uri": fname + ".sorted.bed.gz.tbi", + }, + }, + }, + "assemblyNames": [self.genome_name], + } + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + if self.config_json.get("plugins", None): + self.config_json["plugins"].append(mafPlugin[0]) + else: + self.config_json.update(mafPlugin) + + def _blastxml_to_gff3(self, xml, min_gap=10): + gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) + cmd = [ + "python", + os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"), + "--trim", + "--trim_end", + "--include_seq", + "--min_gap", + str(min_gap), + xml, + ] + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) + gff3_unrebased.close() + return gff3_unrebased.name + + def add_blastxml(self, data, trackData, blastOpts, **kwargs): + gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"]) + + if "parent" in blastOpts and blastOpts["parent"] != "None": + gff3_rebased = tempfile.NamedTemporaryFile(delete=False) + cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")] + if blastOpts.get("protein", "false") == "true": + cmd.append("--protein2dna") + cmd.extend([os.path.realpath(blastOpts["parent"]), gff3]) + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased) + gff3_rebased.close() + + # Replace original gff3 file + shutil.copy(gff3_rebased.name, gff3) + os.unlink(gff3_rebased.name) + url = "%s.gff3" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(gff3, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + os.unlink(gff3) + + def add_bigwig(self, data, trackData): + url = "%s.bw" % trackData["name"] + if self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, url)) + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + bwloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) + bwloc = {"uri": url} + tId = trackData["label"] + trackDict = { + "type": "QuantitativeTrack", + "trackId": tId, + "name": url, + "assemblyNames": [ + self.genome_name, + ], + "adapter": { + "type": "BigWigAdapter", + "bigWigLocation": bwloc, + }, + "displays": [ + { + "type": "LinearWiggleDisplay", + "displayId": "%s-LinearWiggleDisplay" % tId, + } + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "QuantitativeTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): + tId = trackData["label"] + fname = "%s.bam" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + if self.standalone == "complete": + url = fname + self.subprocess_check_call(["cp", data, dest]) + log.info("### copied %s to %s" % (data, dest)) + bloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId) + bloc = {"uri": url} + if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): + # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", os.path.realpath(bam_index), dest + ".bai"] + ) + else: + # Can happen in exotic condition + # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".bai"): + self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") + else: + log.warn("Could not find a bam index (.bai file) for %s", data) + trackDict = { + "type": "AlignmentsTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BamAdapter", + "bamLocation": bloc, + "index": { + "location": { + "uri": fname + ".bai", + } + }, + }, + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + fname, + "-t", + "AlignmentsTrack", + "-l", + "inPlace", + "-a", + self.genome_name, + "--indexFile", + fname + ".bai", + "-n", + trackData["name"], + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_vcf(self, data, trackData): + tId = trackData["label"] + url = "%s/api/datasets/%s/display" % ( + self.giURL, + trackData["metadata"]["dataset_id"], + ) + url = "%s.vcf.gz" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + cmd = "bgzip -c %s > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-p", "vcf", dest] + self.subprocess_check_call(cmd) + trackDict = { + "type": "VariantTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "VcfTabixAdapter", + "vcfGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearVariantDisplay", + "displayId": "%s-LinearVariantDisplay" % tId, + }, + { + "type": "ChordVariantDisplay", + "displayId": "%s-ChordVariantDisplay" % tId, + }, + { + "type": "LinearPairedArcDisplay", + "displayId": "%s-LinearPairedArcDisplay" % tId, + }, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "VariantTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def _sort_gff(self, data, dest): + # Only index if not already done + if not os.path.exists(dest + ".gz"): + cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % ( + data, + dest, + ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" + self.subprocess_popen(cmd) + self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) + + def _sort_bed(self, data, dest): + # Only index if not already done + if not os.path.exists(dest): + cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-f", "-p", "bed", dest] + self.subprocess_check_call(cmd) + + def add_gff(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(data, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bed(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s.gz" % (self.outdir, url)) + self._sort_bed(data, dest) + tId = trackData["label"] + url = url + ".gz" + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BedTabixAdapter", + "bedGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def process_annotations(self, track): + category = track["category"].replace("__pd__date__pd__", TODAY) + for i, ( + dataset_path, + dataset_ext, + track_human_label, + extra_metadata, + ) in enumerate(track["trackfiles"]): + # Unsanitize labels (element_identifiers are always sanitized by Galaxy) + for key, value in mapped_chars.items(): + track_human_label = track_human_label.replace(value, key) + outputTrackConfig = { + "category": category, + } + if self.debug: + log.info( + "Processing category = %s, track_human_label = %s", + category, + track_human_label, + ) + # We add extra data to hash for the case of REST + SPARQL. + if ( + "conf" in track + and "options" in track["conf"] + and "url" in track["conf"]["options"] + ): + rest_url = track["conf"]["options"]["url"] + else: + rest_url = "" + + # I chose to use track['category'] instead of 'category' here. This + # is intentional. This way re-running the tool on a different date + # will not generate different hashes and make comparison of outputs + # much simpler. + hashData = [ + str(dataset_path), + track_human_label, + track["category"], + rest_url, + ] + hashData = "|".join(hashData).encode("utf-8") + outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i + outputTrackConfig["metadata"] = extra_metadata + outputTrackConfig["name"] = track_human_label + + if dataset_ext in ("gff", "gff3"): + self.add_gff( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("hic",): + self.add_hic( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext in ("bed",): + self.add_bed( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("maf",): + self.add_maf( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bigwig": + self.add_bigwig( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bam": + real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][ + "bam_index" + ] + if not isinstance(real_indexes, list): + # <bam_indices> + # <bam_index>/path/to/a.bam.bai</bam_index> + # </bam_indices> + # + # The above will result in the 'bam_index' key containing a + # string. If there are two or more indices, the container + # becomes a list. Fun! + real_indexes = [real_indexes] + + self.add_bam( + dataset_path, + outputTrackConfig, + track["conf"]["options"]["pileup"], + bam_index=real_indexes[i], + ) + elif dataset_ext == "blastxml": + self.add_blastxml( + dataset_path, outputTrackConfig, track["conf"]["options"]["blast"] + ) + elif dataset_ext == "vcf": + self.add_vcf(dataset_path, outputTrackConfig) + else: + log.warn("Do not know how to handle %s", dataset_ext) + + def clone_jbrowse(self, jbrowse_dir, destination, minimal=False): + """Clone a JBrowse directory into a destination directory.""" + cmd = ["jbrowse", "create", "-f", self.outdir] + self.subprocess_check_call(cmd) + for fn in [ + "asset-manifest.json", + "favicon.ico", + "robots.txt", + "umd_plugin.js", + "version.txt", + "test_data", + ]: + cmd = ["rm", "-rf", os.path.join(self.outdir, fn)] + self.subprocess_check_call(cmd) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="", epilog="") + parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration") + + parser.add_argument("--jbrowse", help="Folder containing a jbrowse release") + parser.add_argument("--outdir", help="Output directory", default="out") + parser.add_argument( + "--standalone", + choices=["complete", "minimal", "data"], + help="Standalone mode includes a copy of JBrowse", + ) + parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0") + args = parser.parse_args() + + tree = ET.parse(args.xml.name) + root = tree.getroot() + + # This should be done ASAP + GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text + # Sometimes this comes as `localhost` without a protocol + if not GALAXY_INFRASTRUCTURE_URL.startswith("http"): + # so we'll prepend `http://` and hope for the best. Requests *should* + # be GET and not POST so it should redirect OK + GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL + + jc = JbrowseConnector( + jbrowse=args.jbrowse, + outdir=args.outdir, + genomes=[ + { + "path": os.path.realpath(x.attrib["path"]), + "meta": metadata_from_node(x.find("metadata")), + } + for x in root.findall("metadata/genomes/genome") + ], + standalone=args.standalone, + ) + jc.process_genomes() + + for track in root.findall("tracks/track"): + track_conf = {} + track_conf["trackfiles"] = [] + + is_multi_bigwig = False + try: + if track.find("options/wiggle/multibigwig") and ( + track.find("options/wiggle/multibigwig").text == "True" + ): + is_multi_bigwig = True + multi_bigwig_paths = [] + except KeyError: + pass + + trackfiles = track.findall("files/trackFile") + if trackfiles: + for x in track.findall("files/trackFile"): + if is_multi_bigwig: + multi_bigwig_paths.append( + (x.attrib["label"], os.path.realpath(x.attrib["path"])) + ) + else: + if trackfiles: + metadata = metadata_from_node(x.find("metadata")) + track_conf["dataset_id"] = metadata["dataset_id"] + track_conf["trackfiles"].append( + ( + os.path.realpath(x.attrib["path"]), + x.attrib["ext"], + x.attrib["label"], + metadata, + ) + ) + else: + # For tracks without files (rest, sparql) + track_conf["trackfiles"].append( + ( + "", # N/A, no path for rest or sparql + track.attrib["format"], + track.find("options/label").text, + {}, + ) + ) + + if is_multi_bigwig: + metadata = metadata_from_node(x.find("metadata")) + + track_conf["trackfiles"].append( + ( + multi_bigwig_paths, # Passing an array of paths to represent as one track + "bigwig_multiple", + "MultiBigWig", # Giving an hardcoded name for now + {}, # No metadata for multiple bigwig + ) + ) + + track_conf["category"] = track.attrib["cat"] + track_conf["format"] = track.attrib["format"] + try: + # Only pertains to gff3 + blastxml. TODO? + track_conf["style"] = {t.tag: t.text for t in track.find("options/style")} + except TypeError: + track_conf["style"] = {} + pass + track_conf["conf"] = etree_to_dict(track.find("options")) + jc.process_annotations(track_conf) + print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist) + print( + "###done processing, trackIdlist=", + jc.trackIdlist, + "config=", + str(jc.config_json), + ) + jc.config_json["tracks"] = jc.tracksToAdd + if jc.usejson: + jc.write_config() + jc.add_default_view()