Mercurial > repos > fubar > jbrowse2
diff jbrowse2.py @ 46:4181e97c70a7 draft
planemo upload for repository https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 commit 3a43e9e0ffce0966101203102e769d1ced28618a
author | fubar |
---|---|
date | Mon, 04 Mar 2024 09:47:19 +0000 |
parents | bea0916e1080 |
children | 3e53204c2419 |
line wrap: on
line diff
--- a/jbrowse2.py Fri Mar 01 05:15:41 2024 +0000 +++ b/jbrowse2.py Mon Mar 04 09:47:19 2024 +0000 @@ -11,6 +11,7 @@ import struct import subprocess import tempfile +import urllib.request import xml.etree.ElementTree as ET from collections import defaultdict @@ -446,7 +447,10 @@ def process_genomes(self): assemblies = [] + useuri = False for i, genome_node in enumerate(self.genome_paths): + if genome_node["useuri"].strip().lower() == "yes": + useuri = True genome_name = genome_node["meta"]["dataset_dname"].strip() if len(genome_name.split()) > 1: genome_name = genome_name.split()[0] @@ -454,7 +458,9 @@ if genome_name not in self.genome_names: # ignore dupes - can have multiple pafs with same references? fapath = genome_node["path"] - assem = self.make_assembly(fapath, genome_name) + if not useuri: + fapath = os.path.realpath(fapath) + assem = self.make_assembly(fapath, genome_name, useuri) assemblies.append(assem) self.genome_names.append(genome_name) if self.genome_name is None: @@ -462,41 +468,67 @@ genome_name # first one for all tracks - other than paf ) self.genome_firstcontig = None - fl = open(fapath, "r").readline().strip().split(">") - if len(fl) > 1: - fl = fl[1] - if len(fl.split()) > 1: - self.genome_firstcontig = fl.split()[0].strip() + if not useuri: + # https://lazarus.name/jbrowse/fish/bigwig_0_coverage_bedgraph_cov_count_count_bw.bigwig + # https://lazarus.name/jbrowse/fish/klBraLanc5.haps_combined.decontam.20230620.fasta.fa.gz + fl = open(fapath, "r").readline() + fls = fl.strip().split(">") + if len(fls) > 1: + fl = fls[1] + if len(fl.split()) > 1: + self.genome_firstcontig = fl.split()[0].strip() + else: + self.genome_firstcontig = fl else: - self.genome_firstcontig = fl + fl = urllib.request.urlopen(fapath+".fai").readline() + if fl: # is first row of the text fai so the first contig name + self.genome_firstcontig = fl.decode('utf8').strip().split()[0] if self.config_json.get("assemblies", None): self.config_json["assemblies"] += assemblies 
else: self.config_json["assemblies"] = assemblies - def make_assembly(self, fapath, gname): + def make_assembly(self, fapath, gname, useuri): + if useuri: + faname = fapath + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + "locationType": "UriLocation", + }, + "faiLocation": { + "uri": faname + ".fai", + "locationType": "UriLocation", + }, + "gziLocation": { + "uri": faname + ".gzi", + "locationType": "UriLocation", + }, + } + else: + faname = gname + ".fa.gz" + fadest = os.path.realpath(os.path.join(self.outdir, faname)) + cmd = "bgzip -i -c %s -I %s.gzi > %s && samtools faidx %s" % ( + fapath, + fadest, + fadest, + fadest, + ) + self.subprocess_popen(cmd) - faname = gname + ".fa.gz" - fadest = os.path.realpath(os.path.join(self.outdir, faname)) - cmd = "bgzip -i -c %s -I %s.gzi > %s && samtools faidx %s" % ( - fapath, - fadest, - fadest, - fadest, - ) - self.subprocess_popen(cmd) - adapter = { - "type": "BgzipFastaAdapter", - "fastaLocation": { - "uri": faname, - }, - "faiLocation": { - "uri": faname + ".fai", - }, - "gziLocation": { - "uri": faname + ".gzi", - }, - } + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + }, + "faiLocation": { + "uri": faname + ".fai", + }, + "gziLocation": { + "uri": faname + ".gzi", + }, + } self.genome_sequence_adapter = adapter trackDict = { "name": gname, @@ -528,7 +560,7 @@ def write_config(self): with open(self.config_json_file, "w") as fp: - json.dump(self.config_json, fp) + json.dump(self.config_json, fp, indent=2) def text_index(self): # Index tracks @@ -567,18 +599,19 @@ # can be served - if public. 
# dsId = trackData["metadata"]["dataset_id"] # url = "%s/api/datasets/%s/display?to_ext=hic " % (self.giURL, dsId) - hname = trackData["hic_url"] - floc = { - "uri": hname, - } + useuri = trackData["useuri"].lower() == "yes" + if useuri: + uri = data + else: + uri = trackData["hic_url"] trackDict = { "type": "HicTrack", "trackId": tId, - "name": hname, + "name": uri, "assemblyNames": [self.genome_name], "adapter": { "type": "HicAdapter", - "hicLocation": floc, + "hicLocation": uri, }, "displays": [ { @@ -599,6 +632,7 @@ e.g. hg38.chr1 in the sequence identifiers. need the reference id - eg hg18, for maf2bed.pl as the first parameter """ + tId = trackData["label"] mafPlugin = { "plugins": [ { @@ -607,7 +641,7 @@ } ] } - tId = trackData["label"] + fname = "%s.bed" % tId dest = "%s/%s" % (self.outdir, fname) gname = self.genome_name @@ -744,11 +778,15 @@ "negColor": "rgb(255, 255, 51)", "constraints": {} """ - url = "%s.bigwig" % trackData["label"] - # slashes in names cause path trouble - dest = os.path.join(self.outdir, url) - cmd = ["cp", data, dest] - self.subprocess_check_call(cmd) + useuri = trackData["useuri"].lower() == "yes" + if useuri: + url = data + else: + url = "%s.bigwig" % trackData["label"] + # slashes in names cause path trouble + dest = os.path.join(self.outdir, url) + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) bwloc = {"uri": url} tId = trackData["label"] trackDict = { @@ -774,27 +812,33 @@ self.tracksToAdd.append(trackDict) self.trackIdlist.append(tId) - def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): + def add_bam(self, data, trackData, bam_index=None, **kwargs): tId = trackData["label"] - fname = "%s.bam" % trackData["label"] - dest = "%s/%s" % (self.outdir, fname) - url = fname - self.subprocess_check_call(["cp", data, dest]) - bloc = {"uri": url} - if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): - # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to 
dest - self.subprocess_check_call( - ["cp", os.path.realpath(bam_index), dest + ".bai"] - ) + useuri = trackData["useuri"].lower() == "yes" + bindex = bam_index + if useuri: + url = data else: - # Can happen in exotic condition - # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam - # => no index generated by galaxy, but there might be one next to the symlink target - # this trick allows to skip the bam sorting made by galaxy if already done outside - if os.path.exists(os.path.realpath(data) + ".bai"): - self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") - else: - log.warn("Could not find a bam index (.bai file) for %s", data) + fname = "%s.bam" % trackData["label"] + dest = "%s/%s" % (self.outdir, fname) + url = fname + bindex = fname + '.bai' + self.subprocess_check_call(["cp", data, dest]) + if bam_index is not None and os.path.exists(bam_index): + if not os.path.exists(bindex): + # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", bam_index, bindex] + ) + else: + # Can happen in exotic condition + # e.g. 
if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".bai"): + self.symlink_or_copy(os.path.realpath(data) + ".bai", bindex) + else: + log.warn("Could not find a bam index (.bai file) for %s", data) trackDict = { "type": "AlignmentsTrack", "trackId": tId, @@ -802,10 +846,10 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "BamAdapter", - "bamLocation": bloc, + "bamLocation": {"uri": url}, "index": { "location": { - "uri": fname + ".bai", + "uri": bindex, } }, }, @@ -821,27 +865,36 @@ self.tracksToAdd.append(trackDict) self.trackIdlist.append(tId) - def add_cram(self, data, trackData, cramOpts, cram_index=None, **kwargs): + def add_cram(self, data, trackData, cram_index=None, **kwargs): tId = trackData["label"] - fname = "%s.cram" % trackData["label"] - dest = "%s/%s" % (self.outdir, fname) - url = fname - self.subprocess_check_call(["cp", data, dest]) - bloc = {"uri": url} - if cram_index is not None and os.path.exists(os.path.realpath(cram_index)): - # most probably made by galaxy and stored in galaxy dirs, need to copy it to dest - self.subprocess_check_call( - ["cp", os.path.realpath(cram_index), dest + ".crai"] - ) + useuri = trackData["useuri"].lower() == "yes" + bindex = cram_index + if useuri: + url = data else: - # Can happen in exotic condition - # e.g. 
if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam - # => no index generated by galaxy, but there might be one next to the symlink target - # this trick allows to skip the bam sorting made by galaxy if already done outside - if os.path.exists(os.path.realpath(data) + ".crai"): - self.symlink_or_copy(os.path.realpath(data) + ".crai", dest + ".crai") - else: - log.warn("Could not find a cram index (.crai file) for %s", data) + fname = "%s.cram" % trackData["label"] + dest = "%s/%s" % (self.outdir, fname) + bindex = fname + '.bai' + url = fname + self.subprocess_check_call(["cp", data, dest]) + + if bindex is not None and os.path.exists(bindex): + if not os.path.exists(dest+'.crai'): + # most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", os.path.realpath(cram_index), dest + ".crai"] + ) + else: + # Can happen in exotic condition + # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".crai"): + self.symlink_or_copy( + os.path.realpath(data) + ".crai", dest + ".crai" + ) + else: + log.warn("Could not find a cram index (.crai file) for %s", data) trackDict = { "type": "AlignmentsTrack", "trackId": tId, @@ -849,9 +902,9 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "CramAdapter", - "cramLocation": bloc, + "cramLocation": {"uri": url}, "craiLocation": { - "uri": fname + ".crai", + "uri": bindex, }, "sequenceAdapter": self.genome_sequence_adapter, }, @@ -873,12 +926,17 @@ # self.giURL, # trackData["metadata"]["dataset_id"], # ) - url = "%s.vcf.gz" % tId - dest = "%s/%s" % (self.outdir, url) - cmd = "bgzip -c %s > %s" % (data, dest) - self.subprocess_popen(cmd) - cmd = ["tabix", "-f", "-p", "vcf", dest] - 
self.subprocess_check_call(cmd) + + useuri = trackData["useuri"].lower() == "yes" + if useuri: + url = data + else: + url = "%s.vcf.gz" % tId + dest = "%s/%s" % (self.outdir, url) + cmd = "bgzip -c %s > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-f", "-p", "vcf", dest] + self.subprocess_check_call(cmd) trackDict = { "type": "VariantTrack", "trackId": tId, @@ -887,7 +945,7 @@ "adapter": { "type": "VcfTabixAdapter", "vcfGzLocation": { - "uri": url, + "uri": url }, "index": { "location": { @@ -917,13 +975,13 @@ def _sort_gff(self, data, dest): # Only index if not already done - if not os.path.exists(dest + ".gz"): - cmd = "jbrowse sort-gff '%s' | bgzip -c > '%s.gz'" % ( + if not os.path.exists(dest): + cmd = "jbrowse sort-gff '%s' | bgzip -c > '%s'" % ( data, dest, ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" self.subprocess_popen(cmd) - self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) + self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest]) def _sort_bed(self, data, dest): # Only index if not already done @@ -934,10 +992,13 @@ self.subprocess_check_call(cmd) def add_gff(self, data, ext, trackData): - url = "%s.%s" % (trackData["label"], ext) - dest = "%s/%s" % (self.outdir, url) - self._sort_gff(data, dest) - url = url + ".gz" + useuri = trackData["useuri"].lower() == "yes" + if useuri: + url = trackData["path"] + else: + url = "%s.%s.gz" % (trackData["label"], ext) + dest = "%s/%s" % (self.outdir, url) + self._sort_gff(data, dest) tId = trackData["label"] trackDict = { "type": "FeatureTrack", @@ -972,11 +1033,14 @@ self.trackIdlist.append(tId) def add_bed(self, data, ext, trackData): - url = "%s.%s" % (trackData["label"], ext) - dest = "%s/%s.gz" % (self.outdir, url) - self._sort_bed(data, dest) tId = trackData["label"] - url = url + ".gz" + useuri = trackData["useuri"].lower() == "yes" + if useuri: + url = data + else: + url = "%s.%s.gz" % (trackData["label"], ext) + dest = "%s/%s" % (self.outdir, 
url) + self._sort_bed(data, dest) trackDict = { "type": "FeatureTrack", "trackId": tId, @@ -1068,19 +1132,22 @@ for i, ( dataset_path, dataset_ext, + useuri, track_human_label, extra_metadata, ) in enumerate(track["trackfiles"]): - # Unsanitize labels (element_identifiers are always sanitized by Galaxy) - for key, value in mapped_chars.items(): - track_human_label = track_human_label.replace(value, key) - track_human_label = track_human_label.replace(" ", "_") + if not dataset_path.strip().startswith("http"): + # Unsanitize labels (element_identifiers are always sanitized by Galaxy) + for key, value in mapped_chars.items(): + track_human_label = track_human_label.replace(value, key) + track_human_label = track_human_label.replace(" ", "_") outputTrackConfig = { "category": category, "style": {}, } outputTrackConfig["key"] = track_human_label + outputTrackConfig["useuri"] = useuri outputTrackConfig["trackset"] = track.get("trackset", {}) outputTrackConfig["label"] = "%s_%i_%s" % ( @@ -1139,25 +1206,17 @@ ) elif dataset_ext == "bam": real_indexes = track["conf"]["options"]["bam"]["bam_index"] - if not isinstance(real_indexes, list): - real_indexes = [real_indexes] - self.add_bam( dataset_path, outputTrackConfig, - track["conf"]["options"]["bam"], - bam_index=real_indexes[i], + bam_index=real_indexes, ) elif dataset_ext == "cram": - real_indexes = track["conf"]["options"]["cram"][ "cram_index"] - if not isinstance(real_indexes, list): - real_indexes = [real_indexes] - + real_indexes = track["conf"]["options"]["cram"]["cram_index"] self.add_cram( dataset_path, outputTrackConfig, - track["conf"]["options"]["cram"], - cram_index=real_indexes[i], + cram_index=real_indexes, ) elif dataset_ext == "blastxml": self.add_blastxml( @@ -1221,6 +1280,7 @@ "assemblyName": self.genome_name, "start": 0, "end": 100000, + "refName": "x", } if data.get("defaultLocation", ""): @@ -1307,9 +1367,9 @@ json.dump(self.config_json, config_file, indent=2) def clone_jbrowse(self): - """Clone a 
JBrowse directory into a destination directory. This also works in Biocontainer testing now """ + """Clone a JBrowse directory into a destination directory. This also works in Biocontainer testing now""" dest = self.outdir - #self.subprocess_check_call(['jbrowse', 'create', dest, '--tag', f"{JB_VER}"]) + # self.subprocess_check_call(['jbrowse', 'create', dest, '--tag', f"{JB_VER}"]) shutil.copytree(self.jbrowse2path, dest, dirs_exist_ok=True) for fn in [ "asset-manifest.json", @@ -1341,7 +1401,9 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="", epilog="") parser.add_argument("--xml", help="Track Configuration") - parser.add_argument("--jbrowse2path", help="Path to JBrowse2 directory in biocontainer or Conda") + parser.add_argument( + "--jbrowse2path", help="Path to JBrowse2 directory in biocontainer or Conda" + ) parser.add_argument("--outdir", help="Output directory", default="out") parser.add_argument("--version", "-V", action="version", version="%(prog)s 2.0.1") args = parser.parse_args() @@ -1360,7 +1422,9 @@ jbrowse2path=args.jbrowse2path, genomes=[ { - "path": os.path.realpath(x.attrib["path"]), + "path": x.attrib["path"], + "label": x.attrib["label"], + "useuri": x.attrib["useuri"], "meta": metadata_from_node(x.find("metadata")), } for x in root.findall("metadata/genomes/genome") @@ -1395,10 +1459,12 @@ trackfiles = track.findall("files/trackFile") if trackfiles: for x in track.findall("files/trackFile"): + track_conf["useuri"] = x.attrib["useuri"] if is_multi_bigwig: multi_bigwig_paths.append( ( x.attrib["label"], + x.attrib["useuri"], os.path.realpath(x.attrib["path"]), ) ) @@ -1406,14 +1472,23 @@ if trackfiles: metadata = metadata_from_node(x.find("metadata")) track_conf["dataset_id"] = metadata["dataset_id"] - track_conf["trackfiles"].append( - ( - os.path.realpath(x.attrib["path"]), + if x.attrib["useuri"].lower() == "yes": + tfa = ( + x.attrib["path"], x.attrib["ext"], + x.attrib["useuri"], x.attrib["label"], metadata, ) - 
) + else: + tfa = ( + os.path.realpath(x.attrib["path"]), + x.attrib["ext"], + x.attrib["useuri"], + x.attrib["label"], + metadata, + ) + track_conf["trackfiles"].append(tfa) if is_multi_bigwig: metadata = metadata_from_node(x.find("metadata")) @@ -1447,7 +1522,6 @@ except TypeError: track_conf["style"] = {} pass - track_conf["conf"] = etree_to_dict(track.find("options")) keys = jc.process_annotations(track_conf) if keys: