comparison jbrowse2/jbrowse2_json.py @ 6:88b9b105c09b draft

Uploaded
author fubar
date Fri, 05 Jan 2024 01:58:02 +0000
parents
children
comparison
equal deleted inserted replaced
5:42ca8804cd93 6:88b9b105c09b
1 #!/usr/bin/env python
2 # change to accumulating all configuration for config.json based on the default from the clone
3 import argparse
4 import datetime
5 import hashlib
6 import json
7 import logging
8 import os
9 import shutil
10 import subprocess
11 import tempfile
12 import xml.etree.ElementTree as ET
13 from collections import defaultdict
14
# Module-wide logging; `log` is used by every method below.
logging.basicConfig(level=logging.INFO)
log = logging.getLogger("jbrowse")
# Run date, substituted into track categories via "__pd__date__pd__".
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
# Filled in from the job XML in __main__ before any JbrowseConnector exists.
GALAXY_INFRASTRUCTURE_URL = None
# Reverse map of Galaxy's label sanitization: original char -> sanitized token.
# process_annotations() replaces each token with its key to un-sanitize labels.
mapped_chars = {
    ">": "__gt__",
    "<": "__lt__",
    "'": "__sq__",
    '"': "__dq__",
    "[": "__ob__",
    "]": "__cb__",
    "{": "__oc__",
    "}": "__cc__",
    "@": "__at__",
    "#": "__pd__",
    # NOTE(review): empty-string key looks odd -- Galaxy maps newline to
    # "__cn__", so this effectively deletes "__cn__" tokens. Confirm intent.
    "": "__cn__",
}
32
33
def etree_to_dict(t):
    """Convert an ElementTree element into a nested dict.

    Child tags become keys (repeated tags collapse into lists), attributes
    are stored under '@'-prefixed keys, and text is kept under '#text' when
    it coexists with children/attributes. Returns {} for a None element.
    """
    if t is None:
        return {}

    node = {t.tag: {} if t.attrib else None}
    kids = list(t)
    if kids:
        grouped = defaultdict(list)
        for child_dict in (etree_to_dict(c) for c in kids):
            for key, val in child_dict.items():
                grouped[key].append(val)
        node = {
            t.tag: {
                key: vals[0] if len(vals) == 1 else vals
                for key, vals in grouped.items()
            }
        }
    if t.attrib:
        node[t.tag].update(("@" + name, val) for name, val in t.attrib.items())
    if t.text:
        stripped = t.text.strip()
        if kids or t.attrib:
            if stripped:
                node[t.tag]["#text"] = stripped
        else:
            node[t.tag] = stripped
    return node
56
57
# Directory containing this script; used to locate bundled helper scripts.
INSTALLED_TO = os.path.dirname(os.path.realpath(__file__))
59
60
def metadata_from_node(node):
    """Flatten a <metadata> XML node into a single dict.

    Attributes of the child <dataset>, <history>, <metadata> and <tool>
    elements are copied with a matching "<tag>_" prefix, then selected
    values are rewritten as HTML links. Returns {} unless the node holds
    exactly one <dataset> child (or when the node is unusable at all).
    """
    metadata = {}
    try:
        # Exactly one <dataset> child is required; anything else is a no-op.
        if len(node.findall("dataset")) != 1:
            return metadata
    except Exception:
        return {}

    # Copy every attribute of each section, prefixed with the section tag.
    for tag in ("dataset", "history", "metadata", "tool"):
        for key, value in node.findall(tag)[0].attrib.items():
            metadata["%s_%s" % (tag, key)] = value

    # Additional mappings: rewrite selected values as HTML links.
    metadata[
        "dataset_edam_format"
    ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
        metadata["dataset_edam_format"], metadata["dataset_file_ext"]
    )
    metadata["history_user_email"] = '<a href="mailto:{0}">{0}</a>'.format(
        metadata["history_user_email"]
    )
    metadata["hist_name"] = metadata["history_display_name"]
    metadata[
        "history_display_name"
    ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_hist_id=metadata["history_id"],
        hist_name=metadata["history_display_name"],
    )
    metadata[
        "tool_tool"
    ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
        galaxy=GALAXY_INFRASTRUCTURE_URL,
        encoded_id=metadata["dataset_id"],
        tool_id=metadata["tool_tool_id"],
    )
    return metadata
108
109
class JbrowseConnector(object):
    """Builds a JBrowse2 instance directory from Galaxy datasets."""

    def __init__(self, jbrowse, outdir, genomes, standalone=None):
        # jbrowse: folder containing a JBrowse release (passed to clone_jbrowse).
        # outdir: output directory; created here if missing.
        # genomes: list of {"path": ..., "meta": ...} dicts from the job XML.
        # standalone: "complete", "minimal" or "data" (see __main__ argparse).
        self.debug = False
        self.giURL = GALAXY_INFRASTRUCTURE_URL
        self.jbrowse = jbrowse
        self.outdir = outdir
        os.makedirs(self.outdir, exist_ok=True)
        self.genome_paths = genomes
        self.standalone = standalone
        # Accumulators for track configuration (see config.json comment at top).
        self.trackIdlist = []
        self.tracksToAdd = []
        self.config_json = {}
        self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json"))
        if standalone == "complete":
            self.clone_jbrowse(self.jbrowse, self.outdir)
        elif standalone == "minimal":
            self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True)
127
128 def subprocess_check_call(self, command, output=None):
129 if output:
130 if self.debug:
131 log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output)
132 subprocess.check_call(command, cwd=self.outdir, stdout=output)
133 else:
134 log.debug("cd %s && %s", self.outdir, " ".join(command))
135 subprocess.check_call(command, cwd=self.outdir)
136
137 def subprocess_popen(self, command):
138 if self.debug:
139 log.debug("cd %s && %s", self.outdir, command)
140 p = subprocess.Popen(
141 command,
142 shell=True,
143 stdin=subprocess.PIPE,
144 stdout=subprocess.PIPE,
145 stderr=subprocess.PIPE,
146 )
147 output, err = p.communicate()
148 retcode = p.returncode
149 if retcode != 0:
150 log.error("cd %s && %s", self.outdir, command)
151 log.error(output)
152 log.error(err)
153 raise RuntimeError("Command failed with exit code %s" % (retcode))
154
155 def subprocess_check_output(self, command):
156 if self.debug:
157 log.debug("cd %s && %s", self.outdir, " ".join(command))
158 return subprocess.check_output(command, cwd=self.outdir)
159
160 def _jbrowse_bin(self, command):
161 return os.path.realpath(os.path.join(self.jbrowse, "bin", command))
162
163 def symlink_or_copy(self, src, dest):
164 if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool(
165 os.environ["GALAXY_JBROWSE_SYMLINKS"]
166 ):
167 cmd = ["ln", "-s", src, dest]
168 else:
169 cmd = ["cp", src, dest]
170
171 return self.subprocess_check_call(cmd)
172
173 def _add_track(self, track_data):
174 if len(track_data) == 0:
175 return
176 cmd = [
177 "jbrowse",
178 "add-track",
179 track_data["path"],
180 "-t",
181 track_data["type"],
182 "-n",
183 track_data["name"],
184 "-l",
185 "move",
186 "--trackId",
187 track_data["label"],
188 "--target",
189 self.outdir,
190 ]
191 if track_data.get("indexfile"):
192 cmd += ["--indexFile", track_data["indexfile"]]
193 if track_data.get("category"):
194 for c in track_data["category"]:
195 cmd += ["--category", c]
196
    def process_genomes(self):
        """bgzip/faidx-index each input genome and add it as a JBrowse assembly."""
        assemblies = []
        for i, genome_node in enumerate(self.genome_paths):
            log.info("genome_node=%s" % str(genome_node))
            # We only expect one input genome per run. This for loop is just
            # easier to write than the alternative / catches any possible
            # issues.
            genome_name = genome_node["meta"]["dataset_dname"]
            dsId = genome_node["meta"]["dataset_id"]
            fapath = genome_node["path"]
            faname = genome_name + ".fa.gz"
            faind = os.path.realpath(os.path.join(self.outdir, faname + ".gzi"))
            # NOTE(review): "True or" makes the local-copy branch
            # unconditional; the URL-based else branch below is dead code.
            if True or self.standalone == "complete":
                fadest = os.path.realpath(os.path.join(self.outdir, faname))
                # bgzip with index (-i) so JBrowse can random-access the FASTA.
                cmd = "bgzip -i -c %s > %s && samtools faidx %s" % (
                    fapath,
                    fadest,
                    fadest,
                )
                self.subprocess_popen(cmd)
                adapter = {
                    "type": "BgzipFastaAdapter",
                    "fastaLocation": {
                        "uri": faname,
                    },
                    "faiLocation": {
                        "uri": faname + ".fai",
                    },
                    "gziLocation": {
                        "uri": faname + ".gzi",
                    },
                }
            else:
                # Dead branch (see note above): serve the FASTA from Galaxy.
                faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId)
                fastalocation = {
                    "uri": faurl,
                }
                failocation = {
                    "uri": faname + ".fai",
                }
                adapter = {
                    "type": "IndexedFastaAdapter",
                    "fastaLocation": fastalocation,
                    "faiLocation": failocation,
                }

            cmd = ["samtools", "faidx", fapath, "--fai-idx", faind]
            self.subprocess_check_call(cmd)
            trackDict = {
                "name": genome_name,
                "sequence": {
                    "type": "ReferenceSequenceTrack",
                    "trackId": genome_name,
                    "adapter": adapter,
                },
                "rendering": {"type": "DivSequenceRenderer"},
            }
            assemblies.append(trackDict)
            # self.config_json["assemblies"] = assemblies
            # Remember the genome name; subsequent add_* tracks attach to it.
            self.genome_name = genome_name
            cmd = [
                "jbrowse",
                "add-assembly",
                faname,
                "-t",
                "bgzipFasta",
                "-n",
                genome_name,
                "--load",
                "inPlace",
                "--faiLocation",
                faname + ".fai",
                "--gziLocation",
                faname + ".gzi",
                "--target",
                self.outdir,
            ]
            self.subprocess_check_call(cmd)
275
    def add_default_view(self):
        """Create a default LinearGenomeView session via the JBrowse CLI."""
        cmd = [
            "jbrowse",
            "set-default-session",
            "-s",
            self.config_json_file,
            "-t",
            ",".join(self.trackIdlist),
            "-n",
            "JBrowse2 in Galaxy",
            "--target",
            self.config_json_file,
            "-v",
            " LinearGenomeView",
        ]
        # NOTE(review): --target is given the config.json path rather than
        # the instance directory, and the view name carries a leading space
        # -- confirm both against `jbrowse set-default-session --help`.
        if True or self.debug:
            # "True or" forces this log line regardless of the debug flag.
            log.info("### calling set-default-session with cmd=%s" % " ".join(cmd))
        self.subprocess_check_call(cmd)
294
295 def write_config(self):
296 with open(self.config_json_file, "w") as fp:
297 json.dump(self.config_json, fp)
298
    def add_hic(self, data, trackData):
        """
        HiC adapter.
        https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md
        for testing locally, these work:
        HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic
        using hg19 reference track as a
        'BgzipFastaAdapter'
        fastaLocation:
        uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz',
        faiLocation:
        uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai',
        gziLocation:
        uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi',
        Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438
        """
        log.info("#### trackData=%s" % trackData)
        tId = trackData["label"]
        dsId = trackData["metadata"]["dataset_id"]
        # NOTE(review): this url (with a trailing space) is overwritten in
        # both branches below before use.
        url = "%s/api/datasets/%s/display?to_ext=hic " % (
            self.giURL,
            dsId,
        )
        hname = trackData["name"]
        # NOTE(review): "True or" makes the copy branch unconditional; the
        # Galaxy-URL branch below is dead code.
        if True or self.standalone == "complete":
            dest = os.path.realpath(os.path.join(self.outdir, hname))
            url = hname
            cmd = ["cp", data, dest]
            self.subprocess_check_call(cmd)
            floc = {
                "uri": hname,
            }
        else:
            url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId)
            floc = {
                "uri": url,
            }
        trackDict = {
            "type": "HicTrack",
            "trackId": tId,
            "name": hname,
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "HicAdapter",
                "hicLocation": floc,
            },
        }
        # trackDict is built but currently unused; the CLI call below
        # registers the track instead (so trackIdlist stays empty here).
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "HicTrack",
            "-a",
            self.genome_name,
            "-n",
            hname,
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
364
365 def add_maf(self, data, trackData):
366 """
367 from https://github.com/cmdcolin/maf2bed
368 Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name
369 e.g. hg38.chr1 in the sequence identifiers.
370 need the reference id - eg hg18, for maf2bed.pl as the first parameter
371 """
372 mafPlugin = {
373 "plugins": [
374 {
375 "name": "MafViewer",
376 "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
377 }
378 ]
379 }
380 tId = trackData["label"]
381 fname = "%s.bed" % tId
382 dest = os.path.realpath("%s/%s" % (self.outdir, fname))
383 # self.symlink_or_copy(data, dest)
384 # Process MAF to bed-like. Need build to munge chromosomes
385 gname = self.genome_name
386 cmd = [
387 "bash",
388 os.path.join(INSTALLED_TO, "convertMAF.sh"),
389 data,
390 gname,
391 INSTALLED_TO,
392 dest,
393 ]
394 self.subprocess_check_call(cmd)
395 if True or self.debug:
396 log.info("### convertMAF.sh called as %s" % " ".join(cmd))
397 # Construct samples list
398 # We could get this from galaxy metadata, not sure how easily.
399 ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
400 output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
401 ps.wait()
402 outp = output.decode("ascii")
403 soutp = outp.split("\n")
404 samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")]
405 samples = [x.split(".")[0] for x in samp]
406 if self.debug:
407 log.info("### got samples = %s " % (samples))
408 trackDict = {
409 "type": "MafTrack",
410 "trackId": tId,
411 "name": trackData["name"],
412 "adapter": {
413 "type": "MafTabixAdapter",
414 "samples": samples,
415 "bedGzLocation": {
416 "uri": fname + ".sorted.bed.gz",
417 },
418 "index": {
419 "location": {
420 "uri": fname + ".sorted.bed.gz.tbi",
421 },
422 },
423 },
424 "assemblyNames": [self.genome_name],
425 }
426 self.tracksToAdd.append(trackDict)
427 self.trackIdlist.append(tId)
428 if self.config_json.get("plugins", None):
429 self.config_json["plugins"].append(mafPlugin[0])
430 else:
431 self.config_json.update(mafPlugin)
432
433 def _blastxml_to_gff3(self, xml, min_gap=10):
434 gff3_unrebased = tempfile.NamedTemporaryFile(delete=False)
435 cmd = [
436 "python",
437 os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"),
438 "--trim",
439 "--trim_end",
440 "--include_seq",
441 "--min_gap",
442 str(min_gap),
443 xml,
444 ]
445 subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased)
446 gff3_unrebased.close()
447 return gff3_unrebased.name
448
    def add_blastxml(self, data, trackData, blastOpts, **kwargs):
        """Convert BLAST XML to GFF3 and register it as a FeatureTrack.

        blastOpts["min_gap"] sets the gap-merging threshold; when
        blastOpts["parent"] names a parent feature file, coordinates are
        rebased onto it (with --protein2dna if blastOpts["protein"] == "true").
        """
        gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"])

        if "parent" in blastOpts and blastOpts["parent"] != "None":
            gff3_rebased = tempfile.NamedTemporaryFile(delete=False)
            cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")]
            if blastOpts.get("protein", "false") == "true":
                cmd.append("--protein2dna")
            cmd.extend([os.path.realpath(blastOpts["parent"]), gff3])
            subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased)
            gff3_rebased.close()

            # Replace original gff3 file
            shutil.copy(gff3_rebased.name, gff3)
            os.unlink(gff3_rebased.name)
        url = "%s.gff3" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(gff3, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "FeatureTrack",
            "-a",
            self.genome_name,
            "--indexFile",
            url + ".tbi",
            "-n",
            trackData["name"],
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
        os.unlink(gff3)
514
    def add_bigwig(self, data, trackData):
        """Copy a bigWig into outdir and register it as a QuantitativeTrack."""
        url = "%s.bw" % trackData["name"]
        # NOTE(review): "True or" makes the copy branch unconditional; the
        # Galaxy-URL branch below is dead (and its to_ext=fasta looks wrong
        # for a bigwig -- confirm if ever revived).
        if True or self.standalone == "complete":
            dest = os.path.realpath(os.path.join(self.outdir, url))
            cmd = ["cp", data, dest]
            self.subprocess_check_call(cmd)
            bwloc = {"uri": url}
        else:
            dsId = trackData["metadata"]["dataset_id"]
            url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId)
            bwloc = {"uri": url}
        tId = trackData["label"]
        trackDict = {
            "type": "QuantitativeTrack",
            "trackId": tId,
            # NOTE(review): the file name (url), not trackData["name"], is
            # used as the display name here -- confirm intent.
            "name": url,
            "assemblyNames": [
                self.genome_name,
            ],
            "adapter": {
                "type": "BigWigAdapter",
                "bigWigLocation": bwloc,
            },
            "displays": [
                {
                    "type": "LinearWiggleDisplay",
                    "displayId": "%s-LinearWiggleDisplay" % tId,
                }
            ],
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "QuantitativeTrack",
            "-a",
            self.genome_name,
            "-n",
            trackData["name"],
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
563
    def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs):
        """Copy a BAM (plus .bai index) into outdir and register an AlignmentsTrack.

        bamOpts is currently unused; bam_index, when given and present on
        disk, is copied next to the BAM as <label>.bam.bai.
        """
        tId = trackData["label"]
        fname = "%s.bam" % trackData["label"]
        dest = os.path.realpath("%s/%s" % (self.outdir, fname))
        # NOTE(review): "True or" makes the copy branch unconditional.
        if True or self.standalone == "complete":
            url = fname
            self.subprocess_check_call(["cp", data, dest])
            log.info("### copied %s to %s" % (data, dest))
            bloc = {"uri": url}
        else:
            dsId = trackData["metadata"]["dataset_id"]
            url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId)
            bloc = {"uri": url}
        if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
            # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
            self.subprocess_check_call(
                ["cp", os.path.realpath(bam_index), dest + ".bai"]
            )
        else:
            # Can happen in exotic condition
            # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam
            # => no index generated by galaxy, but there might be one next to the symlink target
            # this trick allows to skip the bam sorting made by galaxy if already done outside
            if os.path.exists(os.path.realpath(data) + ".bai"):
                self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
            else:
                # NOTE(review): log.warn is deprecated; log.warning preferred.
                log.warn("Could not find a bam index (.bai file) for %s", data)
        trackDict = {
            "type": "AlignmentsTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BamAdapter",
                "bamLocation": bloc,
                "index": {
                    "location": {
                        "uri": fname + ".bai",
                    }
                },
            },
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            fname,
            "-t",
            "AlignmentsTrack",
            # NOTE(review): "-l" here vs "--load" in the sibling methods --
            # confirm "-l inPlace" is accepted by this CLI version.
            "-l",
            "inPlace",
            "-a",
            self.genome_name,
            "--indexFile",
            fname + ".bai",
            "-n",
            trackData["name"],
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
626
    def add_vcf(self, data, trackData):
        """bgzip + tabix a VCF into outdir and register it as a VariantTrack."""
        tId = trackData["label"]
        # NOTE(review): this Galaxy URL is computed and then immediately
        # overwritten by the local file name below -- dead assignment.
        url = "%s/api/datasets/%s/display" % (
            self.giURL,
            trackData["metadata"]["dataset_id"],
        )
        url = "%s.vcf.gz" % tId
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        cmd = "bgzip -c %s > %s" % (data, dest)
        self.subprocess_popen(cmd)
        cmd = ["tabix", "-p", "vcf", dest]
        self.subprocess_check_call(cmd)
        trackDict = {
            "type": "VariantTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "VcfTabixAdapter",
                "vcfGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearVariantDisplay",
                    "displayId": "%s-LinearVariantDisplay" % tId,
                },
                {
                    "type": "ChordVariantDisplay",
                    "displayId": "%s-ChordVariantDisplay" % tId,
                },
                {
                    "type": "LinearPairedArcDisplay",
                    "displayId": "%s-LinearPairedArcDisplay" % tId,
                },
            ],
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "VariantTrack",
            "-a",
            self.genome_name,
            "--indexFile",
            url + ".tbi",
            "-n",
            trackData["name"],
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
690
    def _sort_gff(self, data, dest):
        """Sort, bgzip and tabix a GFF into dest.gz (skipped when it exists)."""
        # Only index if not already done
        if not os.path.exists(dest + ".gz"):
            cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % (
                data,
                dest,
            )  # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
            self.subprocess_popen(cmd)
            self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])
700
    def _sort_bed(self, data, dest):
        """Coordinate-sort, bgzip and tabix a BED into *dest* (skipped if present)."""
        # Only index if not already done
        if not os.path.exists(dest):
            cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest)
            self.subprocess_popen(cmd)
            cmd = ["tabix", "-f", "-p", "bed", dest]
            self.subprocess_check_call(cmd)
708
    def add_gff(self, data, ext, trackData):
        """Sort/bgzip/tabix a GFF file and register it as a FeatureTrack."""
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s" % (self.outdir, url))
        self._sort_gff(data, dest)
        url = url + ".gz"
        tId = trackData["label"]
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "Gff3TabixAdapter",
                "gffGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "FeatureTrack",
            "-a",
            self.genome_name,
            "-n",
            trackData["name"],
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
757
    def add_bed(self, data, ext, trackData):
        """Sort/bgzip/tabix a BED file and register it as a FeatureTrack."""
        url = "%s.%s" % (trackData["label"], ext)
        dest = os.path.realpath("%s/%s.gz" % (self.outdir, url))
        self._sort_bed(data, dest)
        tId = trackData["label"]
        url = url + ".gz"
        trackDict = {
            "type": "FeatureTrack",
            "trackId": tId,
            "name": trackData["name"],
            "assemblyNames": [self.genome_name],
            "adapter": {
                "type": "BedTabixAdapter",
                "bedGzLocation": {
                    "uri": url,
                },
                "index": {
                    "location": {
                        "uri": url + ".tbi",
                    }
                },
            },
            "displays": [
                {
                    "type": "LinearBasicDisplay",
                    "displayId": "%s-LinearBasicDisplay" % tId,
                },
                {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId},
            ],
        }
        # trackDict is built but unused; the CLI call below registers the track.
        # self.tracksToAdd.append(trackDict)
        # self.trackIdlist.append(tId)
        cmd = [
            "jbrowse",
            "add-track",
            url,
            "-t",
            "FeatureTrack",
            "-a",
            self.genome_name,
            "--indexFile",
            url + ".tbi",
            "-n",
            trackData["name"],
            "--load",
            "inPlace",
            "--target",
            self.outdir,
        ]
        self.subprocess_check_call(cmd)
808
    def process_annotations(self, track):
        """Dispatch one track config to the matching add_* handler.

        track["trackfiles"] is a list of (path, ext, human_label,
        extra_metadata) tuples; the handler is chosen on ext.
        """
        category = track["category"].replace("__pd__date__pd__", TODAY)
        for i, (
            dataset_path,
            dataset_ext,
            track_human_label,
            extra_metadata,
        ) in enumerate(track["trackfiles"]):
            # Unsanitize labels (element_identifiers are always sanitized by Galaxy)
            for key, value in mapped_chars.items():
                track_human_label = track_human_label.replace(value, key)
            outputTrackConfig = {
                "category": category,
            }
            if self.debug:
                log.info(
                    "Processing category = %s, track_human_label = %s",
                    category,
                    track_human_label,
                )
            # We add extra data to hash for the case of REST + SPARQL.
            if (
                "conf" in track
                and "options" in track["conf"]
                and "url" in track["conf"]["options"]
            ):
                rest_url = track["conf"]["options"]["url"]
            else:
                rest_url = ""

            # I chose to use track['category'] instead of 'category' here. This
            # is intentional. This way re-running the tool on a different date
            # will not generate different hashes and make comparison of outputs
            # much simpler.
            hashData = [
                str(dataset_path),
                track_human_label,
                track["category"],
                rest_url,
            ]
            hashData = "|".join(hashData).encode("utf-8")
            # Stable content-derived label; md5 is fine here (not security).
            outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
            outputTrackConfig["metadata"] = extra_metadata
            outputTrackConfig["name"] = track_human_label

            if dataset_ext in ("gff", "gff3"):
                self.add_gff(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("hic",):
                self.add_hic(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext in ("bed",):
                self.add_bed(
                    dataset_path,
                    dataset_ext,
                    outputTrackConfig,
                )
            elif dataset_ext in ("maf",):
                self.add_maf(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bigwig":
                self.add_bigwig(
                    dataset_path,
                    outputTrackConfig,
                )
            elif dataset_ext == "bam":
                real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
                    "bam_index"
                ]
                if not isinstance(real_indexes, list):
                    # <bam_indices>
                    # <bam_index>/path/to/a.bam.bai</bam_index>
                    # </bam_indices>
                    #
                    # The above will result in the 'bam_index' key containing a
                    # string. If there are two or more indices, the container
                    # becomes a list. Fun!
                    real_indexes = [real_indexes]

                self.add_bam(
                    dataset_path,
                    outputTrackConfig,
                    track["conf"]["options"]["pileup"],
                    bam_index=real_indexes[i],
                )
            elif dataset_ext == "blastxml":
                self.add_blastxml(
                    dataset_path, outputTrackConfig, track["conf"]["options"]["blast"]
                )
            elif dataset_ext == "vcf":
                self.add_vcf(dataset_path, outputTrackConfig)
            else:
                # NOTE(review): log.warn is deprecated in favour of log.warning.
                log.warn("Do not know how to handle %s", dataset_ext)
909
    def clone_jbrowse(self, jbrowse_dir, destination, minimal=False):
        """Clone a JBrowse directory into a destination directory."""
        # NOTE(review): jbrowse_dir, destination and minimal are all ignored;
        # `jbrowse create` always writes into self.outdir -- confirm intent.
        cmd = ["jbrowse", "create", "-f", self.outdir]
        self.subprocess_check_call(cmd)
        # Trim files not needed to serve the instance.
        for fn in [
            "asset-manifest.json",
            "favicon.ico",
            "robots.txt",
            "umd_plugin.js",
            "version.txt",
            "test_data",
        ]:
            cmd = ["rm", "-rf", os.path.join(self.outdir, fn)]
            self.subprocess_check_call(cmd)
924
925
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="", epilog="")
    parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration")

    parser.add_argument("--jbrowse", help="Folder containing a jbrowse release")
    parser.add_argument("--outdir", help="Output directory", default="out")
    parser.add_argument(
        "--standalone",
        choices=["complete", "minimal", "data"],
        help="Standalone mode includes a copy of JBrowse",
    )
    parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0")
    args = parser.parse_args()

    tree = ET.parse(args.xml.name)
    root = tree.getroot()

    # This should be done ASAP
    GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text
    # Sometimes this comes as `localhost` without a protocol
    if not GALAXY_INFRASTRUCTURE_URL.startswith("http"):
        # so we'll prepend `http://` and hope for the best. Requests *should*
        # be GET and not POST so it should redirect OK
        GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL

    jc = JbrowseConnector(
        jbrowse=args.jbrowse,
        outdir=args.outdir,
        genomes=[
            {
                "path": os.path.realpath(x.attrib["path"]),
                "meta": metadata_from_node(x.find("metadata")),
            }
            for x in root.findall("metadata/genomes/genome")
        ],
        standalone=args.standalone,
    )
    jc.process_genomes()

    for track in root.findall("tracks/track"):
        track_conf = {}
        track_conf["trackfiles"] = []

        is_multi_bigwig = False
        try:
            # NOTE(review): find() returns None for a missing element, so the
            # KeyError guard below can never fire from these lookups.
            if track.find("options/wiggle/multibigwig") and (
                track.find("options/wiggle/multibigwig").text == "True"
            ):
                is_multi_bigwig = True
                multi_bigwig_paths = []
        except KeyError:
            pass

        trackfiles = track.findall("files/trackFile")
        if trackfiles:
            for x in track.findall("files/trackFile"):
                if is_multi_bigwig:
                    multi_bigwig_paths.append(
                        (x.attrib["label"], os.path.realpath(x.attrib["path"]))
                    )
                else:
                    if trackfiles:
                        metadata = metadata_from_node(x.find("metadata"))
                        track_conf["dataset_id"] = metadata["dataset_id"]
                        track_conf["trackfiles"].append(
                            (
                                os.path.realpath(x.attrib["path"]),
                                x.attrib["ext"],
                                x.attrib["label"],
                                metadata,
                            )
                        )
        else:
            # For tracks without files (rest, sparql)
            track_conf["trackfiles"].append(
                (
                    "",  # N/A, no path for rest or sparql
                    track.attrib["format"],
                    track.find("options/label").text,
                    {},
                )
            )

        if is_multi_bigwig:
            # NOTE(review): `x` is the loop variable from the trackFile loop
            # above -- NameError if a multibigwig track has no trackFiles.
            metadata = metadata_from_node(x.find("metadata"))

            track_conf["trackfiles"].append(
                (
                    multi_bigwig_paths,  # Passing an array of paths to represent as one track
                    "bigwig_multiple",
                    "MultiBigWig",  # Giving an hardcoded name for now
                    {},  # No metadata for multiple bigwig
                )
            )

        track_conf["category"] = track.attrib["cat"]
        track_conf["format"] = track.attrib["format"]
        try:
            # Only pertains to gff3 + blastxml. TODO?
            track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
        except TypeError:
            # find() returned None (no <style> block); fall back to empty style.
            track_conf["style"] = {}
            pass
        track_conf["conf"] = etree_to_dict(track.find("options"))
        jc.process_annotations(track_conf)
        print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist)
    print(
        "###done processing, trackIdlist=",
        jc.trackIdlist,
        "config=",
        str(jc.config_json),
    )
    jc.config_json["tracks"] = jc.tracksToAdd
    # jc.write_config()
    jc.add_default_view()