comparison jbrowse2.py @ 134:ed3a21033188 draft

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/jbrowse2 commit bb6736899ac3029f73455637a04a006fcd857fc2
author bgruening
date Sun, 20 Oct 2024 07:11:16 +0000
parents cec274db51c0
children 21bb464c1d53
comparison
equal deleted inserted replaced
133:cec274db51c0 134:ed3a21033188
11 import shutil 11 import shutil
12 import ssl 12 import ssl
13 import string 13 import string
14 import struct 14 import struct
15 import subprocess 15 import subprocess
16 import tempfile
17 import urllib.request 16 import urllib.request
18 import xml.etree.ElementTree as ET 17 import xml.etree.ElementTree as ET
19 from collections import defaultdict 18 from collections import defaultdict
20 19
21 logging.basicConfig(level=logging.DEBUG) 20 logging.basicConfig(level=logging.DEBUG)
444 if logCommands: 443 if logCommands:
445 log.debug(command) 444 log.debug(command)
446 p = subprocess.Popen( 445 p = subprocess.Popen(
447 command, 446 command,
448 cwd=self.outdir, 447 cwd=self.outdir,
449 shell=False, 448 shell=True,
450 stdin=subprocess.PIPE, 449 stdin=subprocess.PIPE,
451 stdout=subprocess.PIPE, 450 stdout=subprocess.PIPE,
452 stderr=subprocess.PIPE, 451 stderr=subprocess.PIPE,
453 ) 452 )
454 output, err = p.communicate() 453 output, err = p.communicate()
552 contig = fl.decode("utf8").strip() 551 contig = fl.decode("utf8").strip()
553 # Merlin 172788 8 60 61 552 # Merlin 172788 8 60 61
554 else: 553 else:
555 faname = gname + ".fa.gz" 554 faname = gname + ".fa.gz"
556 fadest = os.path.realpath(os.path.join(self.outdir, faname)) 555 fadest = os.path.realpath(os.path.join(self.outdir, faname))
557 cmd = ["bgzip", "-i", "-c", fapath, "-I", fadest + ".gzi"] 556 cmd = "bgzip -k -i -c -I '%s.gzi' '%s' > '%s'" % (fadest, fapath, fadest)
558 with open(fadest, "wb") as fout: 557 subprocess.run(cmd, shell=True)
559 self.subprocess_check_call(cmd, output=fout)
560 cmd = ["samtools", "faidx", fadest] 558 cmd = ["samtools", "faidx", fadest]
561 self.subprocess_check_call(cmd) 559 self.subprocess_check_call(cmd)
562 contig = open(fadest + ".fai", "r").readline().strip() 560 contig = open(fadest + ".fai", "r").readline().strip()
563 adapter = { 561 adapter = {
564 "type": "BgzipFastaAdapter", 562 "type": "BgzipFastaAdapter",
751 if self.config_json.get("plugins", None): 749 if self.config_json.get("plugins", None):
752 self.config_json["plugins"].append(mafPlugin["plugins"][0]) 750 self.config_json["plugins"].append(mafPlugin["plugins"][0])
753 else: 751 else:
754 self.config_json.update(mafPlugin) 752 self.config_json.update(mafPlugin)
755 753
756 def _blastxml_to_gff3(self, xml, min_gap=10): 754 def _sort_gff(self, data, dest):
757 gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) 755 # Only index if not already done
758 cmd = [ 756 if not os.path.exists(dest):
759 "python", 757 e = os.environ
760 os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"), 758 e["SHELL"] = "/bin/sh"
761 "--trim", 759 cmd = "jbrowse sort-gff %s | bgzip -c > %s" % (data, dest)
762 "--trim_end", 760 subprocess.run(cmd, env=e, shell=True)
763 "--include_seq", 761 self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest])
764 "--min_gap", 762
765 str(min_gap), 763 def add_gff(self, data, trackData):
766 xml, 764 tId = trackData["label"]
767 ] 765 useuri = trackData["useuri"].lower() == "yes"
768 subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) 766 if useuri:
769 gff3_unrebased.close() 767 url = trackData["path"]
770 logging.debug("### blastxml to gff3 cmd = %s" % " ".join(cmd)) 768 else:
771 return gff3_unrebased.name 769 url = tId + ".gz"
772 770 dest = os.path.join(self.outdir, url)
773 def add_blastxml(self, data, trackData, blastOpts, **kwargs): 771 self._sort_gff(data, dest)
774 gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"]) 772 categ = trackData["category"]
775 if "parent" in blastOpts and blastOpts["parent"] != "None": 773 trackDict = {
776 gff3_rebased = tempfile.NamedTemporaryFile(delete=False) 774 "type": "FeatureTrack",
777 cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")] 775 "trackId": tId,
778 if blastOpts.get("protein", "false") == "true": 776 "name": trackData["name"],
779 cmd.append("--protein2dna") 777 "assemblyNames": [trackData["assemblyNames"]],
780 cmd.extend([os.path.realpath(blastOpts["parent"]), gff3]) 778 "category": [
781 subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased) 779 categ,
782 logging.debug("### gff3rebase cmd = %s" % " ".join(cmd)) 780 ],
783 gff3_rebased.close() 781 "adapter": {
784 # Replace original gff3 file 782 "type": "Gff3TabixAdapter",
785 shutil.copy(gff3_rebased.name, gff3) 783 "gffGzLocation": {
786 os.unlink(gff3_rebased.name) 784 "uri": url,
787 self.add_gff(gff3, trackData, **kwargs) 785 },
786 "index": {
787 "location": {
788 "uri": url + ".tbi",
789 }
790 },
791 },
792 "displays": [
793 {
794 "type": "LinearBasicDisplay",
795 "displayId": "%s-LinearBasicDisplay" % tId,
796 },
797 {
798 "type": "LinearArcDisplay",
799 "displayId": "%s-LinearArcDisplay" % tId,
800 },
801 ],
802 }
803 style_json = self._prepare_track_style(trackDict)
804 trackDict["style"] = style_json
805 self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict))
806 self.trackIdlist.append(tId)
788 807
789 def add_bigwig(self, data, trackData): 808 def add_bigwig(self, data, trackData):
790 tId = trackData["label"] 809 tId = trackData["label"]
791 useuri = trackData["useuri"].lower() == "yes" 810 useuri = trackData["useuri"].lower() == "yes"
792 if useuri: 811 if useuri:
836 self.subprocess_check_call(["cp", data, dest]) 855 self.subprocess_check_call(["cp", data, dest])
837 url = fname 856 url = fname
838 bindex = fname + ".bai" 857 bindex = fname + ".bai"
839 bi = bam_indexes.split(",") 858 bi = bam_indexes.split(",")
840 bam_index = [ 859 bam_index = [
841 x.split(" ~ ")[1].strip() 860 x.split("~~~")[1].strip()
842 for x in bi 861 for x in bi
843 if " ~ " in x and x.split(" ~ ")[0].strip() == realFName 862 if "~~~" in x and x.split("~~~")[0].strip() == realFName
844 ] 863 ]
845 logging.debug( 864 logging.debug(
846 "===realFName=%s got %s as bam_indexes %s as bi, %s for bam_index" 865 "===realFName=%s got %s as bam_indexes %s as bi, %s for bam_index"
847 % (realFName, bam_indexes, bi, bam_index) 866 % (realFName, bam_indexes, bi, bam_index)
848 ) 867 )
898 dest = os.path.join(self.outdir, fname) 917 dest = os.path.join(self.outdir, fname)
899 url = fname 918 url = fname
900 self.subprocess_check_call(["cp", data, dest]) 919 self.subprocess_check_call(["cp", data, dest])
901 ci = cram_indexes.split(",") 920 ci = cram_indexes.split(",")
902 cram_index = [ 921 cram_index = [
903 x.split(" ~ ")[1].strip() 922 x.split("~~~")[1].strip()
904 for x in ci 923 for x in ci
905 if " ~ " in x and x.split(" ~ ")[0].strip() == realFName 924 if "~~~" in x and x.split("~~~")[0].strip() == realFName
906 ] 925 ]
907 logging.debug( 926 logging.debug(
908 "===realFName=%s got %s as cram_indexes %s as ci, %s for cram_index" 927 "===realFName=%s got %s as cram_indexes %s as ci, %s for cram_index"
909 % (realFName, cram_indexes, ci, cram_index) 928 % (realFName, cram_indexes, ci, cram_index)
910 ) 929 )
995 style_json = self._prepare_track_style(trackDict) 1014 style_json = self._prepare_track_style(trackDict)
996 trackDict["style"] = style_json 1015 trackDict["style"] = style_json
997 self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict)) 1016 self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict))
998 self.trackIdlist.append(tId) 1017 self.trackIdlist.append(tId)
999 1018
1000 def _sort_gff(self, data, dest):
1001 # Only index if not already done
1002 if not os.path.exists(dest):
1003 e = os.environ
1004 e['SHELL'] = '/bin/sh'
1005 cmd = ['/bin/sh', '-c', "jbrowse sort-gff %s | bgzip -c > %s" % (data, dest)]
1006 subprocess.run(cmd, env=e)
1007 self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest])
1008
1009 def _sort_bed(self, data, dest): 1019 def _sort_bed(self, data, dest):
1010 # Only index if not already done 1020 # Only index if not already done
1011 if not os.path.exists(dest): 1021 if not os.path.exists(dest):
1012 cmd = ["sort", "-k1,1", "-k2,2n", data] 1022 cmd = ["sort", "-k1,1", "-k2,2n", data]
1013 ps = subprocess.run(cmd, check=True, capture_output=True) 1023 ps = subprocess.run(cmd, check=True, capture_output=True)
1014 cmd = ["bgzip", "-c"] 1024 cmd = ["bgzip", "-c"]
1015 with open(dest, "wb") as fout: 1025 with open(dest, "wb") as fout:
1016 subprocess.run(cmd, input=ps.stdout, stdout=fout) 1026 subprocess.run(cmd, input=ps.stdout, stdout=fout)
1017 cmd = ["tabix", "-f", "-p", "bed", dest] 1027 cmd = ["tabix", "-f", "-p", "bed", dest]
1018 self.subprocess_check_call(cmd) 1028 self.subprocess_check_call(cmd)
1019
1020 def add_gff(self, data, trackData):
1021 tId = trackData["label"]
1022 useuri = trackData["useuri"].lower() == "yes"
1023 if useuri:
1024 url = trackData["path"]
1025 else:
1026 url = tId + ".gz"
1027 dest = os.path.join(self.outdir, url)
1028 self._sort_gff(data, dest)
1029 categ = trackData["category"]
1030 trackDict = {
1031 "type": "FeatureTrack",
1032 "trackId": tId,
1033 "name": trackData["name"],
1034 "assemblyNames": [trackData["assemblyNames"]],
1035 "category": [
1036 categ,
1037 ],
1038 "adapter": {
1039 "type": "Gff3TabixAdapter",
1040 "gffGzLocation": {
1041 "uri": url,
1042 },
1043 "index": {
1044 "location": {
1045 "uri": url + ".tbi",
1046 }
1047 },
1048 },
1049 "displays": [
1050 {
1051 "type": "LinearBasicDisplay",
1052 "displayId": "%s-LinearBasicDisplay" % tId,
1053 },
1054 {
1055 "type": "LinearArcDisplay",
1056 "displayId": "%s-LinearArcDisplay" % tId,
1057 },
1058 ],
1059 }
1060 style_json = self._prepare_track_style(trackDict)
1061 trackDict["style"] = style_json
1062 self.tracksToAdd[trackData["assemblyNames"]].append(copy.copy(trackDict))
1063 self.trackIdlist.append(tId)
1064 1029
1065 def add_bed(self, data, ext, trackData): 1030 def add_bed(self, data, ext, trackData):
1066 bedPlugin = {"name": "BedScorePlugin", "umdLoc": {"uri": "bedscoreplugin.js"}} 1031 bedPlugin = {"name": "BedScorePlugin", "umdLoc": {"uri": "bedscoreplugin.js"}}
1067 tId = trackData["label"] 1032 tId = trackData["label"]
1068 categ = trackData["category"] 1033 categ = trackData["category"]
1137 url = tId 1102 url = tId
1138 usePIF = False # much faster if indexed remotely or locally but broken in biocontainer. 1103 usePIF = False # much faster if indexed remotely or locally but broken in biocontainer.
1139 useuri = data.startswith("http://") or data.startswith("https://") 1104 useuri = data.startswith("http://") or data.startswith("https://")
1140 if not useuri: 1105 if not useuri:
1141 if canPIF: 1106 if canPIF:
1142 fakeName = "%s.paf" % tId 1107 fakeName = os.path.join(self.outdir, "%s.paf" % tId)
1143 url = "%s.pif.gz" % tId 1108 url = "%s.pif.gz" % tId
1144 cmd = ["cp", data, fakeName] 1109 cmd = ["cp", data, fakeName]
1145 self.subprocess_check_call(cmd) 1110 self.subprocess_check_call(cmd)
1146 cmd = [ 1111 cmd = [
1147 "jbrowse", 1112 "jbrowse",
1148 "make-pif", 1113 "make-pif",
1149 fakeName, 1114 fakeName,
1150 ] # jbrowse pif input.paf --out output.pif.gz # specify output file, creates output.pif.gz.tbi also 1115 ]
1151 self.subprocess_check_call(cmd) 1116 self.subprocess_check_call(cmd)
1152 usePIF = True 1117 usePIF = True
1153 else: 1118 else:
1154 dest = os.path.join(self.outdir, url) 1119 dest = os.path.join(self.outdir, url)
1155 self.symlink_or_copy(os.path.realpath(data), dest) 1120 self.symlink_or_copy(os.path.realpath(data), dest)
1158 if data.endswith(".pif.gz") or data.endswith(".paf.gz"): # is tabix 1123 if data.endswith(".pif.gz") or data.endswith(".paf.gz"): # is tabix
1159 usePIF = True 1124 usePIF = True
1160 categ = trackData["category"] 1125 categ = trackData["category"]
1161 pg = pafOpts["genome"].split(",") 1126 pg = pafOpts["genome"].split(",")
1162 pgc = [x.strip() for x in pg if x.strip() > ""] 1127 pgc = [x.strip() for x in pg if x.strip() > ""]
1163 gnomes = [x.split(" ~ ") for x in pgc] 1128 gnomes = [x.split("~~~") for x in pgc]
1164 logging.debug("pg=%s, gnomes=%s" % (pg, gnomes)) 1129 logging.debug("pg=%s, gnomes=%s" % (pg, gnomes))
1165 passnames = [trackData["assemblyNames"]] # always first 1130 passnames = [trackData["assemblyNames"]] # always first
1166 for i, (gpath, gname) in enumerate(gnomes): 1131 for i, (gpath, gname) in enumerate(gnomes):
1167 # may have been forgotten by user for uri 1132 # may have been forgotten by user for uri
1168 if len(gname) == 0: 1133 if len(gname) == 0:
1333 real_indexes = track["conf"]["options"]["cram"]["cram_index"] 1298 real_indexes = track["conf"]["options"]["cram"]["cram_index"]
1334 self.add_cram( 1299 self.add_cram(
1335 dataset_path, 1300 dataset_path,
1336 outputTrackConfig, 1301 outputTrackConfig,
1337 cram_indexes=real_indexes, 1302 cram_indexes=real_indexes,
1338 )
1339 elif dataset_ext == "blastxml":
1340 self.add_blastxml(
1341 dataset_path,
1342 outputTrackConfig,
1343 track["conf"]["options"]["blast"],
1344 ) 1303 )
1345 elif dataset_ext == "vcf": 1304 elif dataset_ext == "vcf":
1346 self.add_vcf(dataset_path, outputTrackConfig) 1305 self.add_vcf(dataset_path, outputTrackConfig)
1347 elif dataset_ext == "paf": 1306 elif dataset_ext == "paf":
1348 self.add_paf( 1307 self.add_paf(