# HG changeset patch # User fubar # Date 1704419882 0 # Node ID 88b9b105c09bbc33b0fe13eed2ed0eb94160d97f # Parent 42ca8804cd93c311461fda28c47b377dfe1ed075 Uploaded diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/blastxml_to_gapped_gff3.py --- a/jbrowse2/blastxml_to_gapped_gff3.py Thu Jan 04 02:18:18 2024 +0000 +++ b/jbrowse2/blastxml_to_gapped_gff3.py Fri Jan 05 01:58:02 2024 +0000 @@ -6,8 +6,9 @@ import sys from BCBio import GFF + logging.basicConfig(level=logging.INFO) -log = logging.getLogger(name='blastxml2gff3') +log = logging.getLogger(name="blastxml2gff3") __doc__ = """ BlastXML files, when transformed to GFF3, do not normally show gaps in the @@ -25,41 +26,53 @@ for idx_record, record in enumerate(blast_records): # http://www.sequenceontology.org/browser/release_2.4/term/SO:0000343 match_type = { # Currently we can only handle BLASTN, BLASTP - 'BLASTN': 'nucleotide_match', - 'BLASTP': 'protein_match', - }.get(record.application, 'match') + "BLASTN": "nucleotide_match", + "BLASTP": "protein_match", + }.get(record.application, "match") recid = record.query - if ' ' in recid: - recid = recid[0:recid.index(' ')] + if " " in recid: + recid = recid[0 : recid.index(" ")] rec = SeqRecord(Seq("ACTG"), id=recid) for idx_hit, hit in enumerate(record.alignments): for idx_hsp, hsp in enumerate(hit.hsps): qualifiers = { - "ID": 'b2g.%s.%s.%s' % (idx_record, idx_hit, idx_hsp), + "ID": "b2g.%s.%s.%s" % (idx_record, idx_hit, idx_hsp), "source": "blast", "score": hsp.expect, "accession": hit.accession, "hit_id": hit.hit_id, "length": hit.length, - "hit_titles": hit.title.split(' >'), + "hit_titles": hit.title.split(" >"), } if include_seq: - qualifiers.update({ - 'blast_qseq': hsp.query, - 'blast_sseq': hsp.sbjct, - 'blast_mseq': hsp.match, - }) + qualifiers.update( + { + "blast_qseq": hsp.query, + "blast_sseq": hsp.sbjct, + "blast_mseq": hsp.match, + } + ) - for prop in ('score', 'bits', 'identities', 'positives', - 'gaps', 'align_length', 'strand', 'frame', - 'query_start', 'query_end', 'sbjct_start', - 'sbjct_end'): - qualifiers['blast_' + prop] = getattr(hsp, prop, None) + for prop in ( + "score", + "bits", + "identities", + "positives", + "gaps", + "align_length", + "strand", + "frame", + "query_start", + "query_end", + "sbjct_start", + "sbjct_end", + ): + qualifiers["blast_" + prop] = getattr(hsp, prop, None) - desc = hit.title.split(' >')[0] - qualifiers['description'] = desc[desc.index(' '):] + desc = hit.title.split(" >")[0] + qualifiers["description"] = desc[desc.index(" ") :] # This required a fair bit of sketching out/match to figure out # the first time. @@ -71,7 +84,7 @@ # may be longer than the parent feature, so we use the supplied # subject/hit length to calculate the real ending of the target # protein. - parent_match_end = hsp.query_start + hit.length + hsp.query.count('-') + parent_match_end = hsp.query_start + hit.length + hsp.query.count("-") # If we trim the left end, we need to trim without losing information. used_parent_match_start = parent_match_start @@ -87,7 +100,7 @@ top_feature = SeqFeature( SimpleLocation(used_parent_match_start, parent_match_end, strand=0), type=match_type, - qualifiers=qualifiers + qualifiers=qualifiers, ) # Unlike the parent feature, ``match_part``s have sources. @@ -95,12 +108,13 @@ "source": "blast", } top_feature.sub_features = [] - for idx_part, (start, end, cigar) in \ - enumerate(generate_parts(hsp.query, hsp.match, - hsp.sbjct, - ignore_under=min_gap)): - part_qualifiers['Gap'] = cigar - part_qualifiers['ID'] = qualifiers['ID'] + ('.%s' % idx_part) + for idx_part, (start, end, cigar) in enumerate( + generate_parts( + hsp.query, hsp.match, hsp.sbjct, ignore_under=min_gap + ) + ): + part_qualifiers["Gap"] = cigar + part_qualifiers["ID"] = qualifiers["ID"] + (".%s" % idx_part) # Otherwise, we have to account for the subject start's location match_part_start = parent_match_start + hsp.sbjct_start + start - 1 @@ -116,7 +130,8 @@ SeqFeature( SimpleLocation(match_part_start, match_part_end, strand=1), type="match_part", - qualifiers=copy.deepcopy(part_qualifiers)) + qualifiers=copy.deepcopy(part_qualifiers), + ) ) rec.features.append(top_feature) @@ -142,13 +157,13 @@ for a match_part """ prev = 0 - fq = '' - fm = '' - fs = '' - for position in re.finditer('-', query): - fq += query[prev:position.start()] - fm += match[prev:position.start()] - fs += subject[prev:position.start()] + fq = "" + fm = "" + fs = "" + for position in re.finditer("-", query): + fq += query[prev : position.start()] + fm += match[prev : position.start()] + fs += subject[prev : position.start()] prev = position.start() + 1 fq += query[prev:] fm += match[prev:] @@ -170,7 +185,7 @@ for i, (q, m, s) in enumerate(zip(query, match, subject)): # If we have a match - if m != ' ' or m == '+': + if m != " " or m == "+": if region_start == -1: region_start = i # It's a new region, we need to reset or it's pre-seeded with @@ -191,8 +206,9 @@ region_q = region_q[0:-ignore_under] region_m = region_m[0:-ignore_under] region_s = region_s[0:-ignore_under] - yield region_start, region_end + 1, \ - cigar_from_string(region_q, region_m, region_s, strict_m=True) + yield region_start, region_end + 1, cigar_from_string( + region_q, region_m, region_s, strict_m=True + ) region_q = [] region_m = [] region_s = [] @@ -201,31 +217,32 @@ region_end = -1 mismatch_count = 0 - yield region_start, region_end + 1, \ - cigar_from_string(region_q, region_m, region_s, strict_m=True) + yield region_start, region_end + 1, cigar_from_string( + region_q, region_m, region_s, strict_m=True + ) def _qms_to_matches(query, match, subject, strict_m=True): matchline = [] for (q, m, s) in zip(query, match, subject): - ret = '' + ret = "" - if m != ' ' or m == '+': - ret = '=' - elif m == ' ': - if q == '-': - ret = 'D' - elif s == '-': - ret = 'I' + if m != " " or m == "+": + ret = "=" + elif m == " ": + if q == "-": + ret = "D" + elif s == "-": + ret = "I" else: - ret = 'X' + ret = "X" else: log.warn("Bad data: \n\t%s\n\t%s\n\t%s\n" % (query, match, subject)) if strict_m: - if ret == '=' or ret == 'X': - ret = 'M' + if ret == "=" or ret == "X": + ret = "M" matchline.append(ret) return matchline @@ -243,7 +260,7 @@ count = 1 last_char = char cigar_line.append("%s%s" % (last_char, count)) - return ' '.join(cigar_line) + return " ".join(cigar_line) def cigar_from_string(query, match, subject, strict_m=True): @@ -254,13 +271,28 @@ return "" -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='Convert Blast XML to gapped GFF3', epilog='') - parser.add_argument('blastxml', type=argparse.FileType("r"), help='Blast XML Output') - parser.add_argument('--min_gap', type=int, help='Maximum gap size before generating a new match_part', default=3) - parser.add_argument('--trim', action='store_true', help='Trim blast hits to be only as long as the parent feature') - parser.add_argument('--trim_end', action='store_true', help='Cut blast results off at end of gene') - parser.add_argument('--include_seq', action='store_true', help='Include sequence') +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Convert Blast XML to gapped GFF3", epilog="" + ) + parser.add_argument( + "blastxml", type=argparse.FileType("r"), help="Blast XML Output" + ) + parser.add_argument( + "--min_gap", + type=int, + help="Maximum gap size before generating a new match_part", + default=3, + ) + parser.add_argument( + "--trim", + action="store_true", + help="Trim blast hits to be only as long as the parent feature", + ) + parser.add_argument( + "--trim_end", action="store_true", help="Cut blast results off at end of gene" + ) + parser.add_argument("--include_seq", action="store_true", help="Include sequence") args = parser.parse_args() for rec in blastxml2gff3(**vars(args)): diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/clienttest.jbrowse --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jbrowse2/clienttest.jbrowse Fri Jan 05 01:58:02 2024 +0000 @@ -0,0 +1,478 @@ +{ + "configuration": { + "rpc": { + "defaultDriver": "WebWorkerRpcDriver", + "drivers": { + "MainThreadRpcDriver": {}, + "WebWorkerRpcDriver": {} + } + }, + "logoPath": { + "locationType": "UriLocation", + "uri": "" + } + }, + "plugins": [], + "assemblies": [ + { + "name": "Merlin", + "sequence": { + "type": "ReferenceSequenceTrack", + "trackId": "Merlin-1704349310829", + "adapter": { + "type": "IndexedFastaAdapter", + "fastaLocation": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/rossgit/tools-iuc/tools/jbrowse2/test-data/merlin.fa" + }, + "faiLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/Merlin.fasta.fai" + }, + "metadataLocation": { + "locationType": "UriLocation", + "uri": "/path/to/fa.metadata.yaml" + } + }, + "displays": [ + { + "type": "LinearReferenceSequenceDisplay", + "displayId": "Merlin-1704349310829-LinearReferenceSequenceDisplay" + }, + { + "type": "LinearGCContentDisplay", + "displayId": "Merlin-1704349310829-LinearGCContentDisplay" + } + ] + } + } + ], + "tracks": [ + { + "type": "FeatureTrack", + "trackId": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz-1704349390805", + "name": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz", + "assemblyNames": [ + "Merlin" + ], + "adapter": { + "type": "BedTabixAdapter", + "bedGzLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz" + }, + "index": { + "location": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz.tbi" + } + } + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz-1704349390805-LinearBasicDisplay" + }, + { + "type": "LinearArcDisplay", + "displayId": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz-1704349390805-LinearArcDisplay" + } + ] + }, + { + "type": "VariantTrack", + "trackId": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277", + "name": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz", + "assemblyNames": [ + "Merlin" + ], + "adapter": { + "type": "VcfTabixAdapter", + "vcfGzLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz" + }, + "index": { + "location": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz.tbi" + } + } + }, + "textSearching": { + "textSearchAdapter": { + "type": "TrixTextSearchAdapter", + "textSearchAdapterId": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-index", + "ixFilePath": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/.config/@jbrowse/desktop/autosaved/trix/8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-index.ix" + }, + "ixxFilePath": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/.config/@jbrowse/desktop/autosaved/trix/8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-index.ixx" + }, + "metaFilePath": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/.config/@jbrowse/desktop/autosaved/trix/8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-index.json" + }, + "tracks": [ + "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277" + ], + "assemblyNames": [ + "Merlin" + ] + } + }, + "displays": [ + { + "type": "LinearVariantDisplay", + "displayId": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-LinearVariantDisplay" + }, + { + "type": "ChordVariantDisplay", + "displayId": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-ChordVariantDisplay" + }, + { + "type": "LinearPairedArcDisplay", + "displayId": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-LinearPairedArcDisplay" + } + ] + }, + { + "type": "AlignmentsTrack", + "trackId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556", + "name": "859509e7a79c8f48992e130a2a79b36e_0.bam", + "assemblyNames": [ + "Merlin" + ], + "adapter": { + "type": "BamAdapter", + "bamLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/859509e7a79c8f48992e130a2a79b36e_0.bam" + }, + "index": { + "location": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/859509e7a79c8f48992e130a2a79b36e_0.bam.bai" + } + }, + "sequenceAdapter": { + "type": "IndexedFastaAdapter", + "fastaLocation": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/rossgit/tools-iuc/tools/jbrowse2/test-data/merlin.fa" + }, + "faiLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/Merlin.fasta.fai" + }, + "metadataLocation": { + "locationType": "UriLocation", + "uri": "/path/to/fa.metadata.yaml" + } + } + }, + "displays": [ + { + "type": "LinearAlignmentsDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearAlignmentsDisplay" + }, + { + "type": "LinearPileupDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearPileupDisplay" + }, + { + "type": "LinearSNPCoverageDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearSNPCoverageDisplay" + }, + { + "type": "LinearReadArcsDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearReadArcsDisplay" + }, + { + "type": "LinearReadCloudDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearReadCloudDisplay" + } + ] + }, + { + "type": "FeatureTrack", + "trackId": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492", + "name": "1838188642fecd9922f419a0e735e755_0.gff3.gz", + "assemblyNames": [ + "Merlin" + ], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/1838188642fecd9922f419a0e735e755_0.gff3.gz" + }, + "index": { + "location": { + "locationType": "LocalPathLocation", + "localPath": "/tmp/complete/1838188642fecd9922f419a0e735e755_0.gff3.gz.tbi" + } + } + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492-LinearBasicDisplay" + }, + { + "type": "LinearArcDisplay", + "displayId": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492-LinearArcDisplay" + } + ] + }, + { + "type": "QuantitativeTrack", + "trackId": "merlin.bw-1704349777260", + "name": "merlin.bw", + "assemblyNames": [ + "Merlin" + ], + "adapter": { + "type": "BigWigAdapter", + "bigWigLocation": { + "locationType": "LocalPathLocation", + "localPath": "/home/ross/rossgit/tools-iuc/tools/jbrowse2/test-data/bw/merlin.bw" + } + }, + "displays": [ + { + "type": "LinearWiggleDisplay", + "displayId": "merlin.bw-1704349777260-LinearWiggleDisplay" + } + ] + } + ], + "internetAccounts": [ + { + "type": "DropboxOAuthInternetAccount", + "internetAccountId": "dropboxOAuth", + "name": "Dropbox", + "description": "Account to access Dropbox files", + "clientId": "ykjqg1kr23pl1i7" + }, + { + "type": "GoogleDriveOAuthInternetAccount", + "internetAccountId": "googleOAuth", + "name": "Google Drive", + "description": "Account to access Google Drive files", + "clientId": "109518325434-m86s8a5og8ijc5m6n7n8dk7e9586bg9i.apps.googleusercontent.com" + } + ], + "aggregateTextSearchAdapters": [], + "connections": [], + "defaultSession": { + "connectionInstances": [], + "drawerPosition": "right", + "drawerWidth": 384, + "widgets": { + "GridBookmark": { + "id": "GridBookmark", + "type": "GridBookmarkWidget" + }, + "addTrackWidget": { + "id": "addTrackWidget", + "type": "AddTrackWidget", + "view": "oATIzuZAL61OxOyGsWP1l" + }, + "JobsList": { + "id": "JobsList", + "type": "JobsListWidget", + "jobs": [], + "finished": [ + { + "name": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-index", + "statusMessage": "done", + "progressPct": 100 + } + ], + "queued": [], + "aborted": [] + }, + "configEditor": { + "id": "configEditor", + "type": "ConfigurationEditorWidget", + "target": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492" + }, + "hierarchicalTrackSelector": { + "id": "hierarchicalTrackSelector", + "type": "HierarchicalTrackSelectorWidget", + "initialized": true, + "collapsed": {}, + "view": "oATIzuZAL61OxOyGsWP1l", + "faceted": { + "filterText": "", + "showSparse": false, + "showFilters": true, + "showOptions": false, + "panelWidth": 400 + } + } + }, + "activeWidgets": { + "hierarchicalTrackSelector": "hierarchicalTrackSelector" + }, + "minimized": false, + "id": "0RCNRFfuiied_MUxKv21r", + "name": "New Session 1/4/2024, 5:21:50 PM", + "margin": 0, + "views": [ + { + "id": "oATIzuZAL61OxOyGsWP1l", + "minimized": false, + "type": "LinearGenomeView", + "offsetPx": 0, + "bpPerPx": 113.22935779816514, + "displayedRegions": [ + { + "refName": "Merlin", + "start": 0, + "end": 172788, + "reversed": false, + "assemblyName": "Merlin" + } + ], + "tracks": [ + { + "id": "6q8cZDkFnpribnYm8CCn5", + "type": "FeatureTrack", + "configuration": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz-1704349390805", + "minimized": false, + "displays": [ + { + "id": "5lTYaGmlQD_RJMDpG-Mol", + "type": "LinearBasicDisplay", + "configuration": "3eeb95fa8dd6e9d46ac3cd1f3a4b3f4b_0.bed.gz-1704349390805-LinearBasicDisplay" + } + ] + }, + { + "id": "O9bSW2YKphod5uuN1za6E", + "type": "VariantTrack", + "configuration": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277", + "minimized": false, + "displays": [ + { + "id": "31Hdn7-9Dnr0L6PSxiDmf", + "type": "LinearVariantDisplay", + "configuration": "8c3f9b71345db675abafdf2f8e33a6cd_0.vcf.gz-1704349461277-LinearVariantDisplay" + } + ] + }, + { + "id": "IcnRsR2AjMxQq6wiOT9Vi", + "type": "AlignmentsTrack", + "configuration": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556", + "minimized": false, + "displays": [ + { + "id": "Dvu0sQCwrUmkze6Zua8JY", + "type": "LinearAlignmentsDisplay", + "PileupDisplay": { + "id": "ulHQgd0Omh3VttUCKyvPi", + "type": "LinearPileupDisplay", + "heightPreConfig": 205, + "configuration": { + "type": "LinearPileupDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearAlignmentsDisplay_LinearPileupDisplay_xyz" + }, + "filterBy": { + "flagInclude": 0, + "flagExclude": 1540 + }, + "jexlFilters": [], + "showSoftClipping": false + }, + "SNPCoverageDisplay": { + "id": "z_mkwAGHGZCZpvqG0AEMg", + "type": "LinearSNPCoverageDisplay", + "heightPreConfig": 45, + "configuration": { + "type": "LinearSNPCoverageDisplay", + "displayId": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearAlignmentsDisplay_snpcoverage_xyz" + }, + "selectedRendering": "", + "resolution": 1, + "constraints": {}, + "filterBy": { + "flagInclude": 0, + "flagExclude": 1540 + }, + "jexlFilters": [] + }, + "snpCovHeight": 45, + "configuration": "859509e7a79c8f48992e130a2a79b36e_0.bam-1704349533556-LinearAlignmentsDisplay", + "lowerPanelType": "LinearPileupDisplay" + } + ] + }, + { + "id": "sqXxD9i7mkIgJX9bStLFp", + "type": "FeatureTrack", + "configuration": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492", + "minimized": false, + "displays": [ + { + "id": "ot2hxRedj7L-2eOtTfj47", + "type": "LinearBasicDisplay", + "configuration": "1838188642fecd9922f419a0e735e755_0.gff3.gz-1704349668492-LinearBasicDisplay" + } + ] + }, + { + "id": "w8XYtNAPQ77lYiJ4yfTc_", + "type": "QuantitativeTrack", + "configuration": "merlin.bw-1704349777260", + "minimized": false, + "displays": [ + { + "id": "R7aBFiHHKrly_WAFzxdcN", + "type": "LinearWiggleDisplay", + "configuration": "merlin.bw-1704349777260-LinearWiggleDisplay", + "selectedRendering": "", + "resolution": 1, + "constraints": {} + } + ] + }, + { + "id": "vCT2YxTb4kbBwbjU7y0Nw", + "type": "ReferenceSequenceTrack", + "configuration": "Merlin-1704349310829", + "minimized": false, + "displays": [ + { + "id": "1hj13cpwWU1pHF9KyXyYA", + "type": "LinearReferenceSequenceDisplay", + "heightPreConfig": 50, + "configuration": "Merlin-1704349310829-LinearReferenceSequenceDisplay", + "showForward": true, + "showReverse": true, + "showTranslation": true + } + ] + } + ], + "hideHeader": false, + "hideHeaderOverview": false, + "hideNoTracksActive": false, + "trackSelectorType": "hierarchical", + "showCenterLine": false, + "showCytobandsSetting": true, + "trackLabels": "", + "showGridlines": true, + "showBookmarkHighlights": true, + "showBookmarkLabels": true + } + ], + "sessionAssemblies": [], + "temporaryAssemblies": [], + "focusedViewId": "oATIzuZAL61OxOyGsWP1l" + } +} \ No newline at end of file diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/gff3_rebase.py --- a/jbrowse2/gff3_rebase.py Thu Jan 04 02:18:18 2024 +0000 +++ b/jbrowse2/gff3_rebase.py Fri Jan 05 01:58:02 2024 +0000 @@ -63,8 +63,10 @@ else: yield feature - if hasattr(feature, 'sub_features'): - for x in feature_lambda(feature.sub_features, test, test_kwargs, subfeatures=subfeatures): + if hasattr(feature, "sub_features"): + for x in feature_lambda( + feature.sub_features, test, test_kwargs, subfeatures=subfeatures + ): yield x @@ -74,8 +76,8 @@ For every feature, check that at least one value in feature.quailfiers(kwargs['qualifier']) is in kwargs['attribute_list'] """ - for attribute_value in feature.qualifiers.get(kwargs['qualifier'], []): - if attribute_value in kwargs['attribute_list']: + for attribute_value in feature.qualifiers.get(kwargs["qualifier"], []): + if attribute_value in kwargs["attribute_list"]: return True return False @@ -90,12 +92,12 @@ # If it's an interpro specific gff3 file if interpro: # Then we ignore polypeptide features as they're useless - if feature.type == 'polypeptide': + if feature.type == "polypeptide": continue # If there's an underscore, we strip up to that underscore? # I do not know the rationale for this, removing. # if '_' in parent_feature_id: - # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] + # parent_feature_id = parent_feature_id[parent_feature_id.index('_') + 1:] try: child_features[parent_feature_id].append(feature) @@ -134,28 +136,29 @@ feature.location = FeatureLocation(ns, ne, strand=st) - if hasattr(feature, 'sub_features'): + if hasattr(feature, "sub_features"): for subfeature in feature.sub_features: __update_feature_location(subfeature, parent, protein2dna) -def rebase(parent, child, interpro=False, protein2dna=False, map_by='ID'): +def rebase(parent, child, interpro=False, protein2dna=False, map_by="ID"): # get all of the features we will be re-mapping in a dictionary, keyed by parent feature ID child_features = __get_features(child, interpro=interpro) for rec in GFF.parse(parent): replacement_features = [] for feature in feature_lambda( - rec.features, - # Filter features in the parent genome by those that are - # "interesting", i.e. have results in child_features array. - # Probably an unnecessary optimisation. - feature_test_qual_value, - { - 'qualifier': map_by, - 'attribute_list': child_features.keys(), - }, - subfeatures=False): + rec.features, + # Filter features in the parent genome by those that are + # "interesting", i.e. have results in child_features array. + # Probably an unnecessary optimisation. + feature_test_qual_value, + { + "qualifier": map_by, + "attribute_list": child_features.keys(), + }, + subfeatures=False, + ): # Features which will be re-mapped to_remap = child_features[feature.id] @@ -166,7 +169,7 @@ __update_feature_location(x, feature, protein2dna) if interpro: - for y in ('status', 'Target'): + for y in ("status", "Target"): try: del x.qualifiers[y] except Exception: @@ -181,14 +184,26 @@ GFF.write([rec], sys.stdout) -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='rebase gff3 features against parent locations', epilog="") - parser.add_argument('parent', type=argparse.FileType('r'), help='Parent GFF3 annotations') - parser.add_argument('child', type=argparse.FileType('r'), help='Child GFF3 annotations to rebase against parent') - parser.add_argument('--interpro', action='store_true', - help='Interpro specific modifications') - parser.add_argument('--protein2dna', action='store_true', - help='Map protein translated results to original DNA data') - parser.add_argument('--map_by', help='Map by key', default='ID') +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="rebase gff3 features against parent locations", epilog="" + ) + parser.add_argument( + "parent", type=argparse.FileType("r"), help="Parent GFF3 annotations" + ) + parser.add_argument( + "child", + type=argparse.FileType("r"), + help="Child GFF3 annotations to rebase against parent", + ) + parser.add_argument( + "--interpro", action="store_true", help="Interpro specific modifications" + ) + parser.add_argument( + "--protein2dna", + action="store_true", + help="Map protein translated results to original DNA data", + ) + parser.add_argument("--map_by", help="Map by key", default="ID") args = parser.parse_args() rebase(**vars(args)) diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/jbrowse2.py --- a/jbrowse2/jbrowse2.py Thu Jan 04 02:18:18 2024 +0000 +++ b/jbrowse2/jbrowse2.py Fri Jan 05 01:58:02 2024 +0000 @@ -110,6 +110,7 @@ class JbrowseConnector(object): def __init__(self, jbrowse, outdir, genomes, standalone=None): self.debug = False + self.usejson = True self.giURL = GALAXY_INFRASTRUCTURE_URL self.jbrowse = jbrowse self.outdir = outdir @@ -118,20 +119,9 @@ self.standalone = standalone self.trackIdlist = [] self.tracksToAdd = [] - self.config_json = { - "configuration": { - "rpc": { - "defaultDriver": "WebWorkerRpcDriver", - "drivers": {"MainThreadRpcDriver": {}, "WebWorkerRpcDriver": {}}, - }, - "logoPath": {"locationType": "UriLocation", "uri": ""}, - } - } - self.config_json_file = os.path.join(outdir, "config.json") - if standalone == "complete": - self.clone_jbrowse(self.jbrowse, self.outdir) - elif standalone == "minimal": - self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True) + self.config_json = {} + self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json")) + self.clone_jbrowse(self.jbrowse, self.outdir) def subprocess_check_call(self, command, output=None): if output: @@ -181,44 +171,98 @@ def process_genomes(self): assemblies = [] for i, genome_node in enumerate(self.genome_paths): - log.info("genome_node=%s" % str(genome_node)) - # We only expect one input genome per run. This for loop is just - # easier to write than the alternative / catches any possible - # issues. + if self.debug: + log.info("genome_node=%s" % str(genome_node)) genome_name = genome_node["meta"]["dataset_dname"] dsId = genome_node["meta"]["dataset_id"] fapath = genome_node["path"] - faname = genome_name + ".fasta" - faind = os.path.realpath(os.path.join(self.outdir, faname + ".fai")) if self.standalone == "complete": - faurl = faname + faname = genome_name + ".fa.gz" fadest = os.path.realpath(os.path.join(self.outdir, faname)) - cmd = ["cp", fapath, fadest] - self.subprocess_check_call(cmd) + cmd = "bgzip -i -c %s > %s && samtools faidx %s" % ( + fapath, + fadest, + fadest, + ) + self.subprocess_popen(cmd) + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + }, + "faiLocation": { + "uri": faname + ".fai", + }, + "gziLocation": { + "uri": faname + ".gzi", + }, + } else: - faurl = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) - cmd = ["samtools", "faidx", fapath, "--fai-idx", faind] - self.subprocess_check_call(cmd) + faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId) + faname = genome_name + ".fa.fai" + fastalocation = { + "uri": faurl, + } + failocation = { + "uri": faname, + } + adapter = { + "type": "IndexedFastaAdapter", + "fastaLocation": fastalocation, + "faiLocation": failocation, + } + + cmd = ["samtools", "faidx", fapath, "--fai-idx", faname] + self.subprocess_check_call(cmd) trackDict = { "name": genome_name, "sequence": { "type": "ReferenceSequenceTrack", "trackId": genome_name, - "adapter": { - "type": "IndexedFastaAdapter", - "fastaLocation": {"uri": faurl, "locationType": "UriLocation"}, - "faiLocation": { - "uri": faname + ".fai", - "locationType": "UriLocation", - }, - }, + "adapter": adapter, }, + "rendering": {"type": "DivSequenceRenderer"}, } assemblies.append(trackDict) - self.config_json["assemblies"] = assemblies self.genome_name = genome_name - self.genome_path = faurl - self.genome_fai_path = faname + ".fai" + if self.usejson: + self.config_json["assemblies"] = assemblies + else: + if self.standalone == "complete": + cmd = [ + "jbrowse", + "add-assembly", + faname, + "-t", + "bgzipFasta", + "-n", + genome_name, + "--load", + "inPlace", + "--faiLocation", + faname + ".fai", + "--gziLocation", + faname + ".gzi", + "--target", + self.outdir, + ] + else: + cmd = [ + "jbrowse", + "add-assembly", + faname, + "-t", + "indexedFasta", + "-n", + genome_name, + "--load", + "inPlace", + "--faiLocation", + faname + ".fai", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def add_default_view(self): cmd = [ @@ -229,10 +273,14 @@ "-t", ",".join(self.trackIdlist), "-n", - "Default", + "JBrowse2 in Galaxy", "--target", - self.outdir, - ] # + self.config_json_file, + "-v", + " LinearGenomeView", + ] + if True or self.debug: + log.info("### calling set-default-session with cmd=%s" % " ".join(cmd)) self.subprocess_check_call(cmd) def write_config(self): @@ -268,8 +316,14 @@ url = hname cmd = ["cp", data, dest] self.subprocess_check_call(cmd) + floc = { + "uri": hname, + } else: url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId) + floc = { + "uri": url, + } trackDict = { "type": "HicTrack", "trackId": tId, @@ -277,11 +331,29 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "HicAdapter", - "hicLocation": {"uri": url, "locationType": "UriLocation"}, + "hicLocation": floc, }, } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "HicTrack", + "-a", + self.genome_name, + "-n", + hname, + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def add_maf(self, data, trackData): """ @@ -333,9 +405,13 @@ "adapter": { "type": "MafTabixAdapter", "samples": samples, - "bedGzLocation": {"uri": fname + ".sorted.bed.gz"}, + "bedGzLocation": { + "uri": fname + ".sorted.bed.gz", + }, "index": { - "location": {"uri": fname + ".sorted.bed.gz.tbi"}, + "location": { + "uri": fname + ".sorted.bed.gz.tbi", + }, }, }, "assemblyNames": [self.genome_name], @@ -390,9 +466,13 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "Gff3TabixAdapter", - "gffGzLocation": {"locationType": "UriLocation", "uri": url}, + "gffGzLocation": { + "uri": url, + }, "index": { - "location": {"locationType": "UriLocation", "uri": url + ".tbi"} + "location": { + "uri": url + ".tbi", + } }, }, "displays": [ @@ -403,31 +483,52 @@ {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, ], } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) os.unlink(gff3) def add_bigwig(self, data, trackData): - fname = trackData["name"] + url = "%s.bw" % trackData["name"] if self.standalone == "complete": - dest = os.path.realpath(os.path.join(self.outdir, fname)) - url = fname + dest = os.path.realpath(os.path.join(self.outdir, url)) cmd = ["cp", data, dest] self.subprocess_check_call(cmd) + bwloc = {"uri": url} else: dsId = trackData["metadata"]["dataset_id"] url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) + bwloc = {"uri": url} tId = trackData["label"] trackDict = { "type": "QuantitativeTrack", "trackId": tId, - "name": fname, + "name": url, "assemblyNames": [ self.genome_name, ], "adapter": { "type": "BigWigAdapter", - "bigWigLocation": {"locationType": "UriLocation", "uri": url}, + "bigWigLocation": bwloc, }, "displays": [ { @@ -436,19 +537,40 @@ } ], } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "QuantitativeTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): tId = trackData["label"] fname = "%s.bam" % trackData["label"] dest = os.path.realpath("%s/%s" % (self.outdir, fname)) - if self.standalone == "minimal": + if self.standalone == "complete": + url = fname + self.subprocess_check_call(["cp", data, dest]) + log.info("### copied %s to %s" % (data, dest)) + bloc = {"uri": url} + else: dsId = trackData["metadata"]["dataset_id"] url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId) - else: - url = fname - self.symlink_or_copy(data, dest) + bloc = {"uri": url} if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest self.subprocess_check_call( @@ -470,29 +592,36 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "BamAdapter", - "bamLocation": {"locationType": "UriLocation", "uri": url}, + "bamLocation": bloc, "index": { - "location": {"locationType": "UriLocation", "uri": fname + ".bai"} - }, - "sequenceAdapter": { - "type": "IndexedFastaAdapter", - "fastaLocation": { - "locationType": "UriLocation", - "uri": self.genome_path, - }, - "faiLocation": { - "locationType": "UriLocation", - "uri": self.genome_fai_path, - }, - "metadataLocation": { - "locationType": "UriLocation", - "uri": "/path/to/fa.metadata.yaml", - }, + "location": { + "uri": fname + ".bai", + } }, }, } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + fname, + "-t", + "AlignmentsTrack", + "-l", + "inPlace", + "-a", + self.genome_name, + "--indexFile", + fname + ".bai", + "-n", + trackData["name"], + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def add_vcf(self, data, trackData): tId = trackData["label"] @@ -513,9 +642,13 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "VcfTabixAdapter", - "vcfGzLocation": {"uri": url, "locationType": "UriLocation"}, + "vcfGzLocation": { + "uri": url, + }, "index": { - "location": {"uri": url + ".tbi", "locationType": "UriLocation"} + "location": { + "uri": url + ".tbi", + } }, }, "displays": [ @@ -533,8 +666,28 @@ }, ], } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "VariantTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def _sort_gff(self, data, dest): # Only index if not already done @@ -567,9 +720,13 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "Gff3TabixAdapter", - "gffGzLocation": {"locationType": "UriLocation", "uri": url}, + "gffGzLocation": { + "uri": url, + }, "index": { - "location": {"uri": url + ".tbi", "locationType": "UriLocation"} + "location": { + "uri": url + ".tbi", + } }, }, "displays": [ @@ -580,8 +737,26 @@ {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, ], } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def add_bed(self, data, ext, trackData): url = "%s.%s" % (trackData["label"], ext) @@ -596,9 +771,13 @@ "assemblyNames": [self.genome_name], "adapter": { "type": "BedTabixAdapter", - "bedGzLocation": {"locationType": "UriLocation", "uri": url}, + "bedGzLocation": { + "uri": url, + }, "index": { - "location": {"uri": url + ".tbi", "locationType": "UriLocation"} + "location": { + "uri": url + ".tbi", + } }, }, "displays": [ @@ -609,8 +788,28 @@ {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, ], } - self.tracksToAdd.append(trackDict) - self.trackIdlist.append(tId) + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) def process_annotations(self, track): category = track["category"].replace("__pd__date__pd__", TODAY) @@ -713,7 +912,7 @@ else: log.warn("Do not know how to handle %s", dataset_ext) - def clone_jbrowse(self, jbrowse_dir, destination, minimal=False): + def clone_jbrowse(self, jbrowse_dir, destination): """Clone a JBrowse directory into a destination directory.""" cmd = ["jbrowse", "create", "-f", self.outdir] self.subprocess_check_call(cmd) @@ -842,5 +1041,6 @@ str(jc.config_json), ) jc.config_json["tracks"] = jc.tracksToAdd - jc.write_config() + if jc.usejson: + jc.write_config() jc.add_default_view() diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/jbrowse2.xml --- a/jbrowse2/jbrowse2.xml Thu Jan 04 02:18:18 2024 +0000 +++ b/jbrowse2/jbrowse2.xml Fri Jan 05 01:58:02 2024 +0000 @@ -68,7 +68,7 @@ file_ext="${dataset.ext}" /> - #else + #else: ": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "#": "__pd__", + "": "__cn__", +} + + +def etree_to_dict(t): + if t is None: + return {} + + d = {t.tag: {} if t.attrib else None} + children = list(t) + if children: + dd = defaultdict(list) + for dc in map(etree_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}} + if t.attrib: + d[t.tag].update(("@" + k, v) for k, v in t.attrib.items()) + if t.text: + text = t.text.strip() + if children or t.attrib: + if text: + d[t.tag]["#text"] = text + else: + d[t.tag] = text + return d + + +INSTALLED_TO = os.path.dirname(os.path.realpath(__file__)) + + +def metadata_from_node(node): + metadata = {} + try: + if len(node.findall("dataset")) != 1: + # exit early + return metadata + except Exception: + return {} + + for (key, value) in node.findall("dataset")[0].attrib.items(): + metadata["dataset_%s" % key] = value + + for (key, value) in node.findall("history")[0].attrib.items(): + metadata["history_%s" % key] = value + + for (key, value) in node.findall("metadata")[0].attrib.items(): + metadata["metadata_%s" % key] = value + + for (key, value) in node.findall("tool")[0].attrib.items(): + metadata["tool_%s" % key] = value + + # Additional Mappings applied: + metadata[ + "dataset_edam_format" + ] = '{1}'.format( + metadata["dataset_edam_format"], metadata["dataset_file_ext"] + ) + metadata["history_user_email"] = '{0}'.format( + metadata["history_user_email"] + ) + metadata["hist_name"] = metadata["history_display_name"] + metadata[ + "history_display_name" + ] = '{hist_name}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_hist_id=metadata["history_id"], + hist_name=metadata["history_display_name"], + ) + metadata[ + "tool_tool" + ] = '{tool_id}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata["dataset_id"], + tool_id=metadata["tool_tool_id"], + # tool_version=metadata['tool_tool_version'], + ) + return metadata + + +class JbrowseConnector(object): + def __init__(self, jbrowse, outdir, genomes, standalone=None): + self.debug = False + self.usejson = True + self.giURL = GALAXY_INFRASTRUCTURE_URL + self.jbrowse = jbrowse + self.outdir = outdir + os.makedirs(self.outdir, exist_ok=True) + self.genome_paths = genomes + self.standalone = standalone + self.trackIdlist = [] + self.tracksToAdd = [] + self.config_json = {} + self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json")) + if standalone == "complete": + self.clone_jbrowse(self.jbrowse, self.outdir) + elif standalone == "minimal": + self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True) + + def subprocess_check_call(self, command, output=None): + if output: + if self.debug: + log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output) + subprocess.check_call(command, cwd=self.outdir, stdout=output) + else: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + subprocess.check_call(command, cwd=self.outdir) + + def subprocess_popen(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, command) + p = subprocess.Popen( + command, + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output, err = p.communicate() + retcode = p.returncode + if retcode != 0: + log.error("cd %s && %s", self.outdir, command) + log.error(output) + log.error(err) + raise RuntimeError("Command failed with exit code %s" % (retcode)) + + def subprocess_check_output(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + return subprocess.check_output(command, cwd=self.outdir) + + def _jbrowse_bin(self, command): + return os.path.realpath(os.path.join(self.jbrowse, "bin", command)) + + def symlink_or_copy(self, src, dest): + if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool( + os.environ["GALAXY_JBROWSE_SYMLINKS"] + ): + cmd = ["ln", "-s", src, dest] + else: + cmd = ["cp", src, dest] + + return self.subprocess_check_call(cmd) + + def process_genomes(self): + assemblies = [] + for i, genome_node in enumerate(self.genome_paths): + log.info("genome_node=%s" % str(genome_node)) + # We only expect one input genome per run. This for loop is just + # easier to write than the alternative / catches any possible + # issues. + genome_name = genome_node["meta"]["dataset_dname"] + dsId = genome_node["meta"]["dataset_id"] + fapath = genome_node["path"] + faname = genome_name + ".fa.gz" + faind = os.path.realpath(os.path.join(self.outdir, faname + ".gzi")) + if self.standalone == "complete": + fadest = os.path.realpath(os.path.join(self.outdir, faname)) + cmd = "bgzip -i -c %s > %s && samtools faidx %s" % ( + fapath, + fadest, + fadest, + ) + self.subprocess_popen(cmd) + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + }, + "faiLocation": { + "uri": faname + ".fai", + }, + "gziLocation": { + "uri": faname + ".gzi", + }, + } + else: + faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId) + fastalocation = { + "uri": faurl, + } + failocation = { + "uri": faname + ".fai", + } + adapter = { + "type": "IndexedFastaAdapter", + "fastaLocation": fastalocation, + "faiLocation": failocation, + } + + cmd = ["samtools", "faidx", fapath, "--fai-idx", faind] + self.subprocess_check_call(cmd) + trackDict = { + "name": genome_name, + "sequence": { + "type": "ReferenceSequenceTrack", + "trackId": genome_name, + "adapter": adapter, + }, + "rendering": {"type": "DivSequenceRenderer"}, + } + assemblies.append(trackDict) + self.genome_name = genome_name + if self.usejson: + self.config_json["assemblies"] = assemblies + else: + cmd = [ + "jbrowse", + "add-assembly", + faname, + "-t", + "bgzipFasta", + "-n", + genome_name, + "--load", + "inPlace", + "--faiLocation", + faname + ".fai", + "--gziLocation", + faname + ".gzi", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_default_view(self): + cmd = [ + "jbrowse", + "set-default-session", + "-s", + self.config_json_file, + "-t", + ",".join(self.trackIdlist), + "-n", + "JBrowse2 in Galaxy", + "--target", + self.config_json_file, + "-v", + " LinearGenomeView", + ] + if True or self.debug: + log.info("### calling set-default-session with cmd=%s" % " ".join(cmd)) + self.subprocess_check_call(cmd) + + def write_config(self): + with open(self.config_json_file, "w") as fp: + json.dump(self.config_json, fp) + + def add_hic(self, data, trackData): + """ + HiC adapter. + https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md + for testing locally, these work: + HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic + using hg19 reference track as a + 'BgzipFastaAdapter' + fastaLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz', + faiLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai', + gziLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi', + Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438 + """ + log.info("#### trackData=%s" % trackData) + tId = trackData["label"] + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=hic " % ( + self.giURL, + dsId, + ) + hname = trackData["name"] + if self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, hname)) + url = hname + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + floc = { + "uri": hname, + } + else: + url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId) + floc = { + "uri": url, + } + trackDict = { + "type": "HicTrack", + "trackId": tId, + "name": hname, + "assemblyNames": [self.genome_name], + "adapter": { + "type": "HicAdapter", + "hicLocation": floc, + }, + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "HicTrack", + "-a", + self.genome_name, + "-n", + hname, + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_maf(self, data, trackData): + """ + from https://github.com/cmdcolin/maf2bed + Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name + e.g. hg38.chr1 in the sequence identifiers. + need the reference id - eg hg18, for maf2bed.pl as the first parameter + """ + mafPlugin = { + "plugins": [ + { + "name": "MafViewer", + "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js", + } + ] + } + tId = trackData["label"] + fname = "%s.bed" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + # self.symlink_or_copy(data, dest) + # Process MAF to bed-like. Need build to munge chromosomes + gname = self.genome_name + cmd = [ + "bash", + os.path.join(INSTALLED_TO, "convertMAF.sh"), + data, + gname, + INSTALLED_TO, + dest, + ] + self.subprocess_check_call(cmd) + if True or self.debug: + log.info("### convertMAF.sh called as %s" % " ".join(cmd)) + # Construct samples list + # We could get this from galaxy metadata, not sure how easily. + ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE) + output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout) + ps.wait() + outp = output.decode("ascii") + soutp = outp.split("\n") + samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")] + samples = [x.split(".")[0] for x in samp] + if self.debug: + log.info("### got samples = %s " % (samples)) + trackDict = { + "type": "MafTrack", + "trackId": tId, + "name": trackData["name"], + "adapter": { + "type": "MafTabixAdapter", + "samples": samples, + "bedGzLocation": { + "uri": fname + ".sorted.bed.gz", + }, + "index": { + "location": { + "uri": fname + ".sorted.bed.gz.tbi", + }, + }, + }, + "assemblyNames": [self.genome_name], + } + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + if self.config_json.get("plugins", None): + self.config_json["plugins"].append(mafPlugin[0]) + else: + self.config_json.update(mafPlugin) + + def _blastxml_to_gff3(self, xml, min_gap=10): + gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) + cmd = [ + "python", + os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"), + "--trim", + "--trim_end", + "--include_seq", + "--min_gap", + str(min_gap), + xml, + ] + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) + gff3_unrebased.close() + return gff3_unrebased.name + + def add_blastxml(self, data, trackData, blastOpts, **kwargs): + gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"]) + + if "parent" in blastOpts and blastOpts["parent"] != "None": + gff3_rebased = tempfile.NamedTemporaryFile(delete=False) + cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")] + if blastOpts.get("protein", "false") == "true": + cmd.append("--protein2dna") + cmd.extend([os.path.realpath(blastOpts["parent"]), gff3]) + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased) + gff3_rebased.close() + + # Replace original gff3 file + shutil.copy(gff3_rebased.name, gff3) + os.unlink(gff3_rebased.name) + url = "%s.gff3" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(gff3, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + os.unlink(gff3) + + def add_bigwig(self, data, trackData): + url = "%s.bw" % trackData["name"] + if self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, url)) + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + bwloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) + bwloc = {"uri": url} + tId = trackData["label"] + trackDict = { + "type": "QuantitativeTrack", + "trackId": tId, + "name": url, + "assemblyNames": [ + self.genome_name, + ], + "adapter": { + "type": "BigWigAdapter", + "bigWigLocation": bwloc, + }, + "displays": [ + { + "type": "LinearWiggleDisplay", + "displayId": "%s-LinearWiggleDisplay" % tId, + } + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "QuantitativeTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): + tId = trackData["label"] + fname = "%s.bam" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + if self.standalone == "complete": + url = fname + self.subprocess_check_call(["cp", data, dest]) + log.info("### copied %s to %s" % (data, dest)) + bloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId) + bloc = {"uri": url} + if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): + # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", os.path.realpath(bam_index), dest + ".bai"] + ) + else: + # Can happen in exotic condition + # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".bai"): + self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") + else: + log.warn("Could not find a bam index (.bai file) for %s", data) + trackDict = { + "type": "AlignmentsTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BamAdapter", + "bamLocation": bloc, + "index": { + "location": { + "uri": fname + ".bai", + } + }, + }, + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + fname, + "-t", + "AlignmentsTrack", + "-l", + "inPlace", + "-a", + self.genome_name, + "--indexFile", + fname + ".bai", + "-n", + trackData["name"], + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_vcf(self, data, trackData): + tId = trackData["label"] + url = "%s/api/datasets/%s/display" % ( + self.giURL, + trackData["metadata"]["dataset_id"], + ) + url = "%s.vcf.gz" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + cmd = "bgzip -c %s > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-p", "vcf", dest] + self.subprocess_check_call(cmd) + trackDict = { + "type": "VariantTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "VcfTabixAdapter", + "vcfGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearVariantDisplay", + "displayId": "%s-LinearVariantDisplay" % tId, + }, + { + "type": "ChordVariantDisplay", + "displayId": "%s-ChordVariantDisplay" % tId, + }, + { + "type": "LinearPairedArcDisplay", + "displayId": "%s-LinearPairedArcDisplay" % tId, + }, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "VariantTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def _sort_gff(self, data, dest): + # Only index if not already done + if not os.path.exists(dest + ".gz"): + cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % ( + data, + dest, + ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" + self.subprocess_popen(cmd) + self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) + + def _sort_bed(self, data, dest): + # Only index if not already done + if not os.path.exists(dest): + cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-f", "-p", "bed", dest] + self.subprocess_check_call(cmd) + + def add_gff(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(data, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bed(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s.gz" % (self.outdir, url)) + self._sort_bed(data, dest) + tId = trackData["label"] + url = url + ".gz" + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BedTabixAdapter", + "bedGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + if self.usejson: + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + else: + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def process_annotations(self, track): + category = track["category"].replace("__pd__date__pd__", TODAY) + for i, ( + dataset_path, + dataset_ext, + track_human_label, + extra_metadata, + ) in enumerate(track["trackfiles"]): + # Unsanitize labels (element_identifiers are always sanitized by Galaxy) + for key, value in mapped_chars.items(): + track_human_label = track_human_label.replace(value, key) + outputTrackConfig = { + "category": category, + } + if self.debug: + log.info( + "Processing category = %s, track_human_label = %s", + category, + track_human_label, + ) + # We add extra data to hash for the case of REST + SPARQL. + if ( + "conf" in track + and "options" in track["conf"] + and "url" in track["conf"]["options"] + ): + rest_url = track["conf"]["options"]["url"] + else: + rest_url = "" + + # I chose to use track['category'] instead of 'category' here. This + # is intentional. This way re-running the tool on a different date + # will not generate different hashes and make comparison of outputs + # much simpler. + hashData = [ + str(dataset_path), + track_human_label, + track["category"], + rest_url, + ] + hashData = "|".join(hashData).encode("utf-8") + outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i + outputTrackConfig["metadata"] = extra_metadata + outputTrackConfig["name"] = track_human_label + + if dataset_ext in ("gff", "gff3"): + self.add_gff( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("hic",): + self.add_hic( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext in ("bed",): + self.add_bed( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("maf",): + self.add_maf( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bigwig": + self.add_bigwig( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bam": + real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][ + "bam_index" + ] + if not isinstance(real_indexes, list): + # + # /path/to/a.bam.bai + # + # + # The above will result in the 'bam_index' key containing a + # string. If there are two or more indices, the container + # becomes a list. Fun! + real_indexes = [real_indexes] + + self.add_bam( + dataset_path, + outputTrackConfig, + track["conf"]["options"]["pileup"], + bam_index=real_indexes[i], + ) + elif dataset_ext == "blastxml": + self.add_blastxml( + dataset_path, outputTrackConfig, track["conf"]["options"]["blast"] + ) + elif dataset_ext == "vcf": + self.add_vcf(dataset_path, outputTrackConfig) + else: + log.warn("Do not know how to handle %s", dataset_ext) + + def clone_jbrowse(self, jbrowse_dir, destination, minimal=False): + """Clone a JBrowse directory into a destination directory.""" + cmd = ["jbrowse", "create", "-f", self.outdir] + self.subprocess_check_call(cmd) + for fn in [ + "asset-manifest.json", + "favicon.ico", + "robots.txt", + "umd_plugin.js", + "version.txt", + "test_data", + ]: + cmd = ["rm", "-rf", os.path.join(self.outdir, fn)] + self.subprocess_check_call(cmd) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="", epilog="") + parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration") + + parser.add_argument("--jbrowse", help="Folder containing a jbrowse release") + parser.add_argument("--outdir", help="Output directory", default="out") + parser.add_argument( + "--standalone", + choices=["complete", "minimal", "data"], + help="Standalone mode includes a copy of JBrowse", + ) + parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0") + args = parser.parse_args() + + tree = ET.parse(args.xml.name) + root = tree.getroot() + + # This should be done ASAP + GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text + # Sometimes this comes as `localhost` without a protocol + if not GALAXY_INFRASTRUCTURE_URL.startswith("http"): + # so we'll prepend `http://` and hope for the best. Requests *should* + # be GET and not POST so it should redirect OK + GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL + + jc = JbrowseConnector( + jbrowse=args.jbrowse, + outdir=args.outdir, + genomes=[ + { + "path": os.path.realpath(x.attrib["path"]), + "meta": metadata_from_node(x.find("metadata")), + } + for x in root.findall("metadata/genomes/genome") + ], + standalone=args.standalone, + ) + jc.process_genomes() + + for track in root.findall("tracks/track"): + track_conf = {} + track_conf["trackfiles"] = [] + + is_multi_bigwig = False + try: + if track.find("options/wiggle/multibigwig") and ( + track.find("options/wiggle/multibigwig").text == "True" + ): + is_multi_bigwig = True + multi_bigwig_paths = [] + except KeyError: + pass + + trackfiles = track.findall("files/trackFile") + if trackfiles: + for x in track.findall("files/trackFile"): + if is_multi_bigwig: + multi_bigwig_paths.append( + (x.attrib["label"], os.path.realpath(x.attrib["path"])) + ) + else: + if trackfiles: + metadata = metadata_from_node(x.find("metadata")) + track_conf["dataset_id"] = metadata["dataset_id"] + track_conf["trackfiles"].append( + ( + os.path.realpath(x.attrib["path"]), + x.attrib["ext"], + x.attrib["label"], + metadata, + ) + ) + else: + # For tracks without files (rest, sparql) + track_conf["trackfiles"].append( + ( + "", # N/A, no path for rest or sparql + track.attrib["format"], + track.find("options/label").text, + {}, + ) + ) + + if is_multi_bigwig: + metadata = metadata_from_node(x.find("metadata")) + + track_conf["trackfiles"].append( + ( + multi_bigwig_paths, # Passing an array of paths to represent as one track + "bigwig_multiple", + "MultiBigWig", # Giving an hardcoded name for now + {}, # No metadata for multiple bigwig + ) + ) + + track_conf["category"] = track.attrib["cat"] + track_conf["format"] = track.attrib["format"] + try: + # Only pertains to gff3 + blastxml. TODO? + track_conf["style"] = {t.tag: t.text for t in track.find("options/style")} + except TypeError: + track_conf["style"] = {} + pass + track_conf["conf"] = etree_to_dict(track.find("options")) + jc.process_annotations(track_conf) + print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist) + print( + "###done processing, trackIdlist=", + jc.trackIdlist, + "config=", + str(jc.config_json), + ) + jc.config_json["tracks"] = jc.tracksToAdd + if jc.usejson: + jc.write_config() + jc.add_default_view() diff -r 42ca8804cd93 -r 88b9b105c09b jbrowse2/jbrowse2_json.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/jbrowse2/jbrowse2_json.py Fri Jan 05 01:58:02 2024 +0000 @@ -0,0 +1,1040 @@ +#!/usr/bin/env python +# change to accumulating all configuration for config.json based on the default from the clone +import argparse +import datetime +import hashlib +import json +import logging +import os +import shutil +import subprocess +import tempfile +import xml.etree.ElementTree as ET +from collections import defaultdict + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("jbrowse") +TODAY = datetime.datetime.now().strftime("%Y-%m-%d") +GALAXY_INFRASTRUCTURE_URL = None +mapped_chars = { + ">": "__gt__", + "<": "__lt__", + "'": "__sq__", + '"': "__dq__", + "[": "__ob__", + "]": "__cb__", + "{": "__oc__", + "}": "__cc__", + "@": "__at__", + "#": "__pd__", + "": "__cn__", +} + + +def etree_to_dict(t): + if t is None: + return {} + + d = {t.tag: {} if t.attrib else None} + children = list(t) + if children: + dd = defaultdict(list) + for dc in map(etree_to_dict, children): + for k, v in dc.items(): + dd[k].append(v) + d = {t.tag: {k: v[0] if len(v) == 1 else v for k, v in dd.items()}} + if t.attrib: + d[t.tag].update(("@" + k, v) for k, v in t.attrib.items()) + if t.text: + text = t.text.strip() + if children or t.attrib: + if text: + d[t.tag]["#text"] = text + else: + d[t.tag] = text + return d + + +INSTALLED_TO = os.path.dirname(os.path.realpath(__file__)) + + +def metadata_from_node(node): + metadata = {} + try: + if len(node.findall("dataset")) != 1: + # exit early + return metadata + except Exception: + return {} + + for (key, value) in node.findall("dataset")[0].attrib.items(): + metadata["dataset_%s" % key] = value + + for (key, value) in node.findall("history")[0].attrib.items(): + metadata["history_%s" % key] = value + + for (key, value) in node.findall("metadata")[0].attrib.items(): + metadata["metadata_%s" % key] = value + + for (key, value) in node.findall("tool")[0].attrib.items(): + metadata["tool_%s" % key] = value + + # Additional Mappings applied: + metadata[ + "dataset_edam_format" + ] = '{1}'.format( + metadata["dataset_edam_format"], metadata["dataset_file_ext"] + ) + metadata["history_user_email"] = '{0}'.format( + metadata["history_user_email"] + ) + metadata["hist_name"] = metadata["history_display_name"] + metadata[ + "history_display_name" + ] = '{hist_name}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_hist_id=metadata["history_id"], + hist_name=metadata["history_display_name"], + ) + metadata[ + "tool_tool" + ] = '{tool_id}'.format( + galaxy=GALAXY_INFRASTRUCTURE_URL, + encoded_id=metadata["dataset_id"], + tool_id=metadata["tool_tool_id"], + # tool_version=metadata['tool_tool_version'], + ) + return metadata + + +class JbrowseConnector(object): + def __init__(self, jbrowse, outdir, genomes, standalone=None): + self.debug = False + self.giURL = GALAXY_INFRASTRUCTURE_URL + self.jbrowse = jbrowse + self.outdir = outdir + os.makedirs(self.outdir, exist_ok=True) + self.genome_paths = genomes + self.standalone = standalone + self.trackIdlist = [] + self.tracksToAdd = [] + self.config_json = {} + self.config_json_file = os.path.realpath(os.path.join(outdir, "config.json")) + if standalone == "complete": + self.clone_jbrowse(self.jbrowse, self.outdir) + elif standalone == "minimal": + self.clone_jbrowse(self.jbrowse, self.outdir, minimal=True) + + def subprocess_check_call(self, command, output=None): + if output: + if self.debug: + log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output) + subprocess.check_call(command, cwd=self.outdir, stdout=output) + else: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + subprocess.check_call(command, cwd=self.outdir) + + def subprocess_popen(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, command) + p = subprocess.Popen( + command, + shell=True, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + output, err = p.communicate() + retcode = p.returncode + if retcode != 0: + log.error("cd %s && %s", self.outdir, command) + log.error(output) + log.error(err) + raise RuntimeError("Command failed with exit code %s" % (retcode)) + + def subprocess_check_output(self, command): + if self.debug: + log.debug("cd %s && %s", self.outdir, " ".join(command)) + return subprocess.check_output(command, cwd=self.outdir) + + def _jbrowse_bin(self, command): + return os.path.realpath(os.path.join(self.jbrowse, "bin", command)) + + def symlink_or_copy(self, src, dest): + if "GALAXY_JBROWSE_SYMLINKS" in os.environ and bool( + os.environ["GALAXY_JBROWSE_SYMLINKS"] + ): + cmd = ["ln", "-s", src, dest] + else: + cmd = ["cp", src, dest] + + return self.subprocess_check_call(cmd) + + def _add_track(self, track_data): + if len(track_data) == 0: + return + cmd = [ + "jbrowse", + "add-track", + track_data["path"], + "-t", + track_data["type"], + "-n", + track_data["name"], + "-l", + "move", + "--trackId", + track_data["label"], + "--target", + self.outdir, + ] + if track_data.get("indexfile"): + cmd += ["--indexFile", track_data["indexfile"]] + if track_data.get("category"): + for c in track_data["category"]: + cmd += ["--category", c] + + def process_genomes(self): + assemblies = [] + for i, genome_node in enumerate(self.genome_paths): + log.info("genome_node=%s" % str(genome_node)) + # We only expect one input genome per run. This for loop is just + # easier to write than the alternative / catches any possible + # issues. + genome_name = genome_node["meta"]["dataset_dname"] + dsId = genome_node["meta"]["dataset_id"] + fapath = genome_node["path"] + faname = genome_name + ".fa.gz" + faind = os.path.realpath(os.path.join(self.outdir, faname + ".gzi")) + if True or self.standalone == "complete": + fadest = os.path.realpath(os.path.join(self.outdir, faname)) + cmd = "bgzip -i -c %s > %s && samtools faidx %s" % ( + fapath, + fadest, + fadest, + ) + self.subprocess_popen(cmd) + adapter = { + "type": "BgzipFastaAdapter", + "fastaLocation": { + "uri": faname, + }, + "faiLocation": { + "uri": faname + ".fai", + }, + "gziLocation": { + "uri": faname + ".gzi", + }, + } + else: + faurl = "%s/api/datasets/%s/display" % (self.giURL, dsId) + fastalocation = { + "uri": faurl, + } + failocation = { + "uri": faname + ".fai", + } + adapter = { + "type": "IndexedFastaAdapter", + "fastaLocation": fastalocation, + "faiLocation": failocation, + } + + cmd = ["samtools", "faidx", fapath, "--fai-idx", faind] + self.subprocess_check_call(cmd) + trackDict = { + "name": genome_name, + "sequence": { + "type": "ReferenceSequenceTrack", + "trackId": genome_name, + "adapter": adapter, + }, + "rendering": {"type": "DivSequenceRenderer"}, + } + assemblies.append(trackDict) + # self.config_json["assemblies"] = assemblies + self.genome_name = genome_name + cmd = [ + "jbrowse", + "add-assembly", + faname, + "-t", + "bgzipFasta", + "-n", + genome_name, + "--load", + "inPlace", + "--faiLocation", + faname + ".fai", + "--gziLocation", + faname + ".gzi", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_default_view(self): + cmd = [ + "jbrowse", + "set-default-session", + "-s", + self.config_json_file, + "-t", + ",".join(self.trackIdlist), + "-n", + "JBrowse2 in Galaxy", + "--target", + self.config_json_file, + "-v", + " LinearGenomeView", + ] + if True or self.debug: + log.info("### calling set-default-session with cmd=%s" % " ".join(cmd)) + self.subprocess_check_call(cmd) + + def write_config(self): + with open(self.config_json_file, "w") as fp: + json.dump(self.config_json, fp) + + def add_hic(self, data, trackData): + """ + HiC adapter. + https://github.com/aidenlab/hic-format/blob/master/HiCFormatV9.md + for testing locally, these work: + HiC data is from https://s3.amazonaws.com/igv.broadinstitute.org/data/hic/intra_nofrag_30.hic + using hg19 reference track as a + 'BgzipFastaAdapter' + fastaLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz', + faiLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.fai', + gziLocation: + uri: 'https://s3.amazonaws.com/jbrowse.org/genomes/GRCh38/fasta/GRCh38.fa.gz.gzi', + Cool will not be likely to be a good fit - see discussion at https://github.com/GMOD/jbrowse-components/issues/2438 + """ + log.info("#### trackData=%s" % trackData) + tId = trackData["label"] + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=hic " % ( + self.giURL, + dsId, + ) + hname = trackData["name"] + if True or self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, hname)) + url = hname + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + floc = { + "uri": hname, + } + else: + url = "%s/api/datasets/%s/display?to_ext=hic" % (self.giURL, dsId) + floc = { + "uri": url, + } + trackDict = { + "type": "HicTrack", + "trackId": tId, + "name": hname, + "assemblyNames": [self.genome_name], + "adapter": { + "type": "HicAdapter", + "hicLocation": floc, + }, + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "HicTrack", + "-a", + self.genome_name, + "-n", + hname, + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_maf(self, data, trackData): + """ + from https://github.com/cmdcolin/maf2bed + Note: Both formats start with a MAF as input, and note that your MAF file should contain the species name and chromosome name + e.g. hg38.chr1 in the sequence identifiers. + need the reference id - eg hg18, for maf2bed.pl as the first parameter + """ + mafPlugin = { + "plugins": [ + { + "name": "MafViewer", + "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js", + } + ] + } + tId = trackData["label"] + fname = "%s.bed" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + # self.symlink_or_copy(data, dest) + # Process MAF to bed-like. Need build to munge chromosomes + gname = self.genome_name + cmd = [ + "bash", + os.path.join(INSTALLED_TO, "convertMAF.sh"), + data, + gname, + INSTALLED_TO, + dest, + ] + self.subprocess_check_call(cmd) + if True or self.debug: + log.info("### convertMAF.sh called as %s" % " ".join(cmd)) + # Construct samples list + # We could get this from galaxy metadata, not sure how easily. + ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE) + output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout) + ps.wait() + outp = output.decode("ascii") + soutp = outp.split("\n") + samp = [x.split("s ")[1] for x in soutp if x.startswith("s ")] + samples = [x.split(".")[0] for x in samp] + if self.debug: + log.info("### got samples = %s " % (samples)) + trackDict = { + "type": "MafTrack", + "trackId": tId, + "name": trackData["name"], + "adapter": { + "type": "MafTabixAdapter", + "samples": samples, + "bedGzLocation": { + "uri": fname + ".sorted.bed.gz", + }, + "index": { + "location": { + "uri": fname + ".sorted.bed.gz.tbi", + }, + }, + }, + "assemblyNames": [self.genome_name], + } + self.tracksToAdd.append(trackDict) + self.trackIdlist.append(tId) + if self.config_json.get("plugins", None): + self.config_json["plugins"].append(mafPlugin[0]) + else: + self.config_json.update(mafPlugin) + + def _blastxml_to_gff3(self, xml, min_gap=10): + gff3_unrebased = tempfile.NamedTemporaryFile(delete=False) + cmd = [ + "python", + os.path.join(INSTALLED_TO, "blastxml_to_gapped_gff3.py"), + "--trim", + "--trim_end", + "--include_seq", + "--min_gap", + str(min_gap), + xml, + ] + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_unrebased) + gff3_unrebased.close() + return gff3_unrebased.name + + def add_blastxml(self, data, trackData, blastOpts, **kwargs): + gff3 = self._blastxml_to_gff3(data, min_gap=blastOpts["min_gap"]) + + if "parent" in blastOpts and blastOpts["parent"] != "None": + gff3_rebased = tempfile.NamedTemporaryFile(delete=False) + cmd = ["python", os.path.join(INSTALLED_TO, "gff3_rebase.py")] + if blastOpts.get("protein", "false") == "true": + cmd.append("--protein2dna") + cmd.extend([os.path.realpath(blastOpts["parent"]), gff3]) + subprocess.check_call(cmd, cwd=self.outdir, stdout=gff3_rebased) + gff3_rebased.close() + + # Replace original gff3 file + shutil.copy(gff3_rebased.name, gff3) + os.unlink(gff3_rebased.name) + url = "%s.gff3" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(gff3, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + os.unlink(gff3) + + def add_bigwig(self, data, trackData): + url = "%s.bw" % trackData["name"] + if True or self.standalone == "complete": + dest = os.path.realpath(os.path.join(self.outdir, url)) + cmd = ["cp", data, dest] + self.subprocess_check_call(cmd) + bwloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=fasta" % (self.giURL, dsId) + bwloc = {"uri": url} + tId = trackData["label"] + trackDict = { + "type": "QuantitativeTrack", + "trackId": tId, + "name": url, + "assemblyNames": [ + self.genome_name, + ], + "adapter": { + "type": "BigWigAdapter", + "bigWigLocation": bwloc, + }, + "displays": [ + { + "type": "LinearWiggleDisplay", + "displayId": "%s-LinearWiggleDisplay" % tId, + } + ], + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "QuantitativeTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bam(self, data, trackData, bamOpts, bam_index=None, **kwargs): + tId = trackData["label"] + fname = "%s.bam" % trackData["label"] + dest = os.path.realpath("%s/%s" % (self.outdir, fname)) + if True or self.standalone == "complete": + url = fname + self.subprocess_check_call(["cp", data, dest]) + log.info("### copied %s to %s" % (data, dest)) + bloc = {"uri": url} + else: + dsId = trackData["metadata"]["dataset_id"] + url = "%s/api/datasets/%s/display?to_ext=bam" % (self.giURL, dsId) + bloc = {"uri": url} + if bam_index is not None and os.path.exists(os.path.realpath(bam_index)): + # bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest + self.subprocess_check_call( + ["cp", os.path.realpath(bam_index), dest + ".bai"] + ) + else: + # Can happen in exotic condition + # e.g. if bam imported as symlink with datatype=unsorted.bam, then datatype changed to bam + # => no index generated by galaxy, but there might be one next to the symlink target + # this trick allows to skip the bam sorting made by galaxy if already done outside + if os.path.exists(os.path.realpath(data) + ".bai"): + self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai") + else: + log.warn("Could not find a bam index (.bai file) for %s", data) + trackDict = { + "type": "AlignmentsTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BamAdapter", + "bamLocation": bloc, + "index": { + "location": { + "uri": fname + ".bai", + } + }, + }, + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + fname, + "-t", + "AlignmentsTrack", + "-l", + "inPlace", + "-a", + self.genome_name, + "--indexFile", + fname + ".bai", + "-n", + trackData["name"], + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_vcf(self, data, trackData): + tId = trackData["label"] + url = "%s/api/datasets/%s/display" % ( + self.giURL, + trackData["metadata"]["dataset_id"], + ) + url = "%s.vcf.gz" % tId + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + cmd = "bgzip -c %s > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-p", "vcf", dest] + self.subprocess_check_call(cmd) + trackDict = { + "type": "VariantTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "VcfTabixAdapter", + "vcfGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearVariantDisplay", + "displayId": "%s-LinearVariantDisplay" % tId, + }, + { + "type": "ChordVariantDisplay", + "displayId": "%s-ChordVariantDisplay" % tId, + }, + { + "type": "LinearPairedArcDisplay", + "displayId": "%s-LinearPairedArcDisplay" % tId, + }, + ], + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "VariantTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def _sort_gff(self, data, dest): + # Only index if not already done + if not os.path.exists(dest + ".gz"): + cmd = "jbrowse sort-gff %s | bgzip -c > %s.gz" % ( + data, + dest, + ) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'" + self.subprocess_popen(cmd) + self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"]) + + def _sort_bed(self, data, dest): + # Only index if not already done + if not os.path.exists(dest): + cmd = "sort -k1,1 -k2,2n %s | bgzip -c > %s" % (data, dest) + self.subprocess_popen(cmd) + cmd = ["tabix", "-f", "-p", "bed", dest] + self.subprocess_check_call(cmd) + + def add_gff(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s" % (self.outdir, url)) + self._sort_gff(data, dest) + url = url + ".gz" + tId = trackData["label"] + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "Gff3TabixAdapter", + "gffGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def add_bed(self, data, ext, trackData): + url = "%s.%s" % (trackData["label"], ext) + dest = os.path.realpath("%s/%s.gz" % (self.outdir, url)) + self._sort_bed(data, dest) + tId = trackData["label"] + url = url + ".gz" + trackDict = { + "type": "FeatureTrack", + "trackId": tId, + "name": trackData["name"], + "assemblyNames": [self.genome_name], + "adapter": { + "type": "BedTabixAdapter", + "bedGzLocation": { + "uri": url, + }, + "index": { + "location": { + "uri": url + ".tbi", + } + }, + }, + "displays": [ + { + "type": "LinearBasicDisplay", + "displayId": "%s-LinearBasicDisplay" % tId, + }, + {"type": "LinearArcDisplay", "displayId": "%s-LinearArcDisplay" % tId}, + ], + } + # self.tracksToAdd.append(trackDict) + # self.trackIdlist.append(tId) + cmd = [ + "jbrowse", + "add-track", + url, + "-t", + "FeatureTrack", + "-a", + self.genome_name, + "--indexFile", + url + ".tbi", + "-n", + trackData["name"], + "--load", + "inPlace", + "--target", + self.outdir, + ] + self.subprocess_check_call(cmd) + + def process_annotations(self, track): + category = track["category"].replace("__pd__date__pd__", TODAY) + for i, ( + dataset_path, + dataset_ext, + track_human_label, + extra_metadata, + ) in enumerate(track["trackfiles"]): + # Unsanitize labels (element_identifiers are always sanitized by Galaxy) + for key, value in mapped_chars.items(): + track_human_label = track_human_label.replace(value, key) + outputTrackConfig = { + "category": category, + } + if self.debug: + log.info( + "Processing category = %s, track_human_label = %s", + category, + track_human_label, + ) + # We add extra data to hash for the case of REST + SPARQL. + if ( + "conf" in track + and "options" in track["conf"] + and "url" in track["conf"]["options"] + ): + rest_url = track["conf"]["options"]["url"] + else: + rest_url = "" + + # I chose to use track['category'] instead of 'category' here. This + # is intentional. This way re-running the tool on a different date + # will not generate different hashes and make comparison of outputs + # much simpler. + hashData = [ + str(dataset_path), + track_human_label, + track["category"], + rest_url, + ] + hashData = "|".join(hashData).encode("utf-8") + outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i + outputTrackConfig["metadata"] = extra_metadata + outputTrackConfig["name"] = track_human_label + + if dataset_ext in ("gff", "gff3"): + self.add_gff( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("hic",): + self.add_hic( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext in ("bed",): + self.add_bed( + dataset_path, + dataset_ext, + outputTrackConfig, + ) + elif dataset_ext in ("maf",): + self.add_maf( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bigwig": + self.add_bigwig( + dataset_path, + outputTrackConfig, + ) + elif dataset_ext == "bam": + real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][ + "bam_index" + ] + if not isinstance(real_indexes, list): + # + # /path/to/a.bam.bai + # + # + # The above will result in the 'bam_index' key containing a + # string. If there are two or more indices, the container + # becomes a list. Fun! + real_indexes = [real_indexes] + + self.add_bam( + dataset_path, + outputTrackConfig, + track["conf"]["options"]["pileup"], + bam_index=real_indexes[i], + ) + elif dataset_ext == "blastxml": + self.add_blastxml( + dataset_path, outputTrackConfig, track["conf"]["options"]["blast"] + ) + elif dataset_ext == "vcf": + self.add_vcf(dataset_path, outputTrackConfig) + else: + log.warn("Do not know how to handle %s", dataset_ext) + + def clone_jbrowse(self, jbrowse_dir, destination, minimal=False): + """Clone a JBrowse directory into a destination directory.""" + cmd = ["jbrowse", "create", "-f", self.outdir] + self.subprocess_check_call(cmd) + for fn in [ + "asset-manifest.json", + "favicon.ico", + "robots.txt", + "umd_plugin.js", + "version.txt", + "test_data", + ]: + cmd = ["rm", "-rf", os.path.join(self.outdir, fn)] + self.subprocess_check_call(cmd) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="", epilog="") + parser.add_argument("xml", type=argparse.FileType("r"), help="Track Configuration") + + parser.add_argument("--jbrowse", help="Folder containing a jbrowse release") + parser.add_argument("--outdir", help="Output directory", default="out") + parser.add_argument( + "--standalone", + choices=["complete", "minimal", "data"], + help="Standalone mode includes a copy of JBrowse", + ) + parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.8.0") + args = parser.parse_args() + + tree = ET.parse(args.xml.name) + root = tree.getroot() + + # This should be done ASAP + GALAXY_INFRASTRUCTURE_URL = root.find("metadata/galaxyUrl").text + # Sometimes this comes as `localhost` without a protocol + if not GALAXY_INFRASTRUCTURE_URL.startswith("http"): + # so we'll prepend `http://` and hope for the best. Requests *should* + # be GET and not POST so it should redirect OK + GALAXY_INFRASTRUCTURE_URL = "http://" + GALAXY_INFRASTRUCTURE_URL + + jc = JbrowseConnector( + jbrowse=args.jbrowse, + outdir=args.outdir, + genomes=[ + { + "path": os.path.realpath(x.attrib["path"]), + "meta": metadata_from_node(x.find("metadata")), + } + for x in root.findall("metadata/genomes/genome") + ], + standalone=args.standalone, + ) + jc.process_genomes() + + for track in root.findall("tracks/track"): + track_conf = {} + track_conf["trackfiles"] = [] + + is_multi_bigwig = False + try: + if track.find("options/wiggle/multibigwig") and ( + track.find("options/wiggle/multibigwig").text == "True" + ): + is_multi_bigwig = True + multi_bigwig_paths = [] + except KeyError: + pass + + trackfiles = track.findall("files/trackFile") + if trackfiles: + for x in track.findall("files/trackFile"): + if is_multi_bigwig: + multi_bigwig_paths.append( + (x.attrib["label"], os.path.realpath(x.attrib["path"])) + ) + else: + if trackfiles: + metadata = metadata_from_node(x.find("metadata")) + track_conf["dataset_id"] = metadata["dataset_id"] + track_conf["trackfiles"].append( + ( + os.path.realpath(x.attrib["path"]), + x.attrib["ext"], + x.attrib["label"], + metadata, + ) + ) + else: + # For tracks without files (rest, sparql) + track_conf["trackfiles"].append( + ( + "", # N/A, no path for rest or sparql + track.attrib["format"], + track.find("options/label").text, + {}, + ) + ) + + if is_multi_bigwig: + metadata = metadata_from_node(x.find("metadata")) + + track_conf["trackfiles"].append( + ( + multi_bigwig_paths, # Passing an array of paths to represent as one track + "bigwig_multiple", + "MultiBigWig", # Giving an hardcoded name for now + {}, # No metadata for multiple bigwig + ) + ) + + track_conf["category"] = track.attrib["cat"] + track_conf["format"] = track.attrib["format"] + try: + # Only pertains to gff3 + blastxml. TODO? + track_conf["style"] = {t.tag: t.text for t in track.find("options/style")} + except TypeError: + track_conf["style"] = {} + pass + track_conf["conf"] = etree_to_dict(track.find("options")) + jc.process_annotations(track_conf) + print("## processed", str(track_conf), "trackIdlist", jc.trackIdlist) + print( + "###done processing, trackIdlist=", + jc.trackIdlist, + "config=", + str(jc.config_json), + ) + jc.config_json["tracks"] = jc.tracksToAdd + # jc.write_config() + jc.add_default_view()