# HG changeset patch
# User fubar
# Date 1706594703 0
# Node ID bde6b1d09f7d0b0f03026df29bf4f8263149eded
# Parent 2e6c489108199f56d3b9221d5ce5fa5780efb9fb
planemo upload for repository https://github.com/usegalaxy-eu/temporary-tools/tree/master/jbrowse2 commit 1290bf486bc55c02fecd0327de10a28655a18e81-dirty
diff -r 2e6c48910819 -r bde6b1d09f7d GFFOutput.py
--- a/GFFOutput.py Mon Jan 29 02:34:43 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,213 +0,0 @@
-"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
-
-The target format is GFF3, the current GFF standard:
- http://www.sequenceontology.org/gff3.shtml
-"""
-from six.moves import urllib
-
-from Bio import SeqIO
-
-
-class _IdHandler:
- """Generate IDs for GFF3 Parent/Child
- relationships where they don't exist."""
-
- def __init__(self):
- self._prefix = "biopygen"
- self._counter = 1
- self._seen_ids = []
-
- def _generate_id(self, quals):
- """Generate a unique ID not present in our existing IDs."""
- gen_id = self._get_standard_id(quals)
- if gen_id is None:
- while 1:
- gen_id = "%s%s" % (self._prefix, self._counter)
- if gen_id not in self._seen_ids:
- break
- self._counter += 1
- return gen_id
-
- def _get_standard_id(self, quals):
- """Retrieve standardized IDs from other sources like NCBI GenBank.
-
- This tries to find IDs from known key/values when stored differently
- than GFF3 specifications.
- """
- possible_keys = ["transcript_id", "protein_id"]
- for test_key in possible_keys:
- if test_key in quals:
- cur_id = quals[test_key]
- if isinstance(cur_id, tuple) or isinstance(cur_id, list):
- return cur_id[0]
- else:
- return cur_id
- return None
-
- def update_quals(self, quals, has_children):
- """Update a set of qualifiers, adding an ID if necessary."""
- cur_id = quals.get("ID", None)
- # if we have an ID, record it
- if cur_id:
- if not isinstance(cur_id, list) and not isinstance(cur_id, tuple):
- cur_id = [cur_id]
- for add_id in cur_id:
- self._seen_ids.append(add_id)
- # if we need one and don't have it, create a new one
- elif has_children:
- new_id = self._generate_id(quals)
- self._seen_ids.append(new_id)
- quals["ID"] = [new_id]
- return quals
-
-
-class GFF3Writer:
- """Write GFF3 files starting with standard Biopython objects."""
-
- def __init__(self):
- pass
-
- def write(self, recs, out_handle, include_fasta=False):
- """Write the provided records to the given handle in GFF3 format."""
- id_handler = _IdHandler()
- self._write_header(out_handle)
- fasta_recs = []
- try:
- recs = iter(recs)
- except TypeError:
- recs = [recs]
- for rec in recs:
- self._write_rec(rec, out_handle)
- self._write_annotations(rec.annotations, rec.id, len(rec.seq), out_handle)
- for sf in rec.features:
- sf = self._clean_feature(sf)
- id_handler = self._write_feature(sf, rec.id, out_handle, id_handler)
- if include_fasta and len(rec.seq) > 0:
- fasta_recs.append(rec)
- if len(fasta_recs) > 0:
- self._write_fasta(fasta_recs, out_handle)
-
- def _clean_feature(self, feature):
- quals = {}
- for key, val in feature.qualifiers.items():
- if not isinstance(val, (list, tuple)):
- val = [val]
- val = [str(x) for x in val]
- quals[key] = val
- feature.qualifiers = quals
- # Support for Biopython 1.68 and above, which removed sub_features
- if not hasattr(feature, "sub_features"):
- feature.sub_features = []
- clean_sub = [self._clean_feature(f) for f in feature.sub_features]
- feature.sub_features = clean_sub
- return feature
-
- def _write_rec(self, rec, out_handle):
- # if we have a SeqRecord, write out optional directive
- if len(rec.seq) > 0:
- out_handle.write("##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq)))
-
- def _get_phase(self, feature):
- if "phase" in feature.qualifiers:
- phase = feature.qualifiers["phase"][0]
- elif feature.type == "CDS":
- phase = int(feature.qualifiers.get("codon_start", [1])[0]) - 1
- else:
- phase = "."
- return str(phase)
-
- def _write_feature(self, feature, rec_id, out_handle, id_handler, parent_id=None):
- """Write a feature with location information."""
- if feature.location.strand == 1:
- strand = "+"
- elif feature.location.strand == -1:
- strand = "-"
- else:
- strand = "."
- # remove any standard features from the qualifiers
- quals = feature.qualifiers.copy()
- for std_qual in ["source", "score", "phase"]:
- if std_qual in quals and len(quals[std_qual]) == 1:
- del quals[std_qual]
- # add a link to a parent identifier if it exists
- if parent_id:
- if "Parent" not in quals:
- quals["Parent"] = []
- quals["Parent"].append(parent_id)
- quals = id_handler.update_quals(quals, len(feature.sub_features) > 0)
- if feature.type:
- ftype = feature.type
- else:
- ftype = "sequence_feature"
- parts = [
- str(rec_id),
- feature.qualifiers.get("source", ["feature"])[0],
- ftype,
- str(feature.location.start + 1), # 1-based indexing
- str(feature.location.end),
- feature.qualifiers.get("score", ["."])[0],
- strand,
- self._get_phase(feature),
- self._format_keyvals(quals),
- ]
- out_handle.write("\t".join(parts) + "\n")
- for sub_feature in feature.sub_features:
- id_handler = self._write_feature(
- sub_feature,
- rec_id,
- out_handle,
- id_handler,
- quals["ID"][0],
- )
- return id_handler
-
- def _format_keyvals(self, keyvals):
- format_kvs = []
- for key in sorted(keyvals.keys()):
- values = keyvals[key]
- key = key.strip()
- format_vals = []
- if not isinstance(values, list) or isinstance(values, tuple):
- values = [values]
- for val in values:
- val = urllib.parse.quote(str(val).strip(), safe=":/ ")
- if (key and val) and val not in format_vals:
- format_vals.append(val)
- format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
- return ";".join(format_kvs)
-
- def _write_annotations(self, anns, rec_id, size, out_handle):
- """Add annotations which refer to an entire sequence."""
- format_anns = self._format_keyvals(anns)
- if format_anns:
- parts = [
- rec_id,
- "annotation",
- "remark",
- "1",
- str(size if size > 1 else 1),
- ".",
- ".",
- ".",
- format_anns,
- ]
- out_handle.write("\t".join(parts) + "\n")
-
- def _write_header(self, out_handle):
- """Write out standard header directives."""
- out_handle.write("##gff-version 3\n")
-
- def _write_fasta(self, recs, out_handle):
- """Write sequence records using the ##FASTA directive."""
- out_handle.write("##FASTA\n")
- SeqIO.write(recs, out_handle, "fasta")
-
-
-def write(recs, out_handle, include_fasta=False):
- """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
-
- If include_fasta is True, the GFF3 file will include sequence information
- using the ##FASTA directive.
- """
- writer = GFF3Writer()
- return writer.write(recs, out_handle, include_fasta)
diff -r 2e6c48910819 -r bde6b1d09f7d autogenJB2.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/autogenJB2.py Tue Jan 30 06:05:03 2024 +0000
@@ -0,0 +1,88 @@
+import argparse
+import os
+
+from jbrowse2 import jbrowseConnector as jbC
+
+
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="", epilog="")
+ parser.add_argument("--yaml", help="Track Configuration")
+ parser.add_argument("--outdir", help="Output directory", default="out")
+ parser.add_argument("--version", "-V", action="version", version="%(prog)s 0.0.1")
+ args = parser.parse_args()
+ if os.path.exists(args.outdir):
+ root = args.outdir
+ dirList = list(os.scandir(root))  # materialise so it can be iterated more than once
+ subdirs = [f.path for f in dirList if f.is_dir()]
+ genome_paths = [f.path for f in dirList if f.is_file() and f.name.startswith('genome')]
+ if len(genome_paths) > 0:
+ genome_fnames = ['_'.join(os.path.basename(x).split('_')[2:]) for x in genome_paths] # expect genome_1_genomename.fasta etc
+ jc = jbC(
+ outdir=args.outdir,
+ genomes=[
+ {
+ "path": x,
+ "meta": {"name" : genome_fnames[i], },
+ }
+ for i,x in enumerate(genome_paths)
+ ],
+ )
+ jc.process_genomes()
+ # .add_default_view() replace from https://github.com/abretaud/tools-iuc/blob/jbrowse2/tools/jbrowse2/jbrowse2.py
+ default_session_data = {
+ "visibility": {
+ "default_on": [],
+ "default_off": [],
+ },
+ "style": {},
+ "style_labels": {},
+ }
+
+ track_paths = [f.path for f in dirList if f.is_file() and not f.name.startswith('genome')]
+ for i, track in enumerate(track_paths):
+ track_conf = {}
+ track_conf['format'] = os.path.basename(track).split('_')[0]
+ track_conf["name"] = '_'.join(os.path.basename(track).split('_')[2:]) # expect <format>_<n>_<trackname>.<ext>
+ fext = os.path.splitext(os.path.basename(track))[1].replace('.', '')
+ track_conf["label"] = "%s_%i" % (os.path.basename(track), i)
+ track_conf["trackfiles"] = []
+ keys = jc.process_annotations(track_conf)
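+ # keys (assumed to be the identifiers process_annotations returns for the
+ # newly added tracks) drive the default-session bookkeeping below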
+
+ if keys:
+ for key in keys:
+ default_session_data["visibility"][
+ track.attrib.get("visibility", "default_off")
+ ].append(key)
+ if track_conf.get("style", None):
+ default_session_data["style"][key] = track_conf[
+ "style"
+ ] # TODO do we need this anymore?
+ if track_conf.get("style_lables", None):
+ default_session_data["style_labels"][key] = track_conf.get(
+ "style_labels", None
+ )
+ # default_session_data["defaultLocation"] = root.find(
+ # "metadata/general/defaultLocation"
+ # ).text
+ # default_session_data["session_name"] = root.find(
+ # "metadata/general/session_name"
+ # ).text
+ # general_data = {
+ # "analytics": root.find("metadata/general/analytics").text,
+ # "primary_color": root.find("metadata/general/primary_color").text,
+ # "secondary_color": root.find("metadata/general/secondary_color").text,
+ # "tertiary_color": root.find("metadata/general/tertiary_color").text,
+ # "quaternary_color": root.find("metadata/general/quaternary_color").text,
+ # "font_size": root.find("metadata/general/font_size").text,
+ # }
+ # jc.add_general_configuration(general_data)
+ trackconf = jc.config_json.get("tracks", None)
+ if trackconf:
+ jc.config_json["tracks"].update(jc.tracksToAdd)
+ else:
+ jc.config_json["tracks"] = jc.tracksToAdd
+ jc.write_config()
+ jc.add_default_session(default_session_data)
+ # jc.text_index() not sure what broke here.
diff -r 2e6c48910819 -r bde6b1d09f7d autogenJB2.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/autogenJB2.xml Tue Jan 30 06:05:03 2024 +0000
@@ -0,0 +1,148 @@
+
+ Files to JBrowse2
+
+ macros.xml
+
+
+
+ jbrowse2
+
+
+ python '${__tool_directory__}/autogenJB2.py' --version
+ to test the files I want to test. Hmph.
+#if str($uglyTestingHack) == "enabled":
+ && cp '$trackxml' '$output'
+#end if
+ ]]>
+
+
+
+
+
+
+
+
+
+ `__, reviewer.
+It is maintained by https://github.com/fubar2, who can help you
+with missing features or bugs in the tool. For the record, he remains unconvinced by the reviewer's logic,
+and disturbed by the distinctly coercive approach to introducing new code,
+compared to the more usual method of providing a working PR.
+
+Options
+-------
+
+**Reference or Assembly**
+
+Choose either a built-in genome or select one from your history.
+
+Track coordinates and contig names *must* match this reference precisely
+or they will not display.
+
+**Track Groups** represent a set of tracks in a single category.
+
+Annotation Tracks
+-----------------
+
+GFF3/BED
+~~~~~~~~
+
+Standard feature tracks. They usually highlight genes, mRNAs and other features of interest along a genomic region.
+
+When a track contains tens of millions of features, such as repeat regions from a VGP assembly, drawing every feature
+individually leads to extremely slow loading times when a large region is in view, unless the "LinearPileupDisplay"
+display option is selected for that track in the styling options section. The default, LinearBasicDisplay, shows all
+feature details and works well for relatively sparse BED files. For very dense data, a better option is to make a
+bigwig coverage track over a set of windows based on the length of each assembly or reference contig, as sketched below.
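+
+A minimal sketch of that windowed-coverage idea, assuming pyBigWig is installed;
+the function name, parameters and file layout are illustrative and not part of
+this tool::
+
+    import collections
+
+    import pyBigWig
+
+    def bed_to_window_bigwig(bed_path, contig_lengths, out_path, width=10000):
+        """Count BED feature starts per fixed-size window and write a bigwig."""
+        counts = collections.defaultdict(lambda: collections.defaultdict(int))
+        with open(bed_path) as bed:
+            for line in bed:
+                if not line.strip() or line.startswith(("#", "track", "browser")):
+                    continue
+                fields = line.rstrip("\n").split("\t")
+                counts[fields[0]][int(fields[1]) // width] += 1
+        bw = pyBigWig.open(out_path, "w")
+        bw.addHeader(sorted(contig_lengths.items()))
+        for chrom, length in sorted(contig_lengths.items()):
+            starts = list(range(0, length, width))
+            ends = [min(s + width, length) for s in starts]
+            values = [float(counts[chrom][s // width]) for s in starts]
+            bw.addEntries([chrom] * len(starts), starts, ends=ends, values=values)
+        bw.close()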
+
+BAM Pileups
+~~~~~~~~~~~
+
+We support BAM files and can automatically generate SNP tracks from
+that BAM data.
+
+
+BlastXML
+~~~~~~~~
+
+JiG now supports both blastn and blastp datasets. JiG internally uses a
+blastXML to gapped GFF3 tool to convert your blastxml datasets into a
+format amenable to visualization in JBrowse. This tool is also
+available separately from the IUC on the toolshed.
+
+**Minimum Gap Size** reflects how long a gap must be before it becomes a
+real gap in the processed gff3 file. If the minimum gap size were set much
+higher, say 100nt, many of the smaller gaps would disappear, and the features on
+both sides would be merged into one, longer feature. Runtime and output file
+size grow rapidly as this value is lowered, so *do not set this to a low value
+for large datasets*: you will get extremely large outputs and extremely long
+runtimes. The default was chosen based on the author's experience, but the
+author only works on small viruses. It is *strongly* recommended that
+you filter your blast results before display, e.g. picking out the top
+10 hits or so.
+
+The **Protein blast search** option merely informs the underlying tools that
+they should scale feature coordinates by a factor of three (protein to nucleotide space).
+
+
+@ATTRIBUTION@
+]]>
+
+
diff -r 2e6c48910819 -r bde6b1d09f7d gff3_rebase.py
--- a/gff3_rebase.py Mon Jan 29 02:34:43 2024 +0000
+++ b/gff3_rebase.py Tue Jan 30 06:05:03 2024 +0000
@@ -117,7 +117,7 @@
start *= 3
end *= 3
- if parent.location.strand >= 0:
+ if parent.location.strand is not None and parent.location.strand >= 0:
ns = parent.location.start + start
ne = parent.location.start + end
st = +1
@@ -136,7 +136,8 @@
ns %= 3
if ne < 0:
ne %= 3
-
+ if ns > ne:
+ ne, ns = ns, ne # swap inverted coordinates; FeatureLocation requires start <= end
feature.location = FeatureLocation(ns, ne, strand=st)
if hasattr(feature, "sub_features"):
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/GFFOutput.py
--- a/jb2_GFF/GFFOutput.py Mon Jan 29 02:34:43 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,233 +0,0 @@
-"""Output Biopython SeqRecords and SeqFeatures to GFF3 format.
-
-The target format is GFF3, the current GFF standard:
- http://www.sequenceontology.org/gff3.shtml
-"""
-from six.moves import urllib
-
-from Bio import SeqIO
-
-
-class _IdHandler:
- """Generate IDs for GFF3 Parent/Child
- relationships where they don't exist."""
-
- def __init__(self):
- self._prefix = "biopygen"
- self._counter = 1
- self._seen_ids = []
-
- def _generate_id(self, quals):
- """Generate a unique ID not present in our existing IDs."""
- gen_id = self._get_standard_id(quals)
- if gen_id is None:
- while 1:
- gen_id = "%s%s" % (self._prefix, self._counter)
- if gen_id not in self._seen_ids:
- break
- self._counter += 1
- return gen_id
-
- def _get_standard_id(self, quals):
- """Retrieve standardized IDs from other sources like NCBI GenBank.
-
- This tries to find IDs from known key/values when stored differently
- than GFF3 specifications.
- """
- possible_keys = ["transcript_id", "protein_id"]
- for test_key in possible_keys:
- if test_key in quals:
- cur_id = quals[test_key]
- if isinstance(cur_id, tuple) or isinstance(
- cur_id, list
- ):
- return cur_id[0]
- else:
- return cur_id
- return None
-
- def update_quals(self, quals, has_children):
- """Update a set of qualifiers, adding an ID if necessary."""
- cur_id = quals.get("ID", None)
- # if we have an ID, record it
- if cur_id:
- if not isinstance(cur_id, list) and not isinstance(
- cur_id, tuple
- ):
- cur_id = [cur_id]
- for add_id in cur_id:
- self._seen_ids.append(add_id)
- # if we need one and don't have it, create a new one
- elif has_children:
- new_id = self._generate_id(quals)
- self._seen_ids.append(new_id)
- quals["ID"] = [new_id]
- return quals
-
-
-class GFF3Writer:
- """Write GFF3 files starting with standard Biopython objects."""
-
- def __init__(self):
- pass
-
- def write(self, recs, out_handle, include_fasta=False):
- """Write the provided records to the given handle in GFF3 format."""
- id_handler = _IdHandler()
- self._write_header(out_handle)
- fasta_recs = []
- try:
- recs = iter(recs)
- except TypeError:
- recs = [recs]
- for rec in recs:
- self._write_rec(rec, out_handle)
- self._write_annotations(
- rec.annotations, rec.id, len(rec.seq), out_handle
- )
- for sf in rec.features:
- sf = self._clean_feature(sf)
- id_handler = self._write_feature(
- sf, rec.id, out_handle, id_handler
- )
- if include_fasta and len(rec.seq) > 0:
- fasta_recs.append(rec)
- if len(fasta_recs) > 0:
- self._write_fasta(fasta_recs, out_handle)
-
- def _clean_feature(self, feature):
- quals = {}
- for key, val in feature.qualifiers.items():
- if not isinstance(val, (list, tuple)):
- val = [val]
- val = [str(x) for x in val]
- quals[key] = val
- feature.qualifiers = quals
- # Support for Biopython 1.68 and above, which removed sub_features
- if not hasattr(feature, "sub_features"):
- feature.sub_features = []
- clean_sub = [
- self._clean_feature(f) for f in feature.sub_features
- ]
- feature.sub_features = clean_sub
- return feature
-
- def _write_rec(self, rec, out_handle):
- # if we have a SeqRecord, write out optional directive
- if len(rec.seq) > 0:
- out_handle.write(
- "##sequence-region %s 1 %s\n" % (rec.id, len(rec.seq))
- )
-
- def _get_phase(self, feature):
- if "phase" in feature.qualifiers:
- phase = feature.qualifiers["phase"][0]
- elif feature.type == "CDS":
- phase = (
- int(feature.qualifiers.get("codon_start", [1])[0]) - 1
- )
- else:
- phase = "."
- return str(phase)
-
- def _write_feature(
- self, feature, rec_id, out_handle, id_handler, parent_id=None
- ):
- """Write a feature with location information."""
- if feature.location.strand == 1:
- strand = "+"
- elif feature.location.strand == -1:
- strand = "-"
- else:
- strand = "."
- # remove any standard features from the qualifiers
- quals = feature.qualifiers.copy()
- for std_qual in ["source", "score", "phase"]:
- if std_qual in quals and len(quals[std_qual]) == 1:
- del quals[std_qual]
- # add a link to a parent identifier if it exists
- if parent_id:
- if "Parent" not in quals:
- quals["Parent"] = []
- quals["Parent"].append(parent_id)
- quals = id_handler.update_quals(
- quals, len(feature.sub_features) > 0
- )
- if feature.type:
- ftype = feature.type
- else:
- ftype = "sequence_feature"
- parts = [
- str(rec_id),
- feature.qualifiers.get("source", ["feature"])[0],
- ftype,
- str(feature.location.start + 1), # 1-based indexing
- str(feature.location.end),
- feature.qualifiers.get("score", ["."])[0],
- strand,
- self._get_phase(feature),
- self._format_keyvals(quals),
- ]
- out_handle.write("\t".join(parts) + "\n")
- for sub_feature in feature.sub_features:
- id_handler = self._write_feature(
- sub_feature,
- rec_id,
- out_handle,
- id_handler,
- quals["ID"][0],
- )
- return id_handler
-
- def _format_keyvals(self, keyvals):
- format_kvs = []
- for key in sorted(keyvals.keys()):
- values = keyvals[key]
- key = key.strip()
- format_vals = []
- if not isinstance(values, list) or isinstance(
- values, tuple
- ):
- values = [values]
- for val in values:
- val = urllib.parse.quote(str(val).strip(), safe=":/ ")
- if (key and val) and val not in format_vals:
- format_vals.append(val)
- format_kvs.append("%s=%s" % (key, ",".join(format_vals)))
- return ";".join(format_kvs)
-
- def _write_annotations(self, anns, rec_id, size, out_handle):
- """Add annotations which refer to an entire sequence."""
- format_anns = self._format_keyvals(anns)
- if format_anns:
- parts = [
- rec_id,
- "annotation",
- "remark",
- "1",
- str(size if size > 1 else 1),
- ".",
- ".",
- ".",
- format_anns,
- ]
- out_handle.write("\t".join(parts) + "\n")
-
- def _write_header(self, out_handle):
- """Write out standard header directives."""
- out_handle.write("##gff-version 3\n")
-
- def _write_fasta(self, recs, out_handle):
- """Write sequence records using the ##FASTA directive."""
- out_handle.write("##FASTA\n")
- SeqIO.write(recs, out_handle, "fasta")
-
-
-def write(recs, out_handle, include_fasta=False):
- """High level interface to write GFF3 files from SeqRecords and SeqFeatures.
-
- If include_fasta is True, the GFF3 file will include sequence information
- using the ##FASTA directive.
- """
- writer = GFF3Writer()
- return writer.write(recs, out_handle, include_fasta)
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/GFFParser.py
--- a/jb2_GFF/GFFParser.py Mon Jan 29 02:34:43 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,1099 +0,0 @@
-"""Parse GFF files into features attached to Biopython SeqRecord objects.
-
-This deals with GFF3 formatted files, a tab delimited format for storing
-sequence features and annotations:
-
-http://www.sequenceontology.org/gff3.shtml
-
-It will also deal with older GFF versions (GTF/GFF2):
-
-http://www.sanger.ac.uk/Software/formats/GFF/GFF_Spec.shtml
-http://mblab.wustl.edu/GTF22.html
-
-The implementation utilizes map/reduce parsing of GFF using Disco. Disco
-(http://discoproject.org) is a Map-Reduce framework for Python utilizing
-Erlang for parallelization. The code works on a single processor without
-Disco using the same architecture.
-"""
-import os
-import copy
-import json
-import re
-import collections
-import io
-import itertools
-import warnings
-import six
-from six.moves import urllib
-
-from Bio.SeqRecord import SeqRecord
-from Bio import SeqFeature
-from Bio import SeqIO
-from Bio import BiopythonDeprecationWarning
-
-import disco
-
-# Make defaultdict compatible with versions of python older than 2.4
-try:
- collections.defaultdict
-except AttributeError:
- import _utils
-
- collections.defaultdict = _utils.defaultdict
-
-unknown_seq_avail = False
-try:
- from Bio.Seq import UnknownSeq
-
- unknown_seq_avail = True
-except ImportError:
- # Starting with biopython 1.81, has been removed
- from Bio.Seq import _UndefinedSequenceData
- from Bio.Seq import Seq
-
-
-warnings.simplefilter("ignore", BiopythonDeprecationWarning)
-
-
-def _gff_line_map(line, params):
- """Map part of Map-Reduce; parses a line of GFF into a dictionary.
-
- Given an input line from a GFF file, this:
- - decides if the file passes our filtering limits
- - if so:
- - breaks it into component elements
- - determines the type of attribute (flat, parent, child or annotation)
- - generates a dictionary of GFF info which can be serialized as JSON
- """
-
- def _merge_keyvals(parts):
- """Merge key-values escaped by quotes
- that are improperly split at semicolons."""
- out = []
- for i, p in enumerate(parts):
- if (
- i > 0
- and len(p) == 1
- and p[0].endswith('"')
- and not p[0].startswith('"')
- ):
- if out[-1][-1].startswith('"'):
- prev_p = out.pop(-1)
- to_merge = prev_p[-1]
- prev_p[-1] = "%s; %s" % (to_merge, p[0])
- out.append(prev_p)
- else:
- out.append(p)
- return out
-
- gff3_kw_pat = re.compile(r"\w+=")
-
- def _split_keyvals(keyval_str):
- """Split key-value pairs in a GFF2, GTF and GFF3 compatible way.
-
- GFF3 has key value pairs like:
- count=9;gene=amx-2;sequence=SAGE:aacggagccg
- GFF2 and GTF have:
- Sequence "Y74C9A" ; Note "Clone Y74C9A; Genbank AC024206"
- name "fgenesh1_pg.C_chr_1000003"; transcriptId 869
- """
- quals = collections.defaultdict(list)
- if keyval_str is None:
- return quals
- # ensembl GTF has a stray semi-colon at the end
- if keyval_str[-1] == ";":
- keyval_str = keyval_str[:-1]
- # GFF2/GTF has a semi-colon with at least one space after it.
- # It can have spaces on both sides; wormbase does this.
- # GFF3 works with no spaces.
- # Split at the first one we can recognize as working
- parts = keyval_str.split(" ; ")
- if len(parts) == 1:
- parts = [x.strip() for x in keyval_str.split(";")]
- # check if we have GFF3 style key-vals (with =)
- is_gff2 = True
- if gff3_kw_pat.match(parts[0]):
- is_gff2 = False
- key_vals = _merge_keyvals([p.split("=") for p in parts])
- # otherwise, we are separated by a space with a key as the first item
- else:
- pieces = []
- for p in parts:
- # fix misplaced semi-colons in keys in some GFF2 files
- if p and p[0] == ";":
- p = p[1:]
- pieces.append(p.strip().split(" "))
- key_vals = [(p[0], " ".join(p[1:])) for p in pieces]
- for item in key_vals:
- # standard in-spec items are key=value
- if len(item) == 2:
- key, val = item
- # out-of-spec files can have just key values. We set an empty value
- # which will be changed to true later to standardize.
- else:
- assert len(item) == 1, item
- key = item[0]
- val = ""
- # remove quotes in GFF2 files
- quoted = False
- if len(val) > 0 and val[0] == '"' and val[-1] == '"':
- quoted = True
- val = val[1:-1]
- if val:
- if quoted:
- quals[key].append(val)
- else:
- quals[key].extend(
- [v for v in val.split(",") if v]
- )
- # if we don't have a value, make this a key=True/False style
- # attribute
- else:
- quals[key].append("true")
- for key, vals in quals.items():
- quals[key] = [urllib.parse.unquote(v) for v in vals]
- return quals, is_gff2
-
- def _nest_gff2_features(gff_parts):
- """Provide nesting of GFF2 transcript parts with transcript IDs.
-
- exons and coding sequences are mapped to a parent with a transcript_id
- in GFF2. This is implemented differently at different genome centers
- and this function attempts to resolve that and map things to the GFF3
- way of doing them.
- """
- # map protein or transcript ids to a parent
- for transcript_id in [
- "transcript_id",
- "transcriptId",
- "proteinId",
- ]:
- try:
- gff_parts["quals"]["Parent"] = gff_parts["quals"][
- transcript_id
- ]
- break
- except KeyError:
- pass
- # case for WormBase GFF -- everything labelled as Transcript or CDS
- for flat_name in ["Transcript", "CDS"]:
- if flat_name in gff_parts["quals"]:
- # parent types
- if gff_parts["type"] in [flat_name]:
- if not gff_parts["id"]:
- gff_parts["id"] = gff_parts["quals"][
- flat_name
- ][0]
- gff_parts["quals"]["ID"] = [gff_parts["id"]]
- # children types
- elif gff_parts["type"] in [
- "intron",
- "exon",
- "three_prime_UTR",
- "coding_exon",
- "five_prime_UTR",
- "CDS",
- "stop_codon",
- "start_codon",
- ]:
- gff_parts["quals"]["Parent"] = gff_parts["quals"][
- flat_name
- ]
- break
-
- return gff_parts
-
- strand_map = {"+": 1, "-": -1, "?": None, None: None}
- line = line.strip()
- if line[:2] == "##":
- return [("directive", line[2:])]
- elif line and line[0] != "#":
- parts = line.split("\t")
- should_do = True
- if params.limit_info:
- for limit_name, limit_values in params.limit_info.items():
- cur_id = tuple(
- [parts[i] for i in params.filter_info[limit_name]]
- )
- if cur_id not in limit_values:
- should_do = False
- break
- if should_do:
- assert len(parts) >= 8, line
- # not python2.4 compatible but easier to understand
- # gff_parts = [(None if p == '.' else p) for p in parts]
- gff_parts = []
- for p in parts:
- if p == ".":
- gff_parts.append(None)
- else:
- gff_parts.append(p)
- gff_info = dict()
- # collect all of the base qualifiers for this item
- if len(parts) > 8:
- quals, is_gff2 = _split_keyvals(gff_parts[8])
- else:
- quals, is_gff2 = collections.defaultdict(list), False
- gff_info["is_gff2"] = is_gff2
- if gff_parts[1]:
- quals["source"].append(gff_parts[1])
- if gff_parts[5]:
- quals["score"].append(gff_parts[5])
- if gff_parts[7]:
- quals["phase"].append(gff_parts[7])
- gff_info["quals"] = dict(quals)
- gff_info["rec_id"] = gff_parts[0]
- # if we are describing a location, then we are a feature
- if gff_parts[3] and gff_parts[4]:
- gff_info["location"] = [
- int(gff_parts[3]) - 1,
- int(gff_parts[4]),
- ]
- gff_info["type"] = gff_parts[2]
- gff_info["id"] = quals.get("ID", [""])[0]
- gff_info["strand"] = strand_map.get(
- gff_parts[6], None
- )
- if is_gff2:
- gff_info = _nest_gff2_features(gff_info)
- # features that have parents need to link so we can pick up
- # the relationship
- if "Parent" in gff_info["quals"]:
- # check for self referential parent/child relationships
- # remove the ID, which is not useful
- for p in gff_info["quals"]["Parent"]:
- if p == gff_info["id"]:
- gff_info["id"] = ""
- del gff_info["quals"]["ID"]
- break
- final_key = "child"
- elif gff_info["id"]:
- final_key = "parent"
- # Handle flat features
- else:
- final_key = "feature"
- # otherwise, associate these annotations with the full record
- else:
- final_key = "annotation"
- if params.jsonify:
- return [(final_key, json.dumps(gff_info))]
- else:
- return [(final_key, gff_info)]
- return []
-
-
-def _gff_line_reduce(map_results, out, params):
- """Reduce part of Map-Reduce; combines results of parsed features."""
- final_items = dict()
- for gff_type, final_val in map_results:
- if params.jsonify and gff_type not in ["directive"]:
- final_val = json.loads(final_val)
- try:
- final_items[gff_type].append(final_val)
- except KeyError:
- final_items[gff_type] = [final_val]
- for key, vals in final_items.items():
- if params.jsonify:
- vals = json.dumps(vals)
- out.add(key, vals)
-
-
-class _MultiIDRemapper:
- """Provide an ID remapping for cases where a parent has a non-unique ID.
-
- Real life GFF3 cases have non-unique ID attributes, which we fix here
- by using the unique sequence region to assign children to the right
- parent.
- """
-
- def __init__(self, base_id, all_parents):
- self._base_id = base_id
- self._parents = all_parents
-
- def remap_id(self, feature_dict):
- rstart, rend = feature_dict["location"]
- for index, parent in enumerate(self._parents):
- pstart, pend = parent["location"]
- if rstart >= pstart and rend <= pend:
- if index > 0:
- return "%s_%s" % (self._base_id, index + 1)
- else:
- return self._base_id
- # if we haven't found a location match but parents are umabiguous,
- # return that
- if len(self._parents) == 1:
- return self._base_id
- raise ValueError(
- "Did not find remapped ID location: %s, %s, %s"
- % (
- self._base_id,
- [p["location"] for p in self._parents],
- feature_dict["location"],
- )
- )
-
-
-class _AbstractMapReduceGFF:
- """Base class providing general GFF parsing for local and remote classes.
-
- This class should be subclassed to provide a concrete class to parse
- GFF under specific conditions. These classes need to implement
- the _gff_process function, which returns a dictionary of SeqRecord
- information.
- """
-
- def __init__(self, create_missing=True):
- """Initialize GFF parser
-
- create_missing - If True, create blank records for GFF ids not in
- the base_dict. If False, an error will be raised.
- """
- self._create_missing = create_missing
- self._map_fn = _gff_line_map
- self._reduce_fn = _gff_line_reduce
- self._examiner = GFFExaminer()
-
- def _gff_process(self, gff_files, limit_info, target_lines=None):
- raise NotImplementedError("Derived class must define")
-
- def parse(self, gff_files, base_dict=None, limit_info=None):
- """Parse a GFF file, returning an iterator of SeqRecords.
-
- limit_info - A dictionary specifying the regions of the GFF file
- which should be extracted. This allows only relevant portions of a file
- to be parsed.
-
- base_dict - A base dictionary of SeqRecord objects which may be
- pre-populated with sequences and other features. The new features from
- the GFF file will be added to this dictionary.
- """
- for rec in self.parse_in_parts(
- gff_files, base_dict, limit_info
- ):
- yield rec
-
- def parse_in_parts(
- self,
- gff_files,
- base_dict=None,
- limit_info=None,
- target_lines=None,
- ):
- """Parse a region of a GFF file specified, returning info as generated.
-
- target_lines -- The number of lines in the file which should be used
- for each partial parse. This should be determined based on available
- memory.
- """
- for results in self.parse_simple(
- gff_files, limit_info, target_lines
- ):
- if base_dict is None:
- cur_dict = dict()
- else:
- cur_dict = copy.deepcopy(base_dict)
- cur_dict = self._results_to_features(cur_dict, results)
- all_ids = list(cur_dict.keys())
- all_ids.sort()
- for cur_id in all_ids:
- yield cur_dict[cur_id]
-
- def parse_simple(
- self, gff_files, limit_info=None, target_lines=1
- ):
- """Simple parse which does not build or nest features.
-
- This returns a simple dictionary representation of each line in the
- GFF file.
- """
- # gracefully handle a single file passed
- if not isinstance(gff_files, (list, tuple)):
- gff_files = [gff_files]
- limit_info = self._normalize_limit_info(limit_info)
- for results in self._gff_process(
- gff_files, limit_info, target_lines
- ):
- yield results
-
- def _normalize_limit_info(self, limit_info):
- """Turn all limit information into tuples for identical comparisons."""
- final_limit_info = {}
- if limit_info:
- for key, values in limit_info.items():
- final_limit_info[key] = []
- for v in values:
- if isinstance(v, str):
- final_limit_info[key].append((v,))
- else:
- final_limit_info[key].append(tuple(v))
- return final_limit_info
-
- def _results_to_features(self, base, results):
- """Add parsed dictionaries of results to Biopython SeqFeatures."""
- base = self._add_annotations(
- base, results.get("annotation", [])
- )
- for feature in results.get("feature", []):
- (_, base) = self._add_toplevel_feature(base, feature)
- base = self._add_parent_child_features(
- base, results.get("parent", []), results.get("child", [])
- )
- base = self._add_seqs(base, results.get("fasta", []))
- base = self._add_directives(
- base, results.get("directive", [])
- )
- return base
-
- def _add_directives(self, base, directives):
- """Handle any directives or meta-data in the GFF file.
-
- Relevant items are added as annotation meta-data to each record.
- """
- dir_keyvals = collections.defaultdict(list)
- for directive in directives:
- parts = directive.split()
- if len(parts) > 1:
- key = parts[0]
- if len(parts) == 2:
- val = parts[1]
- else:
- val = tuple(parts[1:])
- # specific directives that need special handling
- if (
- key == "sequence-region"
- ): # convert to Python 0-based coordinates
- if len(val) == 2: # handle regions missing contig
- val = (int(val[0]) - 1, int(val[1]))
- elif len(val) == 3:
- val = (val[0], int(val[1]) - 1, int(val[2]))
- dir_keyvals[key].append(val)
- for key, vals in dir_keyvals.items():
- for rec in base.values():
- self._add_ann_to_rec(rec, key, vals)
- return base
-
- def _get_matching_record_id(self, base, find_id):
- """Find a matching base record with the test identifier, handling tricky cases.
-
- NCBI IDs https://en.wikipedia.org/wiki/FASTA_format#NCBI_identifiers
- """
- # Straight matches for identifiers
- if find_id in base:
- return find_id
- # NCBI style IDs in find_id
- elif find_id and find_id.find("|") > 0:
- for test_id in [
- x.strip() for x in find_id.split("|")[1:]
- ]:
- if test_id and test_id in base:
- return test_id
- # NCBI style IDs in base IDs
- else:
- for base_id in base.keys():
- if base_id.find("|") > 0:
- for test_id in [
- x.strip() for x in base_id.split("|")[1:]
- ]:
- if test_id and test_id == find_id:
- return base_id
- return None
-
- def _add_seqs(self, base, recs):
- """Add sequence information contained in the GFF3 to records."""
- for rec in recs:
- match_id = self._get_matching_record_id(base, rec.id)
- if match_id:
- base[match_id].seq = rec.seq
- else:
- base[rec.id] = rec
- return base
-
- def _add_parent_child_features(self, base, parents, children):
- """Add nested features with parent child relationships."""
- multi_remap = self._identify_dup_ids(parents)
- # add children features
- children_prep = collections.defaultdict(list)
- for child_dict in children:
- child_feature = self._get_feature(child_dict)
- for pindex, pid in enumerate(
- child_feature.qualifiers["Parent"]
- ):
- if pid in multi_remap:
- pid = multi_remap[pid].remap_id(child_dict)
- child_feature.qualifiers["Parent"][pindex] = pid
- children_prep[pid].append(
- (child_dict["rec_id"], child_feature)
- )
- children = dict(children_prep)
- # add children to parents that exist
- for cur_parent_dict in parents:
- cur_id = cur_parent_dict["id"]
- if cur_id in multi_remap:
- cur_parent_dict["id"] = multi_remap[cur_id].remap_id(
- cur_parent_dict
- )
- cur_parent, base = self._add_toplevel_feature(
- base, cur_parent_dict
- )
- cur_parent, children = self._add_children_to_parent(
- cur_parent, children
- )
- # create parents for children without them (GFF2 or split/bad files)
- while len(children) > 0:
- parent_id, cur_children = next(
- itertools.islice(children.items(), 1)
- )
- # one child, do not nest it
- if len(cur_children) == 1:
- rec_id, child = cur_children[0]
- loc = (child.location.start, child.location.end)
- rec, base = self._get_rec(
- base, dict(rec_id=rec_id, location=loc)
- )
- rec.features.append(child)
- del children[parent_id]
- else:
- cur_parent, base = self._add_missing_parent(
- base, parent_id, cur_children
- )
- cur_parent, children = self._add_children_to_parent(
- cur_parent, children
- )
- return base
-
- def _identify_dup_ids(self, parents):
- """Identify duplicated ID attributes in potential nested parents.
-
- According to the GFF3 spec ID attributes are supposed to be unique
- for a file, but this is not always true in practice. This looks
- for duplicates, and provides unique IDs sorted by locations.
- """
- multi_ids = collections.defaultdict(list)
- for parent in parents:
- multi_ids[parent["id"]].append(parent)
- multi_ids = [
- (mid, ps)
- for (mid, ps) in multi_ids.items()
- if len(parents) > 1
- ]
- multi_remap = dict()
- for mid, parents in multi_ids:
- multi_remap[mid] = _MultiIDRemapper(mid, parents)
- return multi_remap
-
- def _add_children_to_parent(self, cur_parent, children):
- """Recursively add children to parent features."""
- if cur_parent.id in children:
- cur_children = children[cur_parent.id]
- ready_children = []
- for _, cur_child in cur_children:
- cur_child, _ = self._add_children_to_parent(
- cur_child, children
- )
- ready_children.append(cur_child)
- # Support Biopython features for 1.62+
- # CompoundLocations and pre-1.62
- if not hasattr(SeqFeature, "CompoundLocation"):
- cur_parent.location_operator = "join"
- for cur_child in ready_children:
- cur_parent.sub_features.append(cur_child)
- del children[cur_parent.id]
- return cur_parent, children
-
- def _add_annotations(self, base, anns):
- """Add annotation data from the GFF file to records."""
- # add these as a list of annotations, checking not to overwrite
- # current values
- for ann in anns:
- rec, base = self._get_rec(base, ann)
- for key, vals in ann["quals"].items():
- self._add_ann_to_rec(rec, key, vals)
- return base
-
- def _add_ann_to_rec(self, rec, key, vals):
- """Add a key/value annotation to the given SeqRecord."""
- if key in rec.annotations:
- try:
- rec.annotations[key].extend(vals)
- except AttributeError:
- rec.annotations[key] = [rec.annotations[key]] + vals
- else:
- rec.annotations[key] = vals
-
- def _get_rec(self, base, info_dict):
- """Retrieve a record to add features to."""
- max_loc = info_dict.get("location", (0, 1))[1]
- match_id = self._get_matching_record_id(
- base, info_dict["rec_id"]
- )
- if match_id:
- cur_rec = base[match_id]
- # update generated unknown sequences
- if unknown_seq_avail and isinstance(
- cur_rec.seq, UnknownSeq
- ):
- cur_rec.seq._length = max(
- [max_loc, cur_rec.seq._length]
- )
- elif not unknown_seq_avail and isinstance(
- cur_rec.seq._data, _UndefinedSequenceData
- ):
- cur_rec.seq._data._length = max(
- [max_loc, cur_rec.seq._data._length]
- )
- return cur_rec, base
- elif self._create_missing:
- if unknown_seq_avail:
- new_rec = SeqRecord(
- UnknownSeq(max_loc), info_dict["rec_id"]
- )
- else:
- new_rec = SeqRecord(
- Seq(None, length=max_loc), info_dict["rec_id"]
- )
- base[info_dict["rec_id"]] = new_rec
- return new_rec, base
- else:
- raise KeyError(
- "Did not find matching record in %s for %s"
- % (base.keys(), info_dict)
- )
-
- def _add_missing_parent(self, base, parent_id, cur_children):
- """Add a new feature that is missing from the GFF file."""
- base_rec_id = list(set(c[0] for c in cur_children))
- child_strands = list(
- set(c[1].location.strand for c in cur_children)
- )
- inferred_strand = (
- child_strands[0] if len(child_strands) == 1 else None
- )
- assert len(base_rec_id) > 0
- feature_dict = dict(
- id=parent_id,
- strand=inferred_strand,
- type="inferred_parent",
- quals=dict(ID=[parent_id]),
- rec_id=base_rec_id[0],
- )
- coords = [
- (c.location.start, c.location.end)
- for r, c in cur_children
- ]
- feature_dict["location"] = (
- min([c[0] for c in coords]),
- max([c[1] for c in coords]),
- )
- return self._add_toplevel_feature(base, feature_dict)
-
- def _add_toplevel_feature(self, base, feature_dict):
- """Add a toplevel non-nested feature to the appropriate record."""
- new_feature = self._get_feature(feature_dict)
- rec, base = self._get_rec(base, feature_dict)
- rec.features.append(new_feature)
- return new_feature, base
-
- def _get_feature(self, feature_dict):
- """Retrieve a Biopython feature from our dictionary representation."""
- # location = SeqFeature.FeatureLocation(*feature_dict['location'])
- rstart, rend = feature_dict["location"]
- new_feature = SeqFeature.SeqFeature(
- SeqFeature.SimpleLocation(
- start=rstart, end=rend, strand=feature_dict["strand"]
- ),
- feature_dict["type"],
- id=feature_dict["id"],
- )
- # Support for Biopython 1.68 and above, which removed sub_features
- if not hasattr(new_feature, "sub_features"):
- new_feature.sub_features = []
- new_feature.qualifiers = feature_dict["quals"]
- return new_feature
-
- def _parse_fasta(self, in_handle):
- """Parse FASTA sequence information contained in the GFF3 file."""
- return list(SeqIO.parse(in_handle, "fasta"))
-
-
-class _GFFParserLocalOut:
- """Provide a collector for local GFF MapReduce file parsing."""
-
- def __init__(self, smart_breaks=False):
- self._items = dict()
- self._smart_breaks = smart_breaks
- self._missing_keys = collections.defaultdict(int)
- self._last_parent = None
- self.can_break = True
- self.num_lines = 0
-
- def add(self, key, vals):
- if self._smart_breaks:
- # if we are not GFF2 we expect parents and break
- # based on not having missing ones
- if key == "directive":
- if vals[0] == "#":
- self.can_break = True
- self._last_parent = None
- elif not vals[0].get("is_gff2", False):
- self._update_missing_parents(key, vals)
- self.can_break = len(self._missing_keys) == 0
- # break when we are done with stretches of child features
- elif key != "child":
- self.can_break = True
- self._last_parent = None
- # break when we have lots of child features in a row
- # and change between parents
- else:
- cur_parent = vals[0]["quals"]["Parent"][0]
- if self._last_parent:
- self.can_break = cur_parent != self._last_parent
- self._last_parent = cur_parent
- self.num_lines += 1
- try:
- self._items[key].extend(vals)
- except KeyError:
- self._items[key] = vals
-
- def _update_missing_parents(self, key, vals):
- # smart way of deciding if we can break this.
- # if this is too much, can go back to not breaking in the
- # middle of children
- if key in ["child"]:
- for val in vals:
- for p_id in val["quals"]["Parent"]:
- self._missing_keys[p_id] += 1
- for val in vals:
- try:
- del self._missing_keys[val["quals"]["ID"][0]]
- except KeyError:
- pass
-
- def has_items(self):
- return len(self._items) > 0
-
- def get_results(self):
- self._last_parent = None
- return self._items
-
-
-class GFFParser(_AbstractMapReduceGFF):
- """Local GFF parser providing standardized parsing of GFF files."""
-
- def __init__(self, line_adjust_fn=None, create_missing=True):
- _AbstractMapReduceGFF.__init__(
- self, create_missing=create_missing
- )
- self._line_adjust_fn = line_adjust_fn
-
- def _gff_process(self, gff_files, limit_info, target_lines):
- """Process GFF addition without any parallelization.
-
- In addition to limit filtering, this accepts a target_lines attribute
- which provides a number of lines to parse before returning results.
- This allows partial parsing of a file to prevent memory issues.
- """
- line_gen = self._file_line_generator(gff_files)
- for out in self._lines_to_out_info(
- line_gen, limit_info, target_lines
- ):
- yield out
-
- def _file_line_generator(self, gff_files):
- """Generate single lines from a set of GFF files."""
- for gff_file in gff_files:
- if hasattr(gff_file, "read"):
- need_close = False
- in_handle = gff_file
- else:
- need_close = True
- in_handle = open(gff_file)
- while 1:
- line = in_handle.readline()
- if not line:
- break
- yield line
- if need_close:
- in_handle.close()
-
- def _lines_to_out_info(self, line_iter, limit_info=None, target_lines=None):
- """Generate SeqRecord and SeqFeatures from GFF file lines."""
- params = self._examiner._get_local_params(limit_info)
- out_info = _GFFParserLocalOut(
- (target_lines is not None and target_lines > 1)
- )
- found_seqs = False
- for line in line_iter:
- results = self._map_fn(line, params)
- if self._line_adjust_fn and results:
- if results[0][0] not in ["directive"]:
- results = [
- (
- results[0][0],
- self._line_adjust_fn(results[0][1]),
- )
- ]
- self._reduce_fn(results, out_info, params)
- if (
- target_lines
- and out_info.num_lines >= target_lines
- and out_info.can_break
- ):
- yield out_info.get_results()
- out_info = _GFFParserLocalOut(
- (target_lines is not None and target_lines > 1)
- )
- if (
- results
- and results[0][0] == "directive"
- and results[0][1] == "FASTA"
- ):
- found_seqs = True
- break
-
- class FakeHandle:
- def __init__(self, line_iter):
- self._iter = line_iter
-
- def __iter__(self):
- return self
-
- def __next__(self):
- return next(self._iter)
-
- next = __next__
-
- def read(self, size=-1):
- if size < 0:
- return "".join(x for x in self._iter)
- elif size == 0:
- return "" # Used by Biopython to sniff unicode vs bytes
- else:
- raise NotImplementedError
-
- def readline(self):
- try:
- return next(self._iter)
- except StopIteration:
- return ""
-
- if found_seqs:
- fasta_recs = self._parse_fasta(FakeHandle(line_iter))
- out_info.add("fasta", fasta_recs)
- if out_info.has_items():
- yield out_info.get_results()
-
-
-class DiscoGFFParser(_AbstractMapReduceGFF):
- """GFF Parser with parallelization (http://discoproject.org."""
-
- def __init__(self, disco_host, create_missing=True):
- """Initialize parser.
-
- disco_host - Web reference to a Disco host which will be used for
- parallelizing the GFF reading job.
- """
- _AbstractMapReduceGFF.__init__(
- self, create_missing=create_missing
- )
- self._disco_host = disco_host
-
- def _gff_process(self, gff_files, limit_info, target_lines=None):
- """Process GFF addition, using Disco to parallelize the process."""
- assert target_lines is None, "Cannot split parallelized jobs"
- # make these imports local; only need them when using disco
- # absolute path names unless they are special disco files
- full_files = []
- for f in gff_files:
- if f.split(":")[0] != "disco":
- full_files.append(os.path.abspath(f))
- else:
- full_files.append(f)
- results = disco.job(
- self._disco_host,
- name="gff_reader",
- input=full_files,
- params=disco.Params(
- limit_info=limit_info,
- jsonify=True,
- filter_info=self._examiner._filter_info,
- ),
- required_modules=["json", "collections", "re"],
- map=self._map_fn,
- reduce=self._reduce_fn,
- )
- processed = dict()
- for out_key, out_val in disco.result_iterator(results):
- processed[out_key] = json.loads(out_val)
- yield processed
-
-
-def parse(
- gff_files, base_dict=None, limit_info=None, target_lines=None
-):
- """parse GFF files into SeqRecords and SeqFeatures."""
- parser = GFFParser()
- for rec in parser.parse_in_parts(
- gff_files, base_dict, limit_info, target_lines
- ):
- yield rec
-
-
-def parse_simple(gff_files, limit_info=None):
- """Parse GFF files as line by line dictionary of parts."""
- parser = GFFParser()
- for rec in parser.parse_simple(gff_files, limit_info=limit_info):
- if "child" in rec:
- assert "parent" not in rec
- yield rec["child"][0]
- elif "parent" in rec:
- yield rec["parent"][0]
- elif "feature" in rec:
- yield rec["feature"][0]
- # ignore directive lines
- else:
- assert "directive" in rec
-
-
-def _file_or_handle(fn):
- """Decorator to handle either an input handle or a file."""
-
- def _file_or_handle_inside(*args, **kwargs):
- in_file = args[1]
- if hasattr(in_file, "read"):
- need_close = False
- in_handle = in_file
- if six.PY3 and not isinstance(in_handle, io.TextIOBase):
- raise TypeError(
- "input handle must be opened in text mode"
- )
- else:
- need_close = True
- in_handle = open(in_file)
- args = (args[0], in_handle) + args[2:]
- out = fn(*args, **kwargs)
- if need_close:
- in_handle.close()
- return out
-
- return _file_or_handle_inside
-
-
-class GFFExaminer:
- """Provide high level details about a GFF file to refine parsing.
-
- GFF is a spec and is provided by many different centers. Real life files
- will present the same information in slightly different ways. Becoming
- familiar with the file you are dealing with is the best way to extract the
- information you need. This class provides high level summary details to
- help in learning.
- """
-
- def __init__(self):
- self._filter_info = dict(
- gff_id=[0],
- gff_source_type=[1, 2],
- gff_source=[1],
- gff_type=[2],
- )
-
- def _get_local_params(self, limit_info=None):
- class _LocalParams:
- def __init__(self):
- self.jsonify = False
-
- params = _LocalParams()
- params.limit_info = limit_info
- params.filter_info = self._filter_info
- return params
-
- @_file_or_handle
- def available_limits(self, gff_handle):
- """Return dictionary information on possible limits for this file.
-
- This returns a nested dictionary with the following structure:
-
- keys -- names of items to filter by
- values -- dictionary with:
- keys -- filter choice
- value -- counts of that filter in this file
-
- Not a parallelized map-reduce implementation.
- """
- cur_limits = dict()
- for filter_key in self._filter_info.keys():
- cur_limits[filter_key] = collections.defaultdict(int)
- for line in gff_handle:
- # when we hit FASTA sequences, we are done with annotations
- if line.startswith("##FASTA"):
- break
- # ignore empty and comment lines
- if line.strip() and line.strip()[0] != "#":
- parts = [p.strip() for p in line.split("\t")]
- assert len(parts) >= 8, line
- parts = parts[:9]
- for (
- filter_key,
- cur_indexes,
- ) in self._filter_info.items():
- cur_id = tuple([parts[i] for i in cur_indexes])
- cur_limits[filter_key][cur_id] += 1
- # get rid of the default dicts
- final_dict = dict()
- for key, value_dict in cur_limits.items():
- if len(key) == 1:
- key = key[0]
- final_dict[key] = dict(value_dict)
- gff_handle.close()
- return final_dict
-
- @_file_or_handle
- def parent_child_map(self, gff_handle):
- """Provide a mapping of parent to child relationships in the file.
-
- Returns a dictionary of parent child relationships:
-
- keys -- tuple of (source, type) for each parent
- values -- tuple of (source, type) as children of that parent
-
- Not a parallelized map-reduce implementation.
- """
- # collect all of the parent and child types mapped to IDs
- parent_sts = dict()
- child_sts = collections.defaultdict(list)
- for line in gff_handle:
- # when we hit FASTA sequences, we are done with annotations
- if line.startswith("##FASTA"):
- break
- if line.strip() and not line.startswith("#"):
- line_type, line_info = _gff_line_map(
- line, self._get_local_params()
- )[0]
- if line_type == "parent" or (
- line_type == "child" and line_info["id"]
- ):
- parent_sts[line_info["id"]] = (
- line_info["quals"].get("source", [""])[0],
- line_info["type"],
- )
- if line_type == "child":
- for parent_id in line_info["quals"]["Parent"]:
- child_sts[parent_id].append(
- (
- line_info["quals"].get(
- "source", [""]
- )[0],
- line_info["type"],
- )
- )
- # print parent_sts, child_sts
- # generate a dictionary of the unique final type relationships
- pc_map = collections.defaultdict(list)
- for parent_id, parent_type in parent_sts.items():
- for child_type in child_sts[parent_id]:
- pc_map[parent_type].append(child_type)
- pc_final_map = dict()
- for ptype, ctypes in pc_map.items():
- unique_ctypes = list(set(ctypes))
- unique_ctypes.sort()
- pc_final_map[ptype] = unique_ctypes
- return pc_final_map
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__init__.py
--- a/jb2_GFF/__init__.py Mon Jan 29 02:34:43 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,6 +0,0 @@
-"""Top level of GFF parsing providing shortcuts for useful classes.
-"""
-from jb2_GFF.GFFParser import GFFParser, DiscoGFFParser, GFFExaminer, parse, parse_simple
-from jb2_GFF.GFFOutput import GFF3Writer, write
-
-__version__ = "0.6.9"
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc
Binary file jb2_GFF/__pycache__/GFFOutput.cpython-310.pyc has changed
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/GFFParser.cpython-310.pyc
Binary file jb2_GFF/__pycache__/GFFParser.cpython-310.pyc has changed
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/__pycache__/__init__.cpython-310.pyc
Binary file jb2_GFF/__pycache__/__init__.cpython-310.pyc has changed
diff -r 2e6c48910819 -r bde6b1d09f7d jb2_GFF/_utils.py
--- a/jb2_GFF/_utils.py Mon Jan 29 02:34:43 2024 +0000
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,49 +0,0 @@
-class defaultdict(dict):
- """Back compatible defaultdict:
- http://code.activestate.com/recipes/523034/"""
-
- def __init__(self, default_factory=None, *a, **kw):
- if default_factory is not None and not hasattr(
- default_factory, "__call__"
- ):
- raise TypeError("first argument must be callable")
- dict.__init__(self, *a, **kw)
- self.default_factory = default_factory
-
- def __getitem__(self, key):
- try:
- return dict.__getitem__(self, key)
- except KeyError:
- return self.__missing__(key)
-
- def __missing__(self, key):
- if self.default_factory is None:
- raise KeyError(key)
- self[key] = value = self.default_factory()
- return value
-
- def __reduce__(self):
- if self.default_factory is None:
- args = tuple()
- else:
- args = (self.default_factory,)
- return type(self), args, None, None, self.items()
-
- def copy(self):
- return self.__copy__()
-
- def __copy__(self):
- return type(self)(self.default_factory, self)
-
- def __deepcopy__(self, memo):
- import copy
-
- return type(self)(
- self.default_factory, copy.deepcopy(self.items())
- )
-
- def __repr__(self):
- return "defaultdict(%s, %s)" % (
- self.default_factory,
- dict.__repr__(self),
- )
diff -r 2e6c48910819 -r bde6b1d09f7d jbrowse2.py
--- a/jbrowse2.py Mon Jan 29 02:34:43 2024 +0000
+++ b/jbrowse2.py Tue Jan 30 06:05:03 2024 +0000
@@ -3,7 +3,6 @@
import argparse
import binascii
import datetime
-import hashlib
import json
import logging
import os
@@ -23,7 +22,7 @@
TODAY = datetime.datetime.now().strftime("%Y-%m-%d")
GALAXY_INFRASTRUCTURE_URL = None
-JB2REL = "v2.10.0"
+JB2REL = "v2.10.1"
# version pinned for cloning
mapped_chars = {
@@ -232,7 +231,9 @@
elif "scaling" in track:
if track["scaling"]["method"] == "ignore":
if track["scaling"]["scheme"]["color"] != "__auto__":
- trackConfig["style"]["color"] = track["scaling"]["scheme"]["color"]
+ trackConfig["style"]["color"] = track["scaling"]["scheme"][
+ "color"
+ ]
else:
trackConfig["style"]["color"] = self.hex_from_rgb(
*self._get_colours()
@@ -259,13 +260,18 @@
"blue": blue,
}
)
- trackConfig["style"]["color"] = color_function.replace("\n", "")
+ trackConfig["style"]["color"] = color_function.replace(
+ "\n", ""
+ )
elif trackFormat == "gene_calls":
# Default values, based on GFF3 spec
min_val = 0
max_val = 1000
# Get min/max and build a scoring function since JBrowse doesn't
- if scales["type"] == "automatic" or scales["type"] == "__auto__":
+ if (
+ scales["type"] == "automatic"
+ or scales["type"] == "__auto__"
+ ):
min_val, max_val = self.min_max_gff(gff3)
else:
min_val = scales.get("min", 0)
@@ -273,7 +279,9 @@
if scheme["color"] == "__auto__":
user_color = "undefined"
- auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())
+ auto_color = "'%s'" % self.hex_from_rgb(
+ *self._get_colours()
+ )
elif scheme["color"].startswith("#"):
user_color = "'%s'" % self.hex_from_rgb(
*self.rgb_from_hex(scheme["color"][1:])
@@ -281,7 +289,9 @@
auto_color = "undefined"
else:
user_color = "undefined"
- auto_color = "'%s'" % self.hex_from_rgb(*self._get_colours())
+ auto_color = "'%s'" % self.hex_from_rgb(
+ *self._get_colours()
+ )
color_function = self.COLOR_FUNCTION_TEMPLATE_QUAL.format(
**{
@@ -293,7 +303,9 @@
}
)
- trackConfig["style"]["color"] = color_function.replace("\n", "")
+ trackConfig["style"]["color"] = color_function.replace(
+ "\n", ""
+ )
return trackConfig
@@ -336,40 +348,41 @@
for (key, value) in node.findall("dataset")[0].attrib.items():
metadata["dataset_%s" % key] = value
- for (key, value) in node.findall("history")[0].attrib.items():
- metadata["history_%s" % key] = value
-
- for (key, value) in node.findall("metadata")[0].attrib.items():
- metadata["metadata_%s" % key] = value
-
- for (key, value) in node.findall("tool")[0].attrib.items():
- metadata["tool_%s" % key] = value
+ if node.findall("history"):
+ for (key, value) in node.findall("history")[0].attrib.items():
+ metadata["history_%s" % key] = value
- # Additional Mappings applied:
- metadata[
- "dataset_edam_format"
- ] = '{1}'.format(
- metadata["dataset_edam_format"], metadata["dataset_file_ext"]
- )
- metadata["history_user_email"] = '{0}'.format(
- metadata["history_user_email"]
- )
- metadata["hist_name"] = metadata["history_display_name"]
- metadata[
- "history_display_name"
- ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
- galaxy=GALAXY_INFRASTRUCTURE_URL,
- encoded_hist_id=metadata["history_id"],
- hist_name=metadata["history_display_name"],
- )
- metadata[
- "tool_tool"
- ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id}</a>'.format(
- galaxy=GALAXY_INFRASTRUCTURE_URL,
- encoded_id=metadata["dataset_id"],
- tool_id=metadata["tool_tool_id"],
- # tool_version=metadata['tool_tool_version'],
- )
+ if node.findall("metadata"):
+ for (key, value) in node.findall("metadata")[0].attrib.items():
+ metadata["metadata_%s" % key] = value
+ # Additional Mappings applied:
+ metadata[
+ "dataset_edam_format"
+ ] = '<a target="_blank" href="http://edamontology.org/{0}">{1}</a>'.format(
+ metadata["dataset_edam_format"], metadata["dataset_file_ext"]
+ )
+ metadata["history_user_email"] = '{0}'.format(
+ metadata["history_user_email"]
+ )
+ metadata["hist_name"] = metadata["history_display_name"]
+ metadata[
+ "history_display_name"
+ ] = '<a target="_blank" href="{galaxy}/history/view/{encoded_hist_id}">{hist_name}</a>'.format(
+ galaxy=GALAXY_INFRASTRUCTURE_URL,
+ encoded_hist_id=metadata["history_id"],
+ hist_name=metadata["history_display_name"],
+ )
+ if node.findall("tool"):
+ for (key, value) in node.findall("tool")[0].attrib.items():
+ metadata["tool_%s" % key] = value
+ metadata[
+ "tool_tool"
+ ] = '<a target="_blank" href="{galaxy}/datasets/{encoded_id}/show_params">{tool_id} {tool_version}</a>'.format(
+ galaxy=GALAXY_INFRASTRUCTURE_URL,
+ encoded_id=metadata.get("dataset_id", ""),
+ tool_id=metadata.get("tool_tool_id", ""),
+ tool_version=metadata.get("tool_tool_version", ""),
+ )
return metadata
@@ -389,7 +402,9 @@
def subprocess_check_call(self, command, output=None):
if output:
- log.debug("cd %s && %s > %s", self.outdir, " ".join(command), output)
+ log.debug(
+ "cd %s && %s > %s", self.outdir, " ".join(command), output
+ )
subprocess.check_call(command, cwd=self.outdir, stdout=output)
else:
log.debug("cd %s && %s", self.outdir, " ".join(command))
@@ -468,13 +483,8 @@
self.config_json["assemblies"] = assemblies
def make_assembly(self, fapath, gname):
- hashData = [
- fapath,
- gname,
- ]
- hashData = "|".join(hashData).encode("utf-8")
- ghash = hashlib.md5(hashData).hexdigest()
- faname = ghash + ".fa.gz"
+
+ faname = gname + ".fa.gz"
fadest = os.path.join(self.outdir, faname)
cmd = "bgzip -i -c %s -I %s.gzi > %s && samtools faidx %s" % (
fapath,
@@ -495,6 +505,7 @@
"uri": faname + ".gzi",
},
}
+ self.genome_sequence_adapter = adapter
trackDict = {
"name": gname,
"sequence": {
@@ -604,7 +615,7 @@
"plugins": [
{
"name": "MafViewer",
- "url": "https://unpkg.com/browse/jbrowse-plugin-mafviewer@1.0.6/dist/jbrowse-plugin-mafviewer.umd.production.min.js",
+ "url": "https://unpkg.com/jbrowse-plugin-mafviewer/dist/jbrowse-plugin-mafviewer.umd.production.min.js"
}
]
}
@@ -623,7 +634,9 @@
self.subprocess_check_call(cmd)
# Construct samples list
# We could get this from galaxy metadata, not sure how easily.
- ps = subprocess.Popen(["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE)
+ ps = subprocess.Popen(
+ ["grep", "^s [^ ]*", "-o", data], stdout=subprocess.PIPE
+ )
output = subprocess.check_output(("sort", "-u"), stdin=ps.stdout)
ps.wait()
outp = output.decode("ascii")
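
For readers following the sample-list construction above: a minimal pure-Python sketch of the same extraction, equivalent to grep -o '^s [^ ]*' piped through sort -u. The function name and the example output are illustrative only, not part of this changeset.

    import re

    def maf_sample_prefixes(maf_path):
        # Collect the unique "s <source>" prefixes from a MAF file,
        # mirroring: grep -o '^s [^ ]*' <maf> | sort -u
        seen = set()
        pattern = re.compile(r"^s [^ ]+")
        with open(maf_path) as handle:
            for line in handle:
                match = pattern.match(line)
                if match:
                    seen.add(match.group(0))
        return sorted(seen)

    # e.g. maf_sample_prefixes("alignments.maf") might return
    # ["s hg38.chr1", "s mm39.chr4"], matching the decoded outp above.
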
@@ -783,7 +796,9 @@
url = fname
self.subprocess_check_call(["cp", data, dest])
bloc = {"uri": url}
- if bam_index is not None and os.path.exists(os.path.realpath(bam_index)):
+ if bam_index is not None and os.path.exists(
+ os.path.realpath(bam_index)
+ ):
# bai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
self.subprocess_check_call(
["cp", os.path.realpath(bam_index), dest + ".bai"]
@@ -794,7 +809,9 @@
# => no index generated by galaxy, but there might be one next to the symlink target
# this trick allows to skip the bam sorting made by galaxy if already done outside
if os.path.exists(os.path.realpath(data) + ".bai"):
- self.symlink_or_copy(os.path.realpath(data) + ".bai", dest + ".bai")
+ self.symlink_or_copy(
+ os.path.realpath(data) + ".bai", dest + ".bai"
+ )
else:
log.warn("Could not find a bam index (.bai file) for %s", data)
trackDict = {
@@ -823,12 +840,62 @@
self.tracksToAdd.append(trackDict)
self.trackIdlist.append(tId)
+ def add_cram(self, data, trackData, cramOpts, cram_index=None, **kwargs):
+ tId = trackData["label"]
+ fname = "%s.cram" % trackData["label"]
+ dest = "%s/%s" % (self.outdir, fname)
+ url = fname
+ self.subprocess_check_call(["cp", data, dest])
+ bloc = {"uri": url}
+ if cram_index is not None and os.path.exists(
+ os.path.realpath(cram_index)
+ ):
+ # crai most probably made by galaxy and stored in galaxy dirs, need to copy it to dest
+ self.subprocess_check_call(
+ ["cp", os.path.realpath(cram_index), dest + ".crai"]
+ )
+ else:
+ # Can happen in exotic conditions
+ # e.g. if the cram was imported as a symlink with datatype=unsorted.cram, then the datatype changed to cram
+ # => no index generated by galaxy, but there might be one next to the symlink target
+ # this trick allows skipping the cram sorting/indexing done by galaxy if it was already done outside
+ if os.path.exists(os.path.realpath(data) + ".crai"):
+ self.symlink_or_copy(
+ os.path.realpath(data) + ".crai", dest + ".crai"
+ )
+ else:
+ log.warn(
+ "Could not find a cram index (.crai file) for %s", data
+ )
+ trackDict = {
+ "type": "AlignmentsTrack",
+ "trackId": tId,
+ "name": trackData["name"],
+ "assemblyNames": [self.genome_name],
+ "adapter": {
+ "type": "CramAdapter",
+ "cramLocation": bloc,
+ "craiLocation": {"uri": fname + ".crai",},
+ "sequenceAdapter": self.genome_sequence_adapter,
+ },
+ "displays": [
+ {
+ "type": "LinearAlignmentsDisplay",
+ "displayId": "%s-LinearAlignmentsDisplay" % tId,
+ },
+ ],
+ }
+ style_json = self._prepare_track_style(trackDict)
+ trackDict["style"] = style_json
+ self.tracksToAdd.append(trackDict)
+ self.trackIdlist.append(tId)
+
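
A hedged aside on the .crai fallback above: instead of only warning when no index can be found, the index could be created on the fly with samtools, which this tool already relies on for the bgzip/faidx steps. This is a sketch of that option, not something this changeset does; the helper name is hypothetical.

    import os
    import subprocess

    def ensure_crai(cram_dest):
        # Create cram_dest + ".crai" if it is missing.
        # samtools index on a CRAM file writes a .crai next to it.
        crai = cram_dest + ".crai"
        if not os.path.exists(crai):
            subprocess.check_call(["samtools", "index", cram_dest])
        return crai
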
def add_vcf(self, data, trackData):
tId = trackData["label"]
- url = "%s/api/datasets/%s/display" % (
- self.giURL,
- trackData["metadata"]["dataset_id"],
- )
+ # url = "%s/api/datasets/%s/display" % (
+ # self.giURL,
+ # trackData["metadata"]["dataset_id"],
+ # )
url = "%s.vcf.gz" % tId
dest = "%s/%s" % (self.outdir, url)
cmd = "bgzip -c %s > %s" % (data, dest)
@@ -879,7 +946,9 @@
dest,
) # "gff3sort.pl --precise '%s' | grep -v \"^$\" > '%s'"
self.subprocess_popen(cmd)
- self.subprocess_check_call(["tabix", "-f", "-p", "gff", dest + ".gz"])
+ self.subprocess_check_call(
+ ["tabix", "-f", "-p", "gff", dest + ".gz"]
+ )
def _sort_bed(self, data, dest):
# Only index if not already done
@@ -1085,28 +1154,12 @@
outputTrackConfig["key"] = track_human_label
- # We add extra data to hash for the case of REST + SPARQL.
- if (
- "conf" in track
- and "options" in track["conf"]
- and "url" in track["conf"]["options"]
- ):
- rest_url = track["conf"]["options"]["url"]
- else:
- rest_url = ""
outputTrackConfig["trackset"] = track.get("trackset", {})
- # I chose to use track['category'] instead of 'category' here. This
- # is intentional. This way re-running the tool on a different date
- # will not generate different hashes and make comparison of outputs
- # much simpler.
- hashData = [
- str(dataset_path),
+ outputTrackConfig["label"] = "%s_%i_%s" % (
+ dataset_ext,
+ i,
track_human_label,
- track["category"],
- rest_url,
- ]
- hashData = "|".join(hashData).encode("utf-8")
- outputTrackConfig["label"] = hashlib.md5(hashData).hexdigest() + "_%s" % i
+ )
outputTrackConfig["metadata"] = extra_metadata
outputTrackConfig["name"] = track_human_label
@@ -1138,17 +1191,10 @@
outputTrackConfig,
)
elif dataset_ext == "bam":
- real_indexes = track["conf"]["options"]["pileup"]["bam_indices"][
- "bam_index"
- ]
+ real_indexes = track["conf"]["options"]["pileup"][
+ "bam_indices"
+ ]["bam_index"]
if not isinstance(real_indexes, list):
- #
- # /path/to/a.bam.bai
- #
- #
- # The above will result in the 'bam_index' key containing a
- # string. If there are two or more indices, the container
- # becomes a list. Fun!
real_indexes = [real_indexes]
self.add_bam(
@@ -1157,6 +1203,19 @@
track["conf"]["options"]["pileup"],
bam_index=real_indexes[i],
)
+ elif dataset_ext == "cram":
+ real_indexes = track["conf"]["options"]["cram"][
+ "cram_indices"
+ ]["cram_index"]
+ if not isinstance(real_indexes, list):
+ real_indexes = [real_indexes]
+
+ self.add_cram(
+ dataset_path,
+ outputTrackConfig,
+ track["conf"]["options"]["cram"],
+ cram_index=real_indexes[i],
+ )
elif dataset_ext == "blastxml":
self.add_blastxml(
dataset_path,
@@ -1290,14 +1349,18 @@
config_json.update(self.config_json)
config_data = {}
- config_data["disableAnalytics"] = data.get("analytics", "false") == "true"
+ config_data["disableAnalytics"] = (
+ data.get("analytics", "false") == "true"
+ )
config_data["theme"] = {
"palette": {
"primary": {"main": data.get("primary_color", "#0D233F")},
"secondary": {"main": data.get("secondary_color", "#721E63")},
"tertiary": {"main": data.get("tertiary_color", "#135560")},
- "quaternary": {"main": data.get("quaternary_color", "#FFB11D")},
+ "quaternary": {
+ "main": data.get("quaternary_color", "#FFB11D")
+ },
},
"typography": {"fontSize": int(data.get("font_size", 10))},
}
@@ -1351,9 +1414,10 @@
parser = argparse.ArgumentParser(description="", epilog="")
parser.add_argument("--xml", help="Track Configuration")
parser.add_argument("--outdir", help="Output directory", default="out")
- parser.add_argument("--version", "-V", action="version", version="%(prog)s 2.0.1")
+ parser.add_argument(
+ "--version", "-V", action="version", version="%(prog)s 2.0.1"
+ )
args = parser.parse_args()
-
tree = ET.parse(args.xml)
root = tree.getroot()
@@ -1448,7 +1512,8 @@
track_conf["format"] = track.attrib["format"]
if track.find("options/style"):
track_conf["style"] = {
- item.tag: parse_style_conf(item) for item in track.find("options/style")
+ item.tag: parse_style_conf(item)
+ for item in track.find("options/style")
}
if track.find("options/style_labels"):
track_conf["style_labels"] = {
@@ -1461,7 +1526,9 @@
track_conf["format"] = track.attrib["format"]
try:
# Only pertains to gff3 + blastxml. TODO?
- track_conf["style"] = {t.tag: t.text for t in track.find("options/style")}
+ track_conf["style"] = {
+ t.tag: t.text for t in track.find("options/style")
+ }
except TypeError:
track_conf["style"] = {}
pass
@@ -1492,7 +1559,9 @@
"primary_color": root.find("metadata/general/primary_color").text,
"secondary_color": root.find("metadata/general/secondary_color").text,
"tertiary_color": root.find("metadata/general/tertiary_color").text,
- "quaternary_color": root.find("metadata/general/quaternary_color").text,
+ "quaternary_color": root.find(
+ "metadata/general/quaternary_color"
+ ).text,
"font_size": root.find("metadata/general/font_size").text,
}
jc.add_general_configuration(general_data)
diff -r 2e6c48910819 -r bde6b1d09f7d jbrowse2.xml
--- a/jbrowse2.xml Mon Jan 29 02:34:43 2024 +0000
+++ b/jbrowse2.xml Tue Jan 30 06:05:03 2024 +0000
@@ -1,4 +1,4 @@
-
+ genome browser
macros.xml
@@ -31,17 +31,20 @@
#if str($reference_genome.genome_type_select) == "indexed":
-
-
+
+
+
+
#else
-
+
+ dname = "${reference_genome.genome.name}" />
${track.data_format.synteny_genome}
- ${track.data_format.synteny_genome.element_identifier}
+ ${track.data_format.synteny_genome.name}
#else if str($track.data_format.data_format_select) == "hic":
@@ -227,15 +230,15 @@
-
+
+
-
+
@@ -291,24 +294,24 @@
+
+
+
+
-
-
-
-
@@ -371,7 +374,7 @@
-
+
diff -r 2e6c48910819 -r bde6b1d09f7d macros.xml
--- a/macros.xml Mon Jan 29 02:34:43 2024 +0000
+++ b/macros.xml Tue Jan 30 06:05:03 2024 +0000
@@ -1,6 +1,6 @@
- 2.10.0
+ 2.10.1
topic_3307
@@ -14,8 +14,8 @@
jbrowse2
- biopython
- bcbio-gff
+ biopython
+ bcbio-gff
samtools
pyyaml
tabix