changeset 0:04f12e56d4c7 draft

planemo upload for repository https://github.com/galaxy-genome-annotation/galaxy-tools/tree/master/tools/chado commit 81a83f06b49db32928ba0cd44e5b6d0431868d27
author gga
date Thu, 21 Jun 2018 08:45:47 -0400
parents
children 7af45fc803fd
files README.rst chado.py feature_load_gff.xml macros.xml
diffstat 4 files changed, 668 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README.rst	Thu Jun 21 08:45:47 2018 -0400
@@ -0,0 +1,43 @@
+Galaxy-chado
+=============
+
+Galaxy tools to interface with a Chado database using python-chado
+
+Dependencies
+------------
+
+You will need to install some python modules in the Galaxy virtualenv for these
+tools to be fully functional:
+
+.. code:: bash
+
+    . /path/to/galaxy/.venv/bin/activate
+    pip install future chado
+    deactivate
+
+Environment
+-----------
+
+The following environment variables must be set:
+
++--------------------------------+-----------------------------------------------------------+
+| ENV                            | Use                                                       |
++================================+===========================================================+
+| ``$GALAXY_CHADO_DBHOST``       | Host of the Chado database                                |
++--------------------------------+-----------------------------------------------------------+
+| ``$GALAXY_CHADO_DBNAME``       | Name of the Chado database                                |
++--------------------------------+-----------------------------------------------------------+
+| ``$GALAXY_CHADO_DBUSER``       | Username to connect to the database                       |
++--------------------------------+-----------------------------------------------------------+
+| ``$GALAXY_CHADO_DBPASS``       | Password to connect to the database                       |
++--------------------------------+-----------------------------------------------------------+
+| ``$GALAXY_CHADO_DBSCHEMA``     | Database schema.                                          |
++--------------------------------+-----------------------------------------------------------+
+| ``$GALAXY_CHADO_DBPORT``       | Port of the Chado database                                |
++--------------------------------+-----------------------------------------------------------+
+
+
+License
+-------
+
+All python scripts and wrappers are licensed under MIT license.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/chado.py	Thu Jun 21 08:45:47 2018 -0400
@@ -0,0 +1,470 @@
+import collections
+import os
+import time
+from abc import abstractmethod
+
+import chado
+
+
+#############################################
+#      BEGIN IMPORT OF CACHING LIBRARY      #
+#############################################
+# This code is licensed under the MIT       #
+# License and is a copy of code publicly    #
+# available in rev.                         #
+# e27332bc82f4e327aedaec17c9b656ae719322ed  #
+# of https://github.com/tkem/cachetools/    #
+#############################################
+try:
+    # ``collections.MutableMapping`` was removed in Python 3.10; on Python 3
+    # the abstract base classes live in ``collections.abc``.
+    from collections.abc import MutableMapping as _MutableMapping
+except ImportError:  # Python 2 has no ``collections.abc`` module
+    _MutableMapping = collections.MutableMapping
+
+
+class DefaultMapping(_MutableMapping):
+    """Abstract mutable mapping whose lookups may fall back to ``__missing__``.
+
+    ``get``, ``pop`` and ``setdefault`` are implemented on top of
+    ``key in self`` / ``self[key]`` so that subclasses only need to provide
+    the basic container protocol.
+    """
+
+    __slots__ = ()
+
+    @abstractmethod
+    def __contains__(self, key):  # pragma: nocover
+        """Return whether *key* is present (abstract; default is ``False``)."""
+        return False
+
+    @abstractmethod
+    def __getitem__(self, key):  # pragma: nocover
+        """Return the value for *key* (abstract).
+
+        The default body delegates to the class's ``__missing__`` when one is
+        defined and raises ``KeyError`` otherwise.
+        """
+        if hasattr(self.__class__, '__missing__'):
+            return self.__class__.__missing__(self, key)
+        else:
+            raise KeyError(key)
+
+    def get(self, key, default=None):
+        """Return ``self[key]`` if *key* is present, else *default*."""
+        if key in self:
+            return self[key]
+        else:
+            return default
+
+    # Private sentinel so that ``None`` remains a usable ``default`` for pop().
+    __marker = object()
+
+    def pop(self, key, default=__marker):
+        """Remove *key* and return its value.
+
+        Returns *default* when the key is absent and a default was given;
+        raises ``KeyError`` when it is absent and no default was given.
+        """
+        if key in self:
+            value = self[key]
+            del self[key]
+        elif default is self.__marker:
+            raise KeyError(key)
+        else:
+            value = default
+        return value
+
+    def setdefault(self, key, default=None):
+        """Return ``self[key]``, storing *default* first if *key* is absent."""
+        if key in self:
+            value = self[key]
+        else:
+            self[key] = value = default
+        return value
+
+
+# Make plain dicts pass isinstance/issubclass checks against DefaultMapping.
+DefaultMapping.register(dict)
+
+
+class _DefaultSize(object):
+    """Stand-in for a per-key size table in which every entry has size 1.
+
+    It supports only the operations ``Cache`` performs on its size table:
+    item lookup, item assignment and ``pop``. Nothing is stored.
+    """
+
+    _UNIT = 1
+
+    def __getitem__(self, _):
+        # Every key reports the same unit size.
+        return self._UNIT
+
+    def __setitem__(self, _, value):
+        # Nothing to record; only the unit size is acceptable here.
+        assert value == self._UNIT
+
+    def pop(self, _):
+        # Removing a key gives back the unit size it was accounted with.
+        return self._UNIT
+
+
+class Cache(DefaultMapping):
+    """Mutable mapping to serve as a simple cache or cache base class.
+
+    Values live in a plain dict. ``__setitem__`` calls ``popitem()`` until
+    the new value fits within ``maxsize``.
+    """
+
+    # Class-level default: every entry counts as size 1. __init__ shadows this
+    # with a real per-key dict when a ``getsizeof`` callable is supplied.
+    __size = _DefaultSize()
+
+    def __init__(self, maxsize, missing=None, getsizeof=None):
+        """Create an empty cache.
+
+        :param maxsize: upper bound for the summed size of the stored values.
+        :param missing: optional callable ``missing(key)`` used by
+            ``__missing__`` to produce a value for an absent key; the default
+            raises ``KeyError``.
+        :param getsizeof: optional callable ``getsizeof(value)`` returning the
+            size of a value; by default every value has size 1.
+        """
+        if missing:
+            # Instance attribute shadows the static ``__missing`` default.
+            self.__missing = missing
+        if getsizeof:
+            # Instance attribute shadows the static ``__getsizeof`` default,
+            # and sizes now have to be remembered per key.
+            self.__getsizeof = getsizeof
+            self.__size = dict()
+        self.__data = dict()
+        self.__currsize = 0
+        self.__maxsize = maxsize
+
+    def __repr__(self):
+        return '%s(%r, maxsize=%r, currsize=%r)' % (
+            self.__class__.__name__,
+            list(self.__data.items()),
+            self.__maxsize,
+            self.__currsize,
+        )
+
+    def __getitem__(self, key):
+        """Return the stored value, deferring to ``__missing__`` if absent."""
+        try:
+            return self.__data[key]
+        except KeyError:
+            return self.__missing__(key)
+
+    def __setitem__(self, key, value):
+        """Store *value* under *key*, evicting entries until it fits.
+
+        :raises ValueError: if the value alone is larger than ``maxsize``.
+        """
+        maxsize = self.__maxsize
+        size = self.getsizeof(value)
+        if size > maxsize:
+            raise ValueError('value too large')
+        # Only a new key, or an existing key whose value grows, can push the
+        # cache over its limit.
+        if key not in self.__data or self.__size[key] < size:
+            while self.__currsize + size > maxsize:
+                self.popitem()
+        # Re-check membership: popitem() above may have evicted this very key.
+        if key in self.__data:
+            diffsize = size - self.__size[key]
+        else:
+            diffsize = size
+        self.__data[key] = value
+        self.__size[key] = size
+        self.__currsize += diffsize
+
+    def __delitem__(self, key):
+        """Remove *key* and subtract its size from the current size."""
+        size = self.__size.pop(key)
+        del self.__data[key]
+        self.__currsize -= size
+
+    def __contains__(self, key):
+        return key in self.__data
+
+    def __missing__(self, key):
+        """Produce a value for an absent key via the ``missing`` callable.
+
+        The value is stored when it fits. A value that is too large is still
+        returned, just not cached. With the default ``missing`` this raises
+        ``KeyError``.
+        """
+        value = self.__missing(key)
+        try:
+            self.__setitem__(key, value)
+        except ValueError:
+            pass  # value too large
+        return value
+
+    def __iter__(self):
+        return iter(self.__data)
+
+    def __len__(self):
+        return len(self.__data)
+
+    @staticmethod
+    def __getsizeof(value):
+        # Default size function: every value has size 1.
+        return 1
+
+    @staticmethod
+    def __missing(key):
+        # Default ``missing`` behaviour: absent keys are an error.
+        raise KeyError(key)
+
+    @property
+    def maxsize(self):
+        """The maximum size of the cache."""
+        return self.__maxsize
+
+    @property
+    def currsize(self):
+        """The current size of the cache."""
+        return self.__currsize
+
+    def getsizeof(self, value):
+        """Return the size of a cache element's value."""
+        return self.__getsizeof(value)
+
+
+class _Link(object):
+    """Node of a doubly linked list carrying a cache key and its expiry."""
+
+    __slots__ = ('key', 'expire', 'next', 'prev')
+
+    def __init__(self, key=None, expire=None):
+        # ``next`` and ``prev`` are left unset; whoever inserts the node into
+        # a list is responsible for wiring them up.
+        self.key, self.expire = key, expire
+
+    def __reduce__(self):
+        # Only key and expiry are pickled, not the neighbouring links.
+        return (_Link, (self.key, self.expire))
+
+    def unlink(self):
+        """Detach this node by joining its two neighbours to each other."""
+        successor, predecessor = self.next, self.prev
+        predecessor.next = successor
+        successor.prev = predecessor
+
+
+class _Timer(object):
+    """Wrap a clock callable so its reading can be frozen in a ``with`` block.
+
+    Outside any ``with`` block, calling the instance calls the wrapped clock.
+    Inside one (nesting allowed), calls return the reading taken when the
+    outermost block was entered.
+    """
+
+    def __init__(self, timer):
+        self.__timer = timer
+        self.__nesting = 0
+
+    def __call__(self):
+        if self.__nesting:
+            # Inside a ``with`` block: hand back the frozen reading.
+            return self.__time
+        return self.__timer()
+
+    def __enter__(self):
+        if not self.__nesting:
+            # Outermost block: take the reading that nested users will share.
+            self.__time = self.__timer()
+        frozen = self.__time
+        self.__nesting = self.__nesting + 1
+        return frozen
+
+    def __exit__(self, *exc):
+        self.__nesting = self.__nesting - 1
+
+    def __reduce__(self):
+        # Pickle as a fresh wrapper around the same clock.
+        return (_Timer, (self.__timer,))
+
+    def __getattr__(self, name):
+        # Anything not found on the wrapper is looked up on the wrapped clock.
+        return getattr(self.__timer, name)
+
+
+class TTLCache(Cache):
+    """LRU Cache implementation with per-item time-to-live (TTL) value.
+
+    Two structures are kept next to the base ``Cache`` storage:
+
+    * a circular doubly linked list of ``_Link`` nodes rooted at a sentinel,
+      to which nodes are appended at the tail whenever an item is set, so
+      that the soonest-to-expire items sit at the head;
+    * an ``OrderedDict`` mapping each key to its node, reordered on access so
+      that its first key is the least recently used one.
+    """
+
+    def __init__(self, maxsize, ttl, timer=time.time, missing=None,
+                 getsizeof=None):
+        """Create an empty cache.
+
+        :param maxsize: passed through to ``Cache``.
+        :param ttl: lifetime added to the timer reading when an item is set.
+        :param timer: callable returning the current time (``time.time`` by
+            default); ``ttl`` is expressed in its units.
+        :param missing: passed through to ``Cache``.
+        :param getsizeof: passed through to ``Cache``.
+        """
+        Cache.__init__(self, maxsize, missing, getsizeof)
+        # Sentinel node: an empty list is the root linked to itself.
+        self.__root = root = _Link()
+        root.prev = root.next = root
+        self.__links = collections.OrderedDict()
+        self.__timer = _Timer(timer)
+        self.__ttl = ttl
+
+    def __contains__(self, key):
+        """Return whether *key* is stored and has not expired."""
+        try:
+            link = self.__links[key]  # no reordering
+        except KeyError:
+            return False
+        else:
+            return not (link.expire < self.__timer())
+
+    def __getitem__(self, key, cache_getitem=Cache.__getitem__):
+        """Return the value for *key*, treating an expired item as missing."""
+        try:
+            # __getlink also marks the key as most recently used.
+            link = self.__getlink(key)
+        except KeyError:
+            # Unknown key: let Cache.__getitem__ take its own missing path.
+            expired = False
+        else:
+            expired = link.expire < self.__timer()
+        if expired:
+            return self.__missing__(key)
+        else:
+            return cache_getitem(self, key)
+
+    def __setitem__(self, key, value, cache_setitem=Cache.__setitem__):
+        """Store *value* under *key* with a fresh expiry of now + ttl."""
+        # Freeze the clock so that purging and storing see the same reading.
+        with self.__timer as time:
+            self.expire(time)
+            cache_setitem(self, key, value)
+        try:
+            link = self.__getlink(key)
+        except KeyError:
+            self.__links[key] = link = _Link(key)
+        else:
+            # Existing key: take its node out before re-appending it below.
+            link.unlink()
+        link.expire = time + self.__ttl
+        # Append the node at the tail of the list, just before the sentinel.
+        link.next = root = self.__root
+        link.prev = prev = root.prev
+        prev.next = root.prev = link
+
+    def __delitem__(self, key, cache_delitem=Cache.__delitem__):
+        """Remove *key*; raise ``KeyError`` if it had already expired."""
+        cache_delitem(self, key)
+        link = self.__links.pop(key)
+        link.unlink()
+        # The item is removed either way, but an expired one is reported as
+        # if it had not been there.
+        if link.expire < self.__timer():
+            raise KeyError(key)
+
+    def __iter__(self):
+        """Yield the keys of unexpired items in linked-list order."""
+        root = self.__root
+        curr = root.next
+        while curr is not root:
+            # "freeze" time for iterator access
+            with self.__timer as time:
+                if not (curr.expire < time):
+                    yield curr.key
+            curr = curr.next
+
+    def __len__(self):
+        """Return the number of stored items that have not expired."""
+        root = self.__root
+        curr = root.next
+        time = self.__timer()
+        count = len(self.__links)
+        # Expired nodes are at the head of the list; subtract them.
+        while curr is not root and curr.expire < time:
+            count -= 1
+            curr = curr.next
+        return count
+
+    def __setstate__(self, state):
+        """Restore pickled state and rebuild the linked list."""
+        self.__dict__.update(state)
+        root = self.__root
+        root.prev = root.next = root
+        # _Link pickles only key and expiry, so re-link the nodes here in
+        # order of expiry.
+        for link in sorted(self.__links.values(), key=lambda obj: obj.expire):
+            link.next = root
+            link.prev = prev = root.prev
+            prev.next = root.prev = link
+        self.expire(self.__timer())
+
+    def __repr__(self, cache_repr=Cache.__repr__):
+        # Purge first so that expired items are not shown.
+        with self.__timer as time:
+            self.expire(time)
+            return cache_repr(self)
+
+    @property
+    def currsize(self):
+        """The current size of the cache, after purging expired items."""
+        with self.__timer as time:
+            self.expire(time)
+            return super(TTLCache, self).currsize
+
+    @property
+    def timer(self):
+        """The timer function used by the cache."""
+        return self.__timer
+
+    @property
+    def ttl(self):
+        """The time-to-live value of the cache's items."""
+        return self.__ttl
+
+    def expire(self, time=None):
+        """Remove expired items from the cache."""
+        if time is None:
+            time = self.__timer()
+        root = self.__root
+        curr = root.next
+        links = self.__links
+        cache_delitem = Cache.__delitem__
+        # Walk from the head and stop at the first node that is still valid.
+        while curr is not root and curr.expire < time:
+            cache_delitem(self, curr.key)
+            del links[curr.key]
+            next = curr.next
+            curr.unlink()
+            curr = next
+
+    def clear(self):
+        """Purge expired items, then remove everything that is left."""
+        with self.__timer as time:
+            self.expire(time)
+            Cache.clear(self)
+
+    # get/pop/setdefault run the inherited implementation with the clock
+    # frozen, so the membership test and the access that follows it see the
+    # same time reading.
+    def get(self, *args, **kwargs):
+        with self.__timer:
+            return Cache.get(self, *args, **kwargs)
+
+    def pop(self, *args, **kwargs):
+        with self.__timer:
+            return Cache.pop(self, *args, **kwargs)
+
+    def setdefault(self, *args, **kwargs):
+        with self.__timer:
+            return Cache.setdefault(self, *args, **kwargs)
+
+    def popitem(self):
+        """Remove and return the `(key, value)` pair least recently used that
+        has not already expired.
+
+        """
+        with self.__timer as time:
+            self.expire(time)
+            try:
+                # First key of the OrderedDict is the least recently used.
+                key = next(iter(self.__links))
+            except StopIteration:
+                raise KeyError('%s is empty' % self.__class__.__name__)
+            else:
+                return (key, self.pop(key))
+
+    # __getlink returns the node for a key and moves the key to the end of
+    # the OrderedDict (most recently used). Use move_to_end where OrderedDict
+    # provides it; otherwise fall back to pop-and-reinsert.
+    if hasattr(collections.OrderedDict, 'move_to_end'):
+        def __getlink(self, key):
+            value = self.__links[key]
+            self.__links.move_to_end(key)
+            return value
+    else:
+        def __getlink(self, key):
+            value = self.__links.pop(key)
+            self.__links[key] = value
+            return value
+
+
+#############################################
+#       END IMPORT OF CACHING LIBRARY       #
+#############################################
+
+# Module-level cache shared by list_organisms() and list_analyses().
+cache = TTLCache(
+    100,  # Up to 100 items
+    1 * 60  # 1 minute cache life (TTL is in time.time() units, i.e. seconds)
+)
+
+
+def _get_instance():
+    """Build a ``chado.ChadoInstance`` from the ``GALAXY_CHADO_*`` environment.
+
+    The variables are read in the order host, name, user, password, schema,
+    port and passed positionally in that order.
+
+    :raises KeyError: if any of the environment variables is not set.
+    """
+    variable_names = (
+        'GALAXY_CHADO_DBHOST',
+        'GALAXY_CHADO_DBNAME',
+        'GALAXY_CHADO_DBUSER',
+        'GALAXY_CHADO_DBPASS',
+        'GALAXY_CHADO_DBSCHEMA',
+        'GALAXY_CHADO_DBPORT',
+    )
+    connection_args = [os.environ[name] for name in variable_names]
+    # NOTE(review): no_reflect presumably skips schema reflection in
+    # python-chado; confirm against the library documentation.
+    return chado.ChadoInstance(*connection_args, no_reflect=True)
+
+
+def list_organisms(*args, **kwargs):
+    """Return the organism options, served from the module-level cache.
+
+    The cached list is stored under the fixed key ``'orgs'``, so the result
+    does not depend on the arguments; they are only forwarded to
+    ``_list_organisms`` when the list has to be fetched.
+
+    :returns: the list built by ``_list_organisms``.
+    :raises KeyError: on a cache miss, if a ``GALAXY_CHADO_*`` environment
+        variable needed by ``_get_instance`` is not set.
+    """
+    cache_key = 'orgs'
+    # Ask the cache directly rather than testing membership first: an entry
+    # can expire between a membership test and the lookup that follows it.
+    try:
+        return cache[cache_key]
+    except KeyError:
+        pass
+
+    # Cache miss: only now is a Chado instance needed.
+    data = _list_organisms(_get_instance(), *args, **kwargs)
+    cache[cache_key] = data
+    return data
+
+
+def _list_organisms(ci, *args, **kwargs):
+    """Fetch all organisms through *ci* and turn them into option tuples.
+
+    Each tuple is ``(display name, organism id as a string, False)``. The
+    display name is "genus species", with the infraspecific name appended in
+    parentheses when it is present and non-empty. Extra arguments are
+    accepted but not used.
+    """
+    # NOTE(review): the tuples presumably follow Galaxy's dynamic_options
+    # convention of (label, value, selected); confirm against the Galaxy docs.
+    def _display_name(organism):
+        label = '%s %s' % (organism['genus'], organism['species'])
+        has_infra = 'infraspecific_name' in organism
+        if has_infra and organism['infraspecific_name']:
+            label = label + ' (%s)' % (organism['infraspecific_name'])
+        return label
+
+    return [
+        (_display_name(organism), str(organism['organism_id']), False)
+        for organism in ci.organism.get_organisms()
+    ]
+
+
+def list_analyses(*args, **kwargs):
+    """Return the analysis options, served from the module-level cache.
+
+    The cached list is stored under the fixed key ``'analyses'``, so the
+    result does not depend on the arguments; they are only forwarded to
+    ``_list_analyses`` when the list has to be fetched.
+
+    :returns: the list built by ``_list_analyses``.
+    :raises KeyError: on a cache miss, if a ``GALAXY_CHADO_*`` environment
+        variable needed by ``_get_instance`` is not set.
+    """
+    cache_key = 'analyses'
+    # Ask the cache directly rather than testing membership first: an entry
+    # can expire between a membership test and the lookup that follows it.
+    try:
+        return cache[cache_key]
+    except KeyError:
+        pass
+
+    # Cache miss: only now is a Chado instance needed.
+    data = _list_analyses(_get_instance(), *args, **kwargs)
+    cache[cache_key] = data
+    return data
+
+
+def _list_analyses(ci, *args, **kwargs):
+    """Fetch all analyses through *ci* and turn them into option tuples.
+
+    Each tuple is ``(analysis name, analysis id as a string, False)``. Extra
+    arguments are accepted but not used.
+    """
+    # NOTE(review): the tuples presumably follow Galaxy's dynamic_options
+    # convention of (label, value, selected); confirm against the Galaxy docs.
+    return [
+        (analysis['name'], str(analysis['analysis_id']), False)
+        for analysis in ci.analysis.get_analyses()
+    ]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/feature_load_gff.xml	Thu Jun 21 08:45:47 2018 -0400
@@ -0,0 +1,70 @@
+<?xml version="1.0"?>
+<tool id="feature_load_gff" name="Chado load gff" version="@WRAPPER_VERSION@.0">
+    <description></description>
+    <macros>
+    	 <import>macros.xml</import>
+    </macros>
+    <code file="chado.py"/>
+    <expand macro="requirements"/>
+    <command detect_errors="aggressive"><![CDATA[
+@AUTH@
+
+chakin feature load_gff
+'$gff'
+'$analysis_id'
+'$organism'
+
+#if $landmark_type:
+  --landmark_type '$landmark_type'
+#end if
+#if $re_protein:
+  --re_protein '$re_protein'
+#end if
+#if $re_protein_capture:
+  --re_protein_capture '$re_protein_capture'
+#end if
+#if $fasta:
+  --fasta '$fasta'
+#end if
+#if $no_seq_compute:
+  $no_seq_compute
+#end if
+#if $add_only:
+  $add_only
+#end if
+
+--quiet
+
+ > $results
+    ]]></command>
+  	<inputs>
+          <!-- arguments -->
+      	<param name="gff" label="Gff" argument="gff" type="data" format="gff" help="Path to the GFF file to load" />
+      	<param argument="--analysis_id"
+      		type="select"
+      		dynamic_options="list_analyses()"
+      		label="Analysis" />
+          <param argument="--organism"
+                 type="select"
+                 dynamic_options="list_organisms()"
+                 label="Organism" />
+
+          <!-- options -->
+      	<param name="landmark_type" label="Landmark Type" argument="landmark_type" type="text" help="Type of the landmarks (will speed up loading if provided, e.g. contig, should be a term of the Sequence ontology)" />
+      	<param name="re_protein_capture" label="Regex protein capture" argument="re_protein_capture" type="text" help="Regular expression to capture groups in mRNA name to use in 'Regex protein' (e.g. '^(.*?)-R([A-Z]+)$', default='^(.*?)$' )" />
+      	<param name="re_protein" label="Regex protein" argument="re_protein" type="text" help="Replacement string for the protein name using capturing groups defined in 'Regex protein capture'" />
+      	<param name="fasta" label="Fasta" argument="fasta" type="data" format="fasta" help="A Fasta containing sequences for some features. When creating a feature, if its sequence is in this fasta file it will be loaded. Otherwise for mRNA and polypeptides it will be computed from the genome sequence (if available), otherwise it will be left empty." optional="true" />
+      	<param name="no_seq_compute" label="Allow computing missing sequences" argument="no_seq_compute" type="boolean" truevalue="" falsevalue="--no_seq_compute" help="Enable the computation of mRNA and polypeptides sequences based on genome sequence and positions." />
+      	<param name="add_only" label="Add only" argument="add_only" type="boolean" truevalue="--add_only" falsevalue="" help="Use this flag if you're not updating existing features, but just adding new features to the selected analysis and organism. It will speedup loading, and reduce memory usage, but might produce errors in case of already existing feature." />
+
+        <expand macro="wait_for"/>
+  	</inputs>
+  	<outputs>
+  		  <data format="txt" name="results"/>
+  	</outputs>
+  	<help>
+Load features from a gff file
+
+@HELP@
+  	</help>
+</tool>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Thu Jun 21 08:45:47 2018 -0400
@@ -0,0 +1,85 @@
+<?xml version="1.0"?>
+<macros>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="2.1.2">python-chado</requirement>
+            <yield/>
+        </requirements>
+    </xml>
+
+    <xml name="stdio">
+        <stdio>
+            <regex level="fatal" match="Exception:" source="stderr" />
+            <regex level="fatal" match="error" source="stderr" />
+            <exit_code range="1:" />
+        </stdio>
+    </xml>
+
+    <token name="@WRAPPER_VERSION@">2.1.1</token>
+
+    <xml name="citation">
+        <citations>
+        </citations>
+    </xml>
+
+    <token name="@HELP_OVERVIEW@"><![CDATA[
+        **Python-chado Overview**
+
+        Python-chado provides several tools for loading data into a remote Chado database.
+    ]]></token>
+
+    <token name="@HELP@"><![CDATA[
+    ]]></token>
+
+    <token name="@AUTH@"><![CDATA[
+        echo "__default: local" > '.auth.yml' &&
+        echo "local:" >> '.auth.yml' &&
+        echo "    dbhost: \"\$GALAXY_CHADO_DBHOST\"" >> '.auth.yml' &&
+        echo "    dbname: \"\$GALAXY_CHADO_DBNAME\"" >> '.auth.yml' &&
+        echo "    dbpass: \"\$GALAXY_CHADO_DBPASS\"" >> '.auth.yml' &&
+        echo "    dbuser: \"\$GALAXY_CHADO_DBUSER\"" >> '.auth.yml' &&
+        echo "    dbschema: \"\$GALAXY_CHADO_DBSCHEMA\"" >> '.auth.yml' &&
+        echo "    dbport: \"\$GALAXY_CHADO_DBPORT\"" >> '.auth.yml' &&
+
+        CHAKIN_GLOBAL_CONFIG_PATH='.auth.yml'
+    ]]></token>
+
+    <xml name="sanitized">
+        <sanitizer>
+            <valid initial="string.printable">
+                <remove value="&apos;"/>
+            </valid>
+            <mapping initial="none">
+                <add source="&apos;" target="&apos;&quot;&apos;&quot;&apos;"/>
+                <add source="(" target="\("/>
+                <add source=")" target="\)"/>
+            </mapping>
+        </sanitizer>
+    </xml>
+
+    <!-- I'm not proud of it, but it is needed for workflows -->
+    <xml name="wait_for">
+        <param name="wait_for"
+               type="data"
+               format="data"
+               optional="true"
+               label="Run this only after the following dataset is ready"
+               help="Use this if you want to delay the job execution until some data is already loaded. The selected dataset will not be used for anything else."/>
+    </xml>
+
+    <xml name="feature_rel">
+        <param name="rel_subject_re"
+               argument="--rel-subject-re"
+               type="text"
+               label="Regular expression to extract the unique name of the parent feature"
+               help="this regex will be applied on the fasta definition line to generate the unique name of the parent feature">
+            <expand macro="sanitized"/>
+        </param>
+
+        <param name="rel_subject_type"
+               argument="--rel-subject-type"
+               type="text"
+               label="Sequence type of the parent"
+               help="this should be a Sequence Ontology term" />
+    </xml>
+</macros>