diff env/lib/python3.7/site-packages/rdflib/util.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/env/lib/python3.7/site-packages/rdflib/util.py	Sat May 02 07:14:21 2020 -0400
@@ -0,0 +1,502 @@
+"""
+Some utility functions.
+
+Miscellaneous utilities
+
+* list2set
+* first
+* uniq
+* more_than
+
+Term characterisation and generation
+
+* to_term
+* from_n3
+
+Date/time utilities
+
+* date_time
+* parse_date_time
+
+Statement and component type checkers
+
+* check_context
+* check_subject
+* check_predicate
+* check_object
+* check_statement
+* check_pattern
+
+"""
+
+from calendar import timegm
+from time import altzone
+# from time import daylight
+from time import gmtime
+from time import localtime
+from time import time
+from time import timezone
+
+from os.path import splitext
+from io import StringIO
+
+from rdflib.exceptions import ContextTypeError
+from rdflib.exceptions import ObjectTypeError
+from rdflib.exceptions import PredicateTypeError
+from rdflib.exceptions import SubjectTypeError
+from rdflib.graph import Graph
+from rdflib.graph import QuotedGraph
+from rdflib.namespace import Namespace
+from rdflib.namespace import NamespaceManager
+from rdflib.term import BNode
+from rdflib.term import Literal
+from rdflib.term import URIRef
+from rdflib.py3compat import sign
+
# Public names exported by this module; keep in sync with the helper
# definitions below when adding new utilities.
__all__ = [
    'list2set', 'first', 'uniq', 'more_than', 'to_term', 'from_n3',
    'date_time', 'parse_date_time', 'check_context', 'check_subject',
    'check_predicate', 'check_object', 'check_statement', 'check_pattern',
    'guess_format', 'find_roots', 'get_tree']
+
+
def list2set(seq):
    """
    Return a new list with the duplicates of ``seq`` removed,
    keeping the first occurrence of each element (unlike ``set(seq)``,
    which loses the order).
    """
    # dict preserves insertion order (Python 3.7+), so fromkeys gives us
    # ordered de-duplication in one pass.
    return list(dict.fromkeys(seq))
+
+
def first(seq):
    """
    Return the first element of a Python sequence/iterable, or ``None``
    when it is empty. For graphs, use ``graph.value`` instead.
    """
    return next(iter(seq), None)
+
+
def uniq(sequence, strip=0):
    """Return the set of strings in ``sequence``, de-duplicated.

    When ``strip`` is truthy, each item is whitespace-stripped before
    being collected.
    """
    if not strip:
        return set(sequence)
    return {item.strip() for item in sequence}
+
+
def more_than(sequence, number):
    """Return 1 if ``sequence`` yields more items than ``number``, else 0.

    Works on arbitrary iterables and stops consuming as soon as the
    threshold is exceeded.
    """
    for count, _ in enumerate(sequence, start=1):
        if count > number:
            return 1
    return 0
+
+
def to_term(s, default=None):
    """
    Creates and returns an Identifier of type corresponding
    to the pattern of the given positional argument string ``s``:

    '' returns the ``default`` keyword argument value or ``None``

    '<s>' returns ``URIRef(s)`` (i.e. without angle brackets)

    '"s"' returns ``Literal(s)`` (i.e. without doublequotes)

    '_s' returns ``BNode(s)`` (i.e. without leading underscore)

    Raises a plain ``Exception`` for any other pattern.
    """
    if not s:
        return default
    if s.startswith("<") and s.endswith(">"):
        return URIRef(s[1:-1])
    if s.startswith('"') and s.endswith('"'):
        return Literal(s[1:-1])
    if s.startswith("_"):
        return BNode(s)
    raise Exception("Unrecognised term syntax: '%s'" % s)
+
+
def from_n3(s, default=None, backend=None, nsm=None):
    r'''
    Creates the Identifier corresponding to the given n3 string.

    :param s: the N3/Turtle token to parse ('<uri>', '"literal"',
        '_:b0', 'prefix:name', 'true'/'false', '42', '{...}', '[...]').
    :param default: returned unchanged when ``s`` is empty/falsy.
    :param backend: store handed to ``Graph``/``QuotedGraph`` for the
        formula ('{...}') and graph ('[...]') cases.
    :param nsm: ``NamespaceManager`` used to resolve prefixed names;
        a default one is created on demand.

        >>> from_n3('<http://ex.com/foo>') == URIRef('http://ex.com/foo')
        True
        >>> from_n3('"foo"@de') == Literal('foo', lang='de')
        True
        >>> from_n3('"""multi\nline\nstring"""@en') == Literal(
        ...     'multi\nline\nstring', lang='en')
        True
        >>> from_n3('42') == Literal(42)
        True
        >>> from_n3(Literal(42).n3()) == Literal(42)
        True
        >>> from_n3('"42"^^xsd:integer') == Literal(42)
        True
        >>> from rdflib import RDFS
        >>> from_n3('rdfs:label') == RDFS['label']
        True
        >>> nsm = NamespaceManager(Graph())
        >>> nsm.bind('dbpedia', 'http://dbpedia.org/resource/')
        >>> berlin = URIRef('http://dbpedia.org/resource/Berlin')
        >>> from_n3('dbpedia:Berlin', nsm=nsm) == berlin
        True

    '''
    if not s:
        return default
    if s.startswith('<'):
        # NOTE(review): the trailing '>' is assumed, not validated.
        return URIRef(s[1:-1])
    elif s.startswith('"'):
        # Literal: may be '"..."' or '"""..."""', optionally followed by
        # a language tag (@xx) or a datatype (^^<uri> / ^^prefix:name).
        if s.startswith('"""'):
            quotes = '"""'
        else:
            quotes = '"'
        value, rest = s.rsplit(quotes, 1)
        value = value[len(quotes):]  # strip leading quotes
        datatype = None
        language = None

        # as a given datatype overrules lang-tag check for it first
        dtoffset = rest.rfind('^^')
        if dtoffset >= 0:
            # found a datatype
            # datatype has to come after lang-tag so ignore everything before
            # see: http://www.w3.org/TR/2011/WD-turtle-20110809/
            # #prod-turtle2-RDFLiteral
            datatype = from_n3(rest[dtoffset + 2:], default, backend, nsm)
        else:
            if rest.startswith("@"):
                language = rest[1:]  # strip leading at sign

        value = value.replace(r'\"', '"')
        # Hack: this should correctly handle strings with either native unicode
        # characters, or \u1234 unicode escapes.
        value = value.encode("raw-unicode-escape").decode("unicode-escape")
        return Literal(value, language, datatype)
    elif s == 'true' or s == 'false':
        # Bare boolean token.
        return Literal(s == 'true')
    elif s.isdigit():
        # Bare unsigned integer token.
        return Literal(int(s))
    elif s.startswith('{'):
        # N3 formula (quoted graph).
        identifier = from_n3(s[1:-1])
        return QuotedGraph(backend, identifier)
    elif s.startswith('['):
        identifier = from_n3(s[1:-1])
        return Graph(backend, identifier)
    elif s.startswith("_:"):
        return BNode(s[2:])
    elif ':' in s:
        # Prefixed name, e.g. 'rdfs:label'; raises KeyError when the
        # prefix is not bound in nsm.
        if nsm is None:
            # instantiate default NamespaceManager and rely on its defaults
            nsm = NamespaceManager(Graph())
        prefix, last_part = s.split(':', 1)
        ns = dict(nsm.namespaces())[prefix]
        return Namespace(ns)[last_part]
    else:
        # Fallback: treat the whole token as a blank-node label.
        return BNode(s)
+
+
def check_context(c):
    """Test that ``c`` is a valid context identifier (URIRef or BNode)."""
    if not isinstance(c, (URIRef, BNode)):
        raise ContextTypeError("%s:%s" % (c, type(c)))
+
+
def check_subject(s):
    """Test that ``s`` is a valid subject identifier (URIRef or BNode)."""
    if not isinstance(s, (URIRef, BNode)):
        raise SubjectTypeError(s)
+
+
def check_predicate(p):
    """Test that ``p`` is a valid predicate identifier (URIRef only)."""
    if isinstance(p, URIRef):
        return
    raise PredicateTypeError(p)
+
+
def check_object(o):
    """Test that ``o`` is a valid object identifier
    (URIRef, Literal or BNode)."""
    if not isinstance(o, (URIRef, Literal, BNode)):
        raise ObjectTypeError(o)
+
+
def check_statement(triple):
    """Validate a complete (subject, predicate, object) statement,
    raising the matching *TypeError exception for the first bad slot."""
    s, p, o = triple

    if not isinstance(s, (URIRef, BNode)):
        raise SubjectTypeError(s)
    if not isinstance(p, URIRef):
        raise PredicateTypeError(p)
    if not isinstance(o, (URIRef, Literal, BNode)):
        raise ObjectTypeError(o)
+
+
def check_pattern(triple):
    """Validate a triple *pattern*: each slot may be falsy (a wildcard)
    or must be of the type valid for that position."""
    s, p, o = triple

    # Truthiness (not an explicit None check) marks a slot as bound,
    # mirroring how patterns are built elsewhere.
    if s and not isinstance(s, (URIRef, BNode)):
        raise SubjectTypeError(s)
    if p and not isinstance(p, URIRef):
        raise PredicateTypeError(p)
    if o and not isinstance(o, (URIRef, Literal, BNode)):
        raise ObjectTypeError(o)
+
+
def date_time(t=None, local_time_zone=False):
    """Format a timestamp per http://www.w3.org/TR/NOTE-datetime,
    e.g. '1997-07-16T19:20:30Z'.

    :param t: seconds since the epoch; defaults to the current time.
    :param local_time_zone: if true, render in the local zone with a
        numeric UTC offset instead of UTC with a 'Z' suffix.

    >>> date_time(1126482850)
    '2005-09-11T23:54:10Z'

    @@ this will change depending on where it is run
    #>>> date_time(1126482850, local_time_zone=True)
    #'2005-09-11T19:54:10-04:00'

    >>> date_time(1)
    '1970-01-01T00:00:01Z'

    >>> date_time(0)
    '1970-01-01T00:00:00Z'
    """
    if t is None:
        t = time()

    if local_time_zone:
        time_tuple = localtime(t)
        # time.altzone / time.timezone are seconds *west* of UTC
        # (positive in the Americas, negative east of Greenwich).
        if time_tuple[8]:  # tm_isdst
            tz_mins = altzone // 60
        else:
            tz_mins = timezone // 60
        # Fix: the old "-%02d:%02d" format hard-coded a '-' sign and so
        # produced garbage like '--2:00' for zones east of UTC; emit a
        # properly signed ISO 8601 offset instead.
        tz_sign = "-" if tz_mins > 0 else "+"
        tz_mins = abs(tz_mins)
        tzd = "%s%02d:%02d" % (tz_sign, tz_mins // 60, tz_mins % 60)
    else:
        time_tuple = gmtime(t)
        tzd = "Z"

    year, month, day, hh, mm, ss = time_tuple[:6]
    return "%04d-%02d-%02dT%02d:%02d:%02d%s" % (
        year, month, day, hh, mm, ss, tzd)
+
+
def parse_date_time(val):
    """Parse a W3C NOTE-datetime string; always returns seconds in UTC.

    A date without a time part is treated as midnight UTC; a time
    without a zone designator is treated as UTC.

    # tests are written like this to make any errors easier to understand
    >>> parse_date_time('2005-09-11T23:54:10Z') - 1126482850.0
    0.0

    >>> parse_date_time('2005-09-11T16:54:10-07:00') - 1126482850.0
    0.0

    >>> parse_date_time('2005-09-11T23:24:10-00:30') - 1126482850.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:01Z') - 1.0
    0.0

    >>> parse_date_time('1970-01-01T00:00:00Z') - 0.0
    0.0
    >>> parse_date_time("2005-09-05T10:42:00") - 1125916920.0
    0.0
    """

    if "T" not in val:
        val += "T00:00:00Z"

    ymd, clock = val.split("T")
    hms, tz_str = clock[0:8], clock[8:]

    if not tz_str or tz_str == "Z":
        tz_offset = 0
    else:
        # tz_str is '+HH:MM' or '-HH:MM'. Parse the sign character
        # explicitly: the old int(tz_str[:3])-based code lost the sign
        # for zero-hour offsets (int('-00') == 0), so '-00:30' parsed
        # as UTC.
        negative = tz_str.startswith("-")
        hrs = int(tz_str[1:3])
        mins = int(tz_str[4:6])
        secs = (hrs * 60 + mins) * 60
        # Local time that is *behind* UTC must be shifted forward.
        tz_offset = secs if negative else -secs

    year, month, day = ymd.split("-")
    hour, minute, second = hms.split(":")

    t = timegm((int(year), int(month), int(day), int(hour),
                int(minute), int(second), 0, 0, 0))
    return t + tz_offset
+
+
+
+
+
# Maps common RDF file suffixes (lower-case, no leading dot) to the
# parser/serializer format names used by guess_format() below.
SUFFIX_FORMAT_MAP = {
    'rdf': 'xml',
    'rdfs': 'xml',
    'owl': 'xml',
    'n3': 'n3',
    'ttl': 'turtle',
    'nt': 'nt',
    'trix': 'trix',
    'xhtml': 'rdfa',
    'html': 'rdfa',
    'svg': 'rdfa',
    'nq': 'nquads',
    'trig': 'trig'
}
+
+
def guess_format(fpath, fmap=None):
    """
    Guess RDF serialization based on file suffix. Uses
    ``SUFFIX_FORMAT_MAP`` unless ``fmap`` is provided. Examples:

        >>> guess_format('path/to/file.rdf')
        'xml'
        >>> guess_format('path/to/file.owl')
        'xml'
        >>> guess_format('path/to/file.ttl')
        'turtle'
        >>> guess_format('path/to/file.xhtml')
        'rdfa'
        >>> guess_format('path/to/file.svg')
        'rdfa'
        >>> guess_format('path/to/file.xhtml', {'xhtml': 'grddl'})
        'grddl'

    This also works with just the suffixes, with or without leading dot, and
    regardless of letter case::

        >>> guess_format('.rdf')
        'xml'
        >>> guess_format('rdf')
        'xml'
        >>> guess_format('RDF')
        'xml'
    """
    # A falsy fmap (None or {}) falls back to the module default.
    lookup = fmap or SUFFIX_FORMAT_MAP
    by_ext = lookup.get(_get_ext(fpath))
    # Fall back to treating the whole argument as a bare suffix.
    return by_ext or lookup.get(fpath.lower())
+
+
+def _get_ext(fpath, lower=True):
+    """
+    Gets the file extension from a file(path); stripped of leading '.' and in
+    lower case. Examples:
+
+        >>> _get_ext("path/to/file.txt")
+        'txt'
+        >>> _get_ext("OTHER.PDF")
+        'pdf'
+        >>> _get_ext("noext")
+        ''
+        >>> _get_ext(".rdf")
+        'rdf'
+    """
+    ext = splitext(fpath)[-1]
+    if ext == '' and fpath.startswith("."):
+        ext = fpath
+    if lower:
+        ext = ext.lower()
+    if ext.startswith('.'):
+        ext = ext[1:]
+    return ext
+
+
def find_roots(graph, prop, roots=None):
    """
    Collect the root nodes of the transitive hierarchy described by *prop*.

    Triples are assumed to point upward, ``(child, prop, parent)`` -- the
    direction of ``RDFS.subClassOf`` or ``SKOS.broader`` -- so a root is a
    node that appears as a parent but never as a child. For example::

        find_roots(graph, rdflib.RDFS.subClassOf)

    returns the set of all roots of the sub-class hierarchy.

    :param roots: optional pre-seeded set; it is updated in place and
        also returned.
    """
    if roots is None:
        roots = set()
    children = set()
    for child, parent in graph.subject_objects(prop):
        children.add(child)
        roots.discard(child)  # anything with a parent cannot be a root
        if parent not in children:
            roots.add(parent)
    return roots
+
+
def get_tree(graph,
             root,
             prop,
             mapper=lambda x: x,
             sortkey=None,
             done=None,
             dir='down'):
    """
    Return a nested ``(mapped_node, [children...])`` structure for the
    tree built by the transitive property *prop*, starting at *root*.

    For example::

        get_tree(graph,
           rdflib.URIRef("http://xmlns.com/foaf/0.1/Person"),
           rdflib.RDFS.subClassOf)

    returns the structure for the subClassTree below Person.

    dir='down' assumes triples of the form (child, prop, parent),
    i.e. the direction of RDFS.subClassOf or SKOS.broader; any other
    value traverses in the opposite direction.

    :param mapper: applied to every node before it is placed in the tree.
    :param sortkey: passed to ``sorted`` when ordering sibling subtrees.
    :param done: visited-set used internally to break cycles.
    """
    if done is None:
        done = set()
    if root in done:
        # Cycle guard: already visited, caller discards the None result.
        return
    done.add(root)

    if dir == 'down':
        children = graph.subjects(prop, root)
    else:
        children = graph.objects(root, prop)

    subtrees = []
    for child in children:
        subtree = get_tree(graph, child, prop, mapper, sortkey, done, dir)
        if subtree:
            subtrees.append(subtree)

    return (mapper(root), sorted(subtrees, key=sortkey))
+
+
+
+
def test():
    """Run this module's doctests."""
    import doctest
    doctest.testmod()
+
+if __name__ == "__main__":
+    # try to make the tests work outside of the time zone they were written in
+    # import os, time
+    # os.environ['TZ'] = 'US/Pacific'
+    # try:
+    #    time.tzset()
+    # except AttributeError, e:
+    #    print e
+        # pass
+        # tzset missing! see
+        # http://mail.python.org/pipermail/python-dev/2003-April/034480.html
+    test()  # pragma: no cover