diff planemo/lib/python3.7/site-packages/rdflib/compare.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/planemo/lib/python3.7/site-packages/rdflib/compare.py	Fri Jul 31 00:32:28 2020 -0400
@@ -0,0 +1,562 @@
+# -*- coding: utf-8 -*-
+"""
+A collection of utilities for canonicalizing and inspecting graphs.
+
+Among other things, they solve of the problem of deterministic bnode
+comparisons.
+
+Warning: the time to canonicalize bnodes may increase exponentially on
+degenerate larger graphs. Use with care!
+
+Example of comparing two graphs::
+
+    >>> g1 = Graph().parse(format='n3', data='''
+    ...     @prefix : <http://example.org/ns#> .
+    ...     <http://example.org> :rel
+    ...         <http://example.org/same>,
+    ...         [ :label "Same" ],
+    ...         <http://example.org/a>,
+    ...         [ :label "A" ] .
+    ... ''')
+    >>> g2 = Graph().parse(format='n3', data='''
+    ...     @prefix : <http://example.org/ns#> .
+    ...     <http://example.org> :rel
+    ...         <http://example.org/same>,
+    ...         [ :label "Same" ],
+    ...         <http://example.org/b>,
+    ...         [ :label "B" ] .
+    ... ''')
+    >>>
+    >>> iso1 = to_isomorphic(g1)
+    >>> iso2 = to_isomorphic(g2)
+
+These are not isomorphic::
+
+    >>> iso1 == iso2
+    False
+
+Diff the two graphs::
+
+    >>> in_both, in_first, in_second = graph_diff(iso1, iso2)
+
+Present in both::
+
+    >>> def dump_nt_sorted(g):
+    ...     for l in sorted(g.serialize(format='nt').splitlines()):
+    ...         if l: print(l.decode('ascii'))
+
+    >>> dump_nt_sorted(in_both) #doctest: +SKIP
+    <http://example.org>
+        <http://example.org/ns#rel> <http://example.org/same> .
+    <http://example.org>
+        <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
+    _:cbcaabaaba17fecbc304a64f8edee4335e
+        <http://example.org/ns#label> "Same" .
+
+Only in first::
+
+    >>> dump_nt_sorted(in_first) #doctest: +SKIP
+    <http://example.org>
+        <http://example.org/ns#rel> <http://example.org/a> .
+    <http://example.org>
+        <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
+    _:cb124e4c6da0579f810c0ffe4eff485bd9
+        <http://example.org/ns#label> "A" .
+
+Only in second::
+
+    >>> dump_nt_sorted(in_second) #doctest: +SKIP
+    <http://example.org>
+        <http://example.org/ns#rel> <http://example.org/b> .
+    <http://example.org>
+        <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
+    _:cb558f30e21ddfc05ca53108348338ade8
+        <http://example.org/ns#label> "B" .
+"""
+
+
+# TODO:
+# - Doesn't handle quads.
+# - Add warning and/or safety mechanism before working on large graphs?
+# - use this in existing Graph.isomorphic?
+
+__all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic',
+           'to_canonical_graph', 'graph_diff', 'similar']
+
+from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate
+from rdflib.term import BNode, Node
+try:
+    import hashlib
+    sha256 = hashlib.sha256
+except ImportError:
+    # for Python << 2.5
+    import sha256
+    sha256 = sha256.new
+
+from datetime import datetime
+from collections import defaultdict
+
+
+def _total_seconds(td):
+    result = td.days * 24 * 60 * 60
+    result += td.seconds
+    result += td.microseconds / 1000000.0
+    return result
+
+
+class _runtime(object):
+    def __init__(self, label):
+        self.label = label
+
+    def __call__(self, f):
+        if self.label is None:
+            self.label = f.__name__ + "_runtime"
+
+        def wrapped_f(*args, **kwargs):
+            start = datetime.now()
+            result = f(*args, **kwargs)
+            if 'stats' in kwargs and kwargs['stats'] is not None:
+                stats = kwargs['stats']
+                stats[self.label] = _total_seconds(datetime.now() - start)
+            return result
+        return wrapped_f
+
+
+class _call_count(object):
+    def __init__(self, label):
+        self.label = label
+
+    def __call__(self, f):
+        if self.label is None:
+            self.label = f.__name__ + "_runtime"
+
+        def wrapped_f(*args, **kwargs):
+            if 'stats' in kwargs and kwargs['stats'] is not None:
+                stats = kwargs['stats']
+                if self.label not in stats:
+                    stats[self.label] = 0
+                stats[self.label] += 1
+            return f(*args, **kwargs)
+        return wrapped_f
+
+
+class IsomorphicGraph(ConjunctiveGraph):
+    """An implementation of the RGDA1 graph digest algorithm.
+
+    An implementation of RGDA1 (publication below),
+    a combination of Sayers & Karp's graph digest algorithm using
+    sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
+    and traces <http://pallini.di.uniroma1.it>, an average case
+    polynomial time algorithm for graph canonicalization.
+    
+    McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web.
+    Rensselaer Polytechnic Institute, Troy, NY.
+    http://gradworks.umi.com/3727015.pdf
+    """
+
+    def __init__(self, **kwargs):
+        super(IsomorphicGraph, self).__init__(**kwargs)
+
+    def __eq__(self, other):
+        """Graph isomorphism testing."""
+        if not isinstance(other, IsomorphicGraph):
+            return False
+        elif len(self) != len(other):
+            return False
+        return self.internal_hash() == other.internal_hash()
+
+    def __ne__(self, other):
+        """Negative graph isomorphism testing."""
+        return not self.__eq__(other)
+
+    def graph_digest(self, stats=None):
+        """Synonym for IsomorphicGraph.internal_hash."""
+        return self.internal_hash(stats=stats)
+
+    def internal_hash(self, stats=None):
+        """
+        This is defined instead of __hash__ to avoid a circular recursion
+        scenario with the Memory store for rdflib which requires a hash lookup
+        in order to return a generator of triples.
+        """
+        return _TripleCanonicalizer(self).to_hash(stats=stats)
+
+
+class Color:
+    def __init__(self, nodes, hashfunc, color=(), hash_cache=None):
+        if hash_cache is None:
+            hash_cache = {}
+        self._hash_cache = hash_cache
+        self.color = color
+        self.nodes = nodes
+        self.hashfunc = hashfunc
+        self._hash_color = None
+
+    def key(self):
+        return (len(self.nodes), self.hash_color())
+
+    def hash_color(self, color=None):
+        if color is None:
+            color = self.color
+        if color in self._hash_cache:
+            return self._hash_cache[color]
+
+        def stringify(x):
+            if isinstance(x, Node):
+                return x.n3()
+            else:
+                return str(x)
+        if isinstance(color, Node):
+            return stringify(color)
+        value = 0
+        for triple in color:
+            value += self.hashfunc(' '.join([stringify(x) for x in triple]))
+        val = "%x" % value
+        self._hash_cache[color] = val
+        return val
+
+    def distinguish(self, W, graph):
+        colors = {}
+        for n in self.nodes:
+            new_color = list(self.color)
+            for node in W.nodes:
+                new_color += [
+                    (1, p, W.hash_color())
+                    for s, p, o in graph.triples((n, None, node))]
+                new_color += [
+                    (W.hash_color(), p, 3)
+                    for s, p, o in graph.triples((node, None, n))]
+            new_color = tuple(new_color)
+            new_hash_color = self.hash_color(new_color)
+
+            if new_hash_color not in colors:
+                c = Color(
+                    [], self.hashfunc, new_color,
+                    hash_cache=self._hash_cache)
+                colors[new_hash_color] = c
+            colors[new_hash_color].nodes.append(n)
+        return list(colors.values())
+
+    def discrete(self):
+        return len(self.nodes) == 1
+
+    def copy(self):
+        return Color(
+            self.nodes[:], self.hashfunc, self.color,
+            hash_cache=self._hash_cache)
+
+
+class _TripleCanonicalizer(object):
+
+    def __init__(self, graph, hashfunc=sha256):
+        self.graph = graph
+
+        def _hashfunc(s):
+            h = hashfunc()
+            h.update(str(s).encode("utf8"))
+            return int(h.hexdigest(), 16)
+        self._hash_cache = {}
+        self.hashfunc = _hashfunc
+
+
+    def _discrete(self, coloring):
+        return len([c for c in coloring if not c.discrete()]) == 0
+
+
+    def _initial_color(self):
+        """Finds an initial color for the graph.
+
+        Finds an initial color fo the graph by finding all blank nodes and
+        non-blank nodes that are adjacent. Nodes that are not adjacent to blank
+        nodes are not included, as they are a) already colored (by URI or literal)
+        and b) do not factor into the color of any blank node.
+        """
+        bnodes = set()
+        others = set()
+        self._neighbors = defaultdict(set)
+        for s, p, o in self.graph:
+            nodes = set([s, o])
+            b = set([x for x in nodes if isinstance(x, BNode)])
+            if len(b) > 0:
+                others |= nodes - b
+                bnodes |= b
+                if isinstance(s, BNode):
+                    self._neighbors[s].add(o)
+                if isinstance(o, BNode):
+                    self._neighbors[o].add(s)
+        if len(bnodes) > 0:
+            return [
+                    Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
+                ] + [
+                    Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
+                    for x in others
+                ]
+        else:
+            return []
+
+    def _individuate(self, color, individual):
+        new_color = list(color.color)
+        new_color.append((len(color.nodes),))
+
+        color.nodes.remove(individual)
+        c = Color([individual], self.hashfunc, tuple(new_color),
+                  hash_cache=self._hash_cache)
+        return c
+
+    def _get_candidates(self, coloring):
+        candidates = [c for c in coloring if not c.discrete()]
+        for c in [c for c in coloring if not c.discrete()]:
+            for node in c.nodes:
+                yield node, c
+
+    def _refine(self, coloring, sequence):
+        sequence = sorted(sequence, key=lambda x: x.key(), reverse=True)
+        coloring = coloring[:]
+        while len(sequence) > 0 and not self._discrete(coloring):
+            W = sequence.pop()
+            for c in coloring[:]:
+                if len(c.nodes) > 1:
+                    colors = sorted(c.distinguish(W, self.graph),
+                                    key=lambda x: x.key(),
+                                    reverse=True)
+                    coloring.remove(c)
+                    coloring.extend(colors)
+                    try:
+                        si = sequence.index(c)
+                        sequence = sequence[:si] + colors + sequence[si+1:]
+                    except ValueError:
+                        sequence = colors[1:] + sequence
+
+        return coloring
+
+    @_runtime("to_hash_runtime")
+    def to_hash(self, stats=None):
+        result = 0
+        for triple in self.canonical_triples(stats=stats):
+            result += self.hashfunc(' '.join([x.n3() for x in triple]))
+        if stats is not None:
+            stats['graph_digest'] = "%x" % result
+        return result
+
+    def _experimental_path(self, coloring):
+        coloring = [c.copy() for c in coloring]
+        while not self._discrete(coloring):
+            color = [x for x in coloring if not x.discrete()][0]
+            node = color.nodes[0]
+            new_color = self._individuate(color, node)
+            coloring.append(new_color)
+            coloring = self._refine(coloring, [new_color])
+        return coloring
+
+    def _create_generator(self, colorings, groupings=None):
+        if not groupings:
+            groupings = defaultdict(set)
+        for group in zip(*colorings):
+            g = set([c.nodes[0] for c in group])
+            for n in group:
+                g |= groupings[n]
+            for n in g:
+                groupings[n] = g
+        return groupings
+
+    @_call_count("individuations")
+    def _traces(self, coloring, stats=None, depth=[0]):
+        if stats is not None and 'prunings' not in stats:
+            stats['prunings'] = 0
+        depth[0] += 1
+        candidates = self._get_candidates(coloring)
+        best = []
+        best_score = None
+        best_experimental = None
+        best_experimental_score = None
+        last_coloring = None
+        generator = defaultdict(set)
+        visited = set()
+        for candidate, color in candidates:
+            if candidate in generator:
+                v = generator[candidate] & visited
+                if len(v) > 0:
+                    visited.add(candidate)
+                    continue
+            visited.add(candidate)
+            coloring_copy = []
+            color_copy = None
+            for c in coloring:
+                c_copy = c.copy()
+                coloring_copy.append(c_copy)
+                if c == color:
+                    color_copy = c_copy
+            new_color = self._individuate(color_copy, candidate)
+            coloring_copy.append(new_color)
+            refined_coloring = self._refine(coloring_copy, [new_color])
+            color_score = tuple([c.key() for c in refined_coloring])
+            experimental = self._experimental_path(coloring_copy)
+            experimental_score = set([c.key() for c in experimental])
+            if last_coloring:
+                generator = self._create_generator(
+                    [last_coloring, experimental],
+                    generator)
+            last_coloring = experimental
+            if best_score is None or best_score < color_score:
+                best = [refined_coloring]
+                best_score = color_score
+                best_experimental = experimental
+                best_experimental_score = experimental_score
+            elif best_score > color_score:
+                # prune this branch.
+                if stats is not None:
+                    stats['prunings'] += 1
+            elif experimental_score != best_experimental_score:
+                best.append(refined_coloring)
+            else:
+                # prune this branch.
+                if stats is not None:
+                    stats['prunings'] += 1
+        discrete = [x for x in best if self._discrete(x)]
+        if len(discrete) == 0:
+            best_score = None
+            best_depth = None
+            for coloring in best:
+                d = [depth[0]]
+                new_color = self._traces(coloring, stats=stats, depth=d)
+                color_score = tuple([c.key() for c in refined_coloring])
+                if best_score is None or color_score > best_score:
+                    discrete = [new_color]
+                    best_score = color_score
+                    best_depth = d[0]
+            depth[0] = best_depth
+        return discrete[0]
+
+    def canonical_triples(self, stats=None):
+        if stats is not None:
+            start_canonicalization = datetime.now()
+        if stats is not None:
+            start_coloring = datetime.now()
+        coloring = self._initial_color()
+        if stats is not None:
+            stats['triple_count'] = len(self.graph)
+            stats['adjacent_nodes'] = max(0, len(coloring) - 1)
+        coloring = self._refine(coloring, coloring[:])
+        if stats is not None:
+            stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
+            stats['initial_color_count'] = len(coloring)
+
+        if not self._discrete(coloring):
+            depth = [0]
+            coloring = self._traces(coloring, stats=stats, depth=depth)
+            if stats is not None:
+                stats['tree_depth'] = depth[0]
+        elif stats is not None:
+            stats['individuations'] = 0
+            stats['tree_depth'] = 0
+        if stats is not None:
+            stats['color_count'] = len(coloring)
+
+        bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
+        if stats is not None:
+            stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
+        for triple in self.graph:
+            result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
+            yield result
+
+    def _canonicalize_bnodes(self, triple, labels):
+        for term in triple:
+            if isinstance(term, BNode):
+                yield BNode(value="cb%s" % labels[term])
+            else:
+                yield term
+
+
+def to_isomorphic(graph):
+    if isinstance(graph, IsomorphicGraph):
+        return graph
+    return IsomorphicGraph(store=graph.store)
+
+
+def isomorphic(graph1, graph2):
+    """Compare graph for equality.
+
+    Uses an algorithm to compute unique hashes which takes bnodes into account.
+
+    Examples::
+
+        >>> g1 = Graph().parse(format='n3', data='''
+        ...     @prefix : <http://example.org/ns#> .
+        ...     <http://example.org> :rel <http://example.org/a> .
+        ...     <http://example.org> :rel <http://example.org/b> .
+        ...     <http://example.org> :rel [ :label "A bnode." ] .
+        ... ''')
+        >>> g2 = Graph().parse(format='n3', data='''
+        ...     @prefix ns: <http://example.org/ns#> .
+        ...     <http://example.org> ns:rel [ ns:label "A bnode." ] .
+        ...     <http://example.org> ns:rel <http://example.org/b>,
+        ...             <http://example.org/a> .
+        ... ''')
+        >>> isomorphic(g1, g2)
+        True
+
+        >>> g3 = Graph().parse(format='n3', data='''
+        ...     @prefix : <http://example.org/ns#> .
+        ...     <http://example.org> :rel <http://example.org/a> .
+        ...     <http://example.org> :rel <http://example.org/b> .
+        ...     <http://example.org> :rel <http://example.org/c> .
+        ... ''')
+        >>> isomorphic(g1, g3)
+        False
+    """
+    gd1 = _TripleCanonicalizer(graph1).to_hash()
+    gd2 = _TripleCanonicalizer(graph2).to_hash()
+    return gd1 == gd2
+
+
+
+def to_canonical_graph(g1):
+    """Creates a canonical, read-only graph.
+
+    Creates a canonical, read-only graph where all bnode id:s are based on
+    deterministical SHA-256 checksums, correlated with the graph contents.
+    """
+    graph = Graph()
+    graph += _TripleCanonicalizer(g1).canonical_triples()
+    return ReadOnlyGraphAggregate([graph])
+
+
+def graph_diff(g1, g2):
+    """Returns three sets of triples: "in both", "in first" and "in second"."""
+    # bnodes have deterministic values in canonical graphs:
+    cg1 = to_canonical_graph(g1)
+    cg2 = to_canonical_graph(g2)
+    in_both = cg1 * cg2
+    in_first = cg1 - cg2
+    in_second = cg2 - cg1
+    return (in_both, in_first, in_second)
+
+
+
+_MOCK_BNODE = BNode()
+
+
+def similar(g1, g2):
+    """Checks if the two graphs are "similar".
+
+    Checks if the two graphs are "similar", by comparing sorted triples where
+    all bnodes have been replaced by a singular mock bnode (the
+    ``_MOCK_BNODE``).
+
+    This is a much cheaper, but less reliable, alternative to the comparison
+    algorithm in ``isomorphic``.
+    """
+    return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))
+
+
+def _squashed_graphs_triples(g1, g2):
+    for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))):
+        yield t1, t2
+
+
+def _squash_graph(graph):
+    return (_squash_bnodes(triple) for triple in graph)
+
+
+def _squash_bnodes(triple):
+    return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)