Mercurial > repos > shellac > guppy_basecaller
diff env/lib/python3.7/site-packages/rdflib/compare.py @ 5:9b1c78e6ba9c draft default tip
"planemo upload commit 6c0a8142489327ece472c84e558c47da711a9142"
author | shellac |
---|---|
date | Mon, 01 Jun 2020 08:59:25 -0400 |
parents | 79f47841a781 |
children |
line wrap: on
line diff
--- a/env/lib/python3.7/site-packages/rdflib/compare.py Thu May 14 16:47:39 2020 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,562 +0,0 @@ -# -*- coding: utf-8 -*- -""" -A collection of utilities for canonicalizing and inspecting graphs. - -Among other things, they solve of the problem of deterministic bnode -comparisons. - -Warning: the time to canonicalize bnodes may increase exponentially on -degenerate larger graphs. Use with care! - -Example of comparing two graphs:: - - >>> g1 = Graph().parse(format='n3', data=''' - ... @prefix : <http://example.org/ns#> . - ... <http://example.org> :rel - ... <http://example.org/same>, - ... [ :label "Same" ], - ... <http://example.org/a>, - ... [ :label "A" ] . - ... ''') - >>> g2 = Graph().parse(format='n3', data=''' - ... @prefix : <http://example.org/ns#> . - ... <http://example.org> :rel - ... <http://example.org/same>, - ... [ :label "Same" ], - ... <http://example.org/b>, - ... [ :label "B" ] . - ... ''') - >>> - >>> iso1 = to_isomorphic(g1) - >>> iso2 = to_isomorphic(g2) - -These are not isomorphic:: - - >>> iso1 == iso2 - False - -Diff the two graphs:: - - >>> in_both, in_first, in_second = graph_diff(iso1, iso2) - -Present in both:: - - >>> def dump_nt_sorted(g): - ... for l in sorted(g.serialize(format='nt').splitlines()): - ... if l: print(l.decode('ascii')) - - >>> dump_nt_sorted(in_both) #doctest: +SKIP - <http://example.org> - <http://example.org/ns#rel> <http://example.org/same> . - <http://example.org> - <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e . - _:cbcaabaaba17fecbc304a64f8edee4335e - <http://example.org/ns#label> "Same" . - -Only in first:: - - >>> dump_nt_sorted(in_first) #doctest: +SKIP - <http://example.org> - <http://example.org/ns#rel> <http://example.org/a> . - <http://example.org> - <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 . - _:cb124e4c6da0579f810c0ffe4eff485bd9 - <http://example.org/ns#label> "A" . - -Only in second:: - - >>> dump_nt_sorted(in_second) #doctest: +SKIP - <http://example.org> - <http://example.org/ns#rel> <http://example.org/b> . - <http://example.org> - <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 . - _:cb558f30e21ddfc05ca53108348338ade8 - <http://example.org/ns#label> "B" . -""" - - -# TODO: -# - Doesn't handle quads. -# - Add warning and/or safety mechanism before working on large graphs? -# - use this in existing Graph.isomorphic? - -__all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic', - 'to_canonical_graph', 'graph_diff', 'similar'] - -from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate -from rdflib.term import BNode, Node -try: - import hashlib - sha256 = hashlib.sha256 -except ImportError: - # for Python << 2.5 - import sha256 - sha256 = sha256.new - -from datetime import datetime -from collections import defaultdict - - -def _total_seconds(td): - result = td.days * 24 * 60 * 60 - result += td.seconds - result += td.microseconds / 1000000.0 - return result - - -class _runtime(object): - def __init__(self, label): - self.label = label - - def __call__(self, f): - if self.label is None: - self.label = f.__name__ + "_runtime" - - def wrapped_f(*args, **kwargs): - start = datetime.now() - result = f(*args, **kwargs) - if 'stats' in kwargs and kwargs['stats'] is not None: - stats = kwargs['stats'] - stats[self.label] = _total_seconds(datetime.now() - start) - return result - return wrapped_f - - -class _call_count(object): - def __init__(self, label): - self.label = label - - def __call__(self, f): - if self.label is None: - self.label = f.__name__ + "_runtime" - - def wrapped_f(*args, **kwargs): - if 'stats' in kwargs and kwargs['stats'] is not None: - stats = kwargs['stats'] - if self.label not in stats: - stats[self.label] = 0 - stats[self.label] += 1 - return f(*args, **kwargs) - return wrapped_f - - -class IsomorphicGraph(ConjunctiveGraph): - """An implementation of the RGDA1 graph digest algorithm. - - An implementation of RGDA1 (publication below), - a combination of Sayers & Karp's graph digest algorithm using - sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf> - and traces <http://pallini.di.uniroma1.it>, an average case - polynomial time algorithm for graph canonicalization. - - McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web. - Rensselaer Polytechnic Institute, Troy, NY. - http://gradworks.umi.com/3727015.pdf - """ - - def __init__(self, **kwargs): - super(IsomorphicGraph, self).__init__(**kwargs) - - def __eq__(self, other): - """Graph isomorphism testing.""" - if not isinstance(other, IsomorphicGraph): - return False - elif len(self) != len(other): - return False - return self.internal_hash() == other.internal_hash() - - def __ne__(self, other): - """Negative graph isomorphism testing.""" - return not self.__eq__(other) - - def graph_digest(self, stats=None): - """Synonym for IsomorphicGraph.internal_hash.""" - return self.internal_hash(stats=stats) - - def internal_hash(self, stats=None): - """ - This is defined instead of __hash__ to avoid a circular recursion - scenario with the Memory store for rdflib which requires a hash lookup - in order to return a generator of triples. - """ - return _TripleCanonicalizer(self).to_hash(stats=stats) - - -class Color: - def __init__(self, nodes, hashfunc, color=(), hash_cache=None): - if hash_cache is None: - hash_cache = {} - self._hash_cache = hash_cache - self.color = color - self.nodes = nodes - self.hashfunc = hashfunc - self._hash_color = None - - def key(self): - return (len(self.nodes), self.hash_color()) - - def hash_color(self, color=None): - if color is None: - color = self.color - if color in self._hash_cache: - return self._hash_cache[color] - - def stringify(x): - if isinstance(x, Node): - return x.n3() - else: - return str(x) - if isinstance(color, Node): - return stringify(color) - value = 0 - for triple in color: - value += self.hashfunc(' '.join([stringify(x) for x in triple])) - val = "%x" % value - self._hash_cache[color] = val - return val - - def distinguish(self, W, graph): - colors = {} - for n in self.nodes: - new_color = list(self.color) - for node in W.nodes: - new_color += [ - (1, p, W.hash_color()) - for s, p, o in graph.triples((n, None, node))] - new_color += [ - (W.hash_color(), p, 3) - for s, p, o in graph.triples((node, None, n))] - new_color = tuple(new_color) - new_hash_color = self.hash_color(new_color) - - if new_hash_color not in colors: - c = Color( - [], self.hashfunc, new_color, - hash_cache=self._hash_cache) - colors[new_hash_color] = c - colors[new_hash_color].nodes.append(n) - return list(colors.values()) - - def discrete(self): - return len(self.nodes) == 1 - - def copy(self): - return Color( - self.nodes[:], self.hashfunc, self.color, - hash_cache=self._hash_cache) - - -class _TripleCanonicalizer(object): - - def __init__(self, graph, hashfunc=sha256): - self.graph = graph - - def _hashfunc(s): - h = hashfunc() - h.update(str(s).encode("utf8")) - return int(h.hexdigest(), 16) - self._hash_cache = {} - self.hashfunc = _hashfunc - - - def _discrete(self, coloring): - return len([c for c in coloring if not c.discrete()]) == 0 - - - def _initial_color(self): - """Finds an initial color for the graph. - - Finds an initial color fo the graph by finding all blank nodes and - non-blank nodes that are adjacent. Nodes that are not adjacent to blank - nodes are not included, as they are a) already colored (by URI or literal) - and b) do not factor into the color of any blank node. - """ - bnodes = set() - others = set() - self._neighbors = defaultdict(set) - for s, p, o in self.graph: - nodes = set([s, o]) - b = set([x for x in nodes if isinstance(x, BNode)]) - if len(b) > 0: - others |= nodes - b - bnodes |= b - if isinstance(s, BNode): - self._neighbors[s].add(o) - if isinstance(o, BNode): - self._neighbors[o].add(s) - if len(bnodes) > 0: - return [ - Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache) - ] + [ - Color([x], self.hashfunc, x, hash_cache=self._hash_cache) - for x in others - ] - else: - return [] - - def _individuate(self, color, individual): - new_color = list(color.color) - new_color.append((len(color.nodes),)) - - color.nodes.remove(individual) - c = Color([individual], self.hashfunc, tuple(new_color), - hash_cache=self._hash_cache) - return c - - def _get_candidates(self, coloring): - candidates = [c for c in coloring if not c.discrete()] - for c in [c for c in coloring if not c.discrete()]: - for node in c.nodes: - yield node, c - - def _refine(self, coloring, sequence): - sequence = sorted(sequence, key=lambda x: x.key(), reverse=True) - coloring = coloring[:] - while len(sequence) > 0 and not self._discrete(coloring): - W = sequence.pop() - for c in coloring[:]: - if len(c.nodes) > 1: - colors = sorted(c.distinguish(W, self.graph), - key=lambda x: x.key(), - reverse=True) - coloring.remove(c) - coloring.extend(colors) - try: - si = sequence.index(c) - sequence = sequence[:si] + colors + sequence[si+1:] - except ValueError: - sequence = colors[1:] + sequence - - return coloring - - @_runtime("to_hash_runtime") - def to_hash(self, stats=None): - result = 0 - for triple in self.canonical_triples(stats=stats): - result += self.hashfunc(' '.join([x.n3() for x in triple])) - if stats is not None: - stats['graph_digest'] = "%x" % result - return result - - def _experimental_path(self, coloring): - coloring = [c.copy() for c in coloring] - while not self._discrete(coloring): - color = [x for x in coloring if not x.discrete()][0] - node = color.nodes[0] - new_color = self._individuate(color, node) - coloring.append(new_color) - coloring = self._refine(coloring, [new_color]) - return coloring - - def _create_generator(self, colorings, groupings=None): - if not groupings: - groupings = defaultdict(set) - for group in zip(*colorings): - g = set([c.nodes[0] for c in group]) - for n in group: - g |= groupings[n] - for n in g: - groupings[n] = g - return groupings - - @_call_count("individuations") - def _traces(self, coloring, stats=None, depth=[0]): - if stats is not None and 'prunings' not in stats: - stats['prunings'] = 0 - depth[0] += 1 - candidates = self._get_candidates(coloring) - best = [] - best_score = None - best_experimental = None - best_experimental_score = None - last_coloring = None - generator = defaultdict(set) - visited = set() - for candidate, color in candidates: - if candidate in generator: - v = generator[candidate] & visited - if len(v) > 0: - visited.add(candidate) - continue - visited.add(candidate) - coloring_copy = [] - color_copy = None - for c in coloring: - c_copy = c.copy() - coloring_copy.append(c_copy) - if c == color: - color_copy = c_copy - new_color = self._individuate(color_copy, candidate) - coloring_copy.append(new_color) - refined_coloring = self._refine(coloring_copy, [new_color]) - color_score = tuple([c.key() for c in refined_coloring]) - experimental = self._experimental_path(coloring_copy) - experimental_score = set([c.key() for c in experimental]) - if last_coloring: - generator = self._create_generator( - [last_coloring, experimental], - generator) - last_coloring = experimental - if best_score is None or best_score < color_score: - best = [refined_coloring] - best_score = color_score - best_experimental = experimental - best_experimental_score = experimental_score - elif best_score > color_score: - # prune this branch. - if stats is not None: - stats['prunings'] += 1 - elif experimental_score != best_experimental_score: - best.append(refined_coloring) - else: - # prune this branch. - if stats is not None: - stats['prunings'] += 1 - discrete = [x for x in best if self._discrete(x)] - if len(discrete) == 0: - best_score = None - best_depth = None - for coloring in best: - d = [depth[0]] - new_color = self._traces(coloring, stats=stats, depth=d) - color_score = tuple([c.key() for c in refined_coloring]) - if best_score is None or color_score > best_score: - discrete = [new_color] - best_score = color_score - best_depth = d[0] - depth[0] = best_depth - return discrete[0] - - def canonical_triples(self, stats=None): - if stats is not None: - start_canonicalization = datetime.now() - if stats is not None: - start_coloring = datetime.now() - coloring = self._initial_color() - if stats is not None: - stats['triple_count'] = len(self.graph) - stats['adjacent_nodes'] = max(0, len(coloring) - 1) - coloring = self._refine(coloring, coloring[:]) - if stats is not None: - stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring) - stats['initial_color_count'] = len(coloring) - - if not self._discrete(coloring): - depth = [0] - coloring = self._traces(coloring, stats=stats, depth=depth) - if stats is not None: - stats['tree_depth'] = depth[0] - elif stats is not None: - stats['individuations'] = 0 - stats['tree_depth'] = 0 - if stats is not None: - stats['color_count'] = len(coloring) - - bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring]) - if stats is not None: - stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring) - for triple in self.graph: - result = tuple(self._canonicalize_bnodes(triple, bnode_labels)) - yield result - - def _canonicalize_bnodes(self, triple, labels): - for term in triple: - if isinstance(term, BNode): - yield BNode(value="cb%s" % labels[term]) - else: - yield term - - -def to_isomorphic(graph): - if isinstance(graph, IsomorphicGraph): - return graph - return IsomorphicGraph(store=graph.store) - - -def isomorphic(graph1, graph2): - """Compare graph for equality. - - Uses an algorithm to compute unique hashes which takes bnodes into account. - - Examples:: - - >>> g1 = Graph().parse(format='n3', data=''' - ... @prefix : <http://example.org/ns#> . - ... <http://example.org> :rel <http://example.org/a> . - ... <http://example.org> :rel <http://example.org/b> . - ... <http://example.org> :rel [ :label "A bnode." ] . - ... ''') - >>> g2 = Graph().parse(format='n3', data=''' - ... @prefix ns: <http://example.org/ns#> . - ... <http://example.org> ns:rel [ ns:label "A bnode." ] . - ... <http://example.org> ns:rel <http://example.org/b>, - ... <http://example.org/a> . - ... ''') - >>> isomorphic(g1, g2) - True - - >>> g3 = Graph().parse(format='n3', data=''' - ... @prefix : <http://example.org/ns#> . - ... <http://example.org> :rel <http://example.org/a> . - ... <http://example.org> :rel <http://example.org/b> . - ... <http://example.org> :rel <http://example.org/c> . - ... ''') - >>> isomorphic(g1, g3) - False - """ - gd1 = _TripleCanonicalizer(graph1).to_hash() - gd2 = _TripleCanonicalizer(graph2).to_hash() - return gd1 == gd2 - - - -def to_canonical_graph(g1): - """Creates a canonical, read-only graph. - - Creates a canonical, read-only graph where all bnode id:s are based on - deterministical SHA-256 checksums, correlated with the graph contents. - """ - graph = Graph() - graph += _TripleCanonicalizer(g1).canonical_triples() - return ReadOnlyGraphAggregate([graph]) - - -def graph_diff(g1, g2): - """Returns three sets of triples: "in both", "in first" and "in second".""" - # bnodes have deterministic values in canonical graphs: - cg1 = to_canonical_graph(g1) - cg2 = to_canonical_graph(g2) - in_both = cg1 * cg2 - in_first = cg1 - cg2 - in_second = cg2 - cg1 - return (in_both, in_first, in_second) - - - -_MOCK_BNODE = BNode() - - -def similar(g1, g2): - """Checks if the two graphs are "similar". - - Checks if the two graphs are "similar", by comparing sorted triples where - all bnodes have been replaced by a singular mock bnode (the - ``_MOCK_BNODE``). - - This is a much cheaper, but less reliable, alternative to the comparison - algorithm in ``isomorphic``. - """ - return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2)) - - -def _squashed_graphs_triples(g1, g2): - for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))): - yield t1, t2 - - -def _squash_graph(graph): - return (_squash_bnodes(triple) for triple in graph) - - -def _squash_bnodes(triple): - return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)