comparison env/lib/python3.7/site-packages/rdflib/compare.py @ 0:26e78fe6e8c4 draft

"planemo upload commit c699937486c35866861690329de38ec1a5d9f783"
author shellac
date Sat, 02 May 2020 07:14:21 -0400
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:26e78fe6e8c4
1 # -*- coding: utf-8 -*-
2 """
3 A collection of utilities for canonicalizing and inspecting graphs.
4
5 Among other things, they solve of the problem of deterministic bnode
6 comparisons.
7
8 Warning: the time to canonicalize bnodes may increase exponentially on
9 degenerate larger graphs. Use with care!
10
11 Example of comparing two graphs::
12
13 >>> g1 = Graph().parse(format='n3', data='''
14 ... @prefix : <http://example.org/ns#> .
15 ... <http://example.org> :rel
16 ... <http://example.org/same>,
17 ... [ :label "Same" ],
18 ... <http://example.org/a>,
19 ... [ :label "A" ] .
20 ... ''')
21 >>> g2 = Graph().parse(format='n3', data='''
22 ... @prefix : <http://example.org/ns#> .
23 ... <http://example.org> :rel
24 ... <http://example.org/same>,
25 ... [ :label "Same" ],
26 ... <http://example.org/b>,
27 ... [ :label "B" ] .
28 ... ''')
29 >>>
30 >>> iso1 = to_isomorphic(g1)
31 >>> iso2 = to_isomorphic(g2)
32
33 These are not isomorphic::
34
35 >>> iso1 == iso2
36 False
37
38 Diff the two graphs::
39
40 >>> in_both, in_first, in_second = graph_diff(iso1, iso2)
41
42 Present in both::
43
44 >>> def dump_nt_sorted(g):
45 ... for l in sorted(g.serialize(format='nt').splitlines()):
46 ... if l: print(l.decode('ascii'))
47
48 >>> dump_nt_sorted(in_both) #doctest: +SKIP
49 <http://example.org>
50 <http://example.org/ns#rel> <http://example.org/same> .
51 <http://example.org>
52 <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e .
53 _:cbcaabaaba17fecbc304a64f8edee4335e
54 <http://example.org/ns#label> "Same" .
55
56 Only in first::
57
58 >>> dump_nt_sorted(in_first) #doctest: +SKIP
59 <http://example.org>
60 <http://example.org/ns#rel> <http://example.org/a> .
61 <http://example.org>
62 <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 .
63 _:cb124e4c6da0579f810c0ffe4eff485bd9
64 <http://example.org/ns#label> "A" .
65
66 Only in second::
67
68 >>> dump_nt_sorted(in_second) #doctest: +SKIP
69 <http://example.org>
70 <http://example.org/ns#rel> <http://example.org/b> .
71 <http://example.org>
72 <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 .
73 _:cb558f30e21ddfc05ca53108348338ade8
74 <http://example.org/ns#label> "B" .
75 """
76
77
78 # TODO:
79 # - Doesn't handle quads.
80 # - Add warning and/or safety mechanism before working on large graphs?
81 # - use this in existing Graph.isomorphic?
82
83 __all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic',
84 'to_canonical_graph', 'graph_diff', 'similar']
85
86 from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate
87 from rdflib.term import BNode, Node
88 try:
89 import hashlib
90 sha256 = hashlib.sha256
91 except ImportError:
92 # for Python << 2.5
93 import sha256
94 sha256 = sha256.new
95
96 from datetime import datetime
97 from collections import defaultdict
98
99
100 def _total_seconds(td):
101 result = td.days * 24 * 60 * 60
102 result += td.seconds
103 result += td.microseconds / 1000000.0
104 return result
105
106
107 class _runtime(object):
108 def __init__(self, label):
109 self.label = label
110
111 def __call__(self, f):
112 if self.label is None:
113 self.label = f.__name__ + "_runtime"
114
115 def wrapped_f(*args, **kwargs):
116 start = datetime.now()
117 result = f(*args, **kwargs)
118 if 'stats' in kwargs and kwargs['stats'] is not None:
119 stats = kwargs['stats']
120 stats[self.label] = _total_seconds(datetime.now() - start)
121 return result
122 return wrapped_f
123
124
125 class _call_count(object):
126 def __init__(self, label):
127 self.label = label
128
129 def __call__(self, f):
130 if self.label is None:
131 self.label = f.__name__ + "_runtime"
132
133 def wrapped_f(*args, **kwargs):
134 if 'stats' in kwargs and kwargs['stats'] is not None:
135 stats = kwargs['stats']
136 if self.label not in stats:
137 stats[self.label] = 0
138 stats[self.label] += 1
139 return f(*args, **kwargs)
140 return wrapped_f
141
142
143 class IsomorphicGraph(ConjunctiveGraph):
144 """An implementation of the RGDA1 graph digest algorithm.
145
146 An implementation of RGDA1 (publication below),
147 a combination of Sayers & Karp's graph digest algorithm using
148 sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf>
149 and traces <http://pallini.di.uniroma1.it>, an average case
150 polynomial time algorithm for graph canonicalization.
151
152 McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web.
153 Rensselaer Polytechnic Institute, Troy, NY.
154 http://gradworks.umi.com/3727015.pdf
155 """
156
157 def __init__(self, **kwargs):
158 super(IsomorphicGraph, self).__init__(**kwargs)
159
160 def __eq__(self, other):
161 """Graph isomorphism testing."""
162 if not isinstance(other, IsomorphicGraph):
163 return False
164 elif len(self) != len(other):
165 return False
166 return self.internal_hash() == other.internal_hash()
167
168 def __ne__(self, other):
169 """Negative graph isomorphism testing."""
170 return not self.__eq__(other)
171
172 def graph_digest(self, stats=None):
173 """Synonym for IsomorphicGraph.internal_hash."""
174 return self.internal_hash(stats=stats)
175
176 def internal_hash(self, stats=None):
177 """
178 This is defined instead of __hash__ to avoid a circular recursion
179 scenario with the Memory store for rdflib which requires a hash lookup
180 in order to return a generator of triples.
181 """
182 return _TripleCanonicalizer(self).to_hash(stats=stats)
183
184
185 class Color:
186 def __init__(self, nodes, hashfunc, color=(), hash_cache=None):
187 if hash_cache is None:
188 hash_cache = {}
189 self._hash_cache = hash_cache
190 self.color = color
191 self.nodes = nodes
192 self.hashfunc = hashfunc
193 self._hash_color = None
194
195 def key(self):
196 return (len(self.nodes), self.hash_color())
197
198 def hash_color(self, color=None):
199 if color is None:
200 color = self.color
201 if color in self._hash_cache:
202 return self._hash_cache[color]
203
204 def stringify(x):
205 if isinstance(x, Node):
206 return x.n3()
207 else:
208 return str(x)
209 if isinstance(color, Node):
210 return stringify(color)
211 value = 0
212 for triple in color:
213 value += self.hashfunc(' '.join([stringify(x) for x in triple]))
214 val = "%x" % value
215 self._hash_cache[color] = val
216 return val
217
218 def distinguish(self, W, graph):
219 colors = {}
220 for n in self.nodes:
221 new_color = list(self.color)
222 for node in W.nodes:
223 new_color += [
224 (1, p, W.hash_color())
225 for s, p, o in graph.triples((n, None, node))]
226 new_color += [
227 (W.hash_color(), p, 3)
228 for s, p, o in graph.triples((node, None, n))]
229 new_color = tuple(new_color)
230 new_hash_color = self.hash_color(new_color)
231
232 if new_hash_color not in colors:
233 c = Color(
234 [], self.hashfunc, new_color,
235 hash_cache=self._hash_cache)
236 colors[new_hash_color] = c
237 colors[new_hash_color].nodes.append(n)
238 return list(colors.values())
239
240 def discrete(self):
241 return len(self.nodes) == 1
242
243 def copy(self):
244 return Color(
245 self.nodes[:], self.hashfunc, self.color,
246 hash_cache=self._hash_cache)
247
248
249 class _TripleCanonicalizer(object):
250
251 def __init__(self, graph, hashfunc=sha256):
252 self.graph = graph
253
254 def _hashfunc(s):
255 h = hashfunc()
256 h.update(str(s).encode("utf8"))
257 return int(h.hexdigest(), 16)
258 self._hash_cache = {}
259 self.hashfunc = _hashfunc
260
261
262 def _discrete(self, coloring):
263 return len([c for c in coloring if not c.discrete()]) == 0
264
265
266 def _initial_color(self):
267 """Finds an initial color for the graph.
268
269 Finds an initial color fo the graph by finding all blank nodes and
270 non-blank nodes that are adjacent. Nodes that are not adjacent to blank
271 nodes are not included, as they are a) already colored (by URI or literal)
272 and b) do not factor into the color of any blank node.
273 """
274 bnodes = set()
275 others = set()
276 self._neighbors = defaultdict(set)
277 for s, p, o in self.graph:
278 nodes = set([s, o])
279 b = set([x for x in nodes if isinstance(x, BNode)])
280 if len(b) > 0:
281 others |= nodes - b
282 bnodes |= b
283 if isinstance(s, BNode):
284 self._neighbors[s].add(o)
285 if isinstance(o, BNode):
286 self._neighbors[o].add(s)
287 if len(bnodes) > 0:
288 return [
289 Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache)
290 ] + [
291 Color([x], self.hashfunc, x, hash_cache=self._hash_cache)
292 for x in others
293 ]
294 else:
295 return []
296
297 def _individuate(self, color, individual):
298 new_color = list(color.color)
299 new_color.append((len(color.nodes),))
300
301 color.nodes.remove(individual)
302 c = Color([individual], self.hashfunc, tuple(new_color),
303 hash_cache=self._hash_cache)
304 return c
305
306 def _get_candidates(self, coloring):
307 candidates = [c for c in coloring if not c.discrete()]
308 for c in [c for c in coloring if not c.discrete()]:
309 for node in c.nodes:
310 yield node, c
311
312 def _refine(self, coloring, sequence):
313 sequence = sorted(sequence, key=lambda x: x.key(), reverse=True)
314 coloring = coloring[:]
315 while len(sequence) > 0 and not self._discrete(coloring):
316 W = sequence.pop()
317 for c in coloring[:]:
318 if len(c.nodes) > 1:
319 colors = sorted(c.distinguish(W, self.graph),
320 key=lambda x: x.key(),
321 reverse=True)
322 coloring.remove(c)
323 coloring.extend(colors)
324 try:
325 si = sequence.index(c)
326 sequence = sequence[:si] + colors + sequence[si+1:]
327 except ValueError:
328 sequence = colors[1:] + sequence
329
330 return coloring
331
332 @_runtime("to_hash_runtime")
333 def to_hash(self, stats=None):
334 result = 0
335 for triple in self.canonical_triples(stats=stats):
336 result += self.hashfunc(' '.join([x.n3() for x in triple]))
337 if stats is not None:
338 stats['graph_digest'] = "%x" % result
339 return result
340
341 def _experimental_path(self, coloring):
342 coloring = [c.copy() for c in coloring]
343 while not self._discrete(coloring):
344 color = [x for x in coloring if not x.discrete()][0]
345 node = color.nodes[0]
346 new_color = self._individuate(color, node)
347 coloring.append(new_color)
348 coloring = self._refine(coloring, [new_color])
349 return coloring
350
351 def _create_generator(self, colorings, groupings=None):
352 if not groupings:
353 groupings = defaultdict(set)
354 for group in zip(*colorings):
355 g = set([c.nodes[0] for c in group])
356 for n in group:
357 g |= groupings[n]
358 for n in g:
359 groupings[n] = g
360 return groupings
361
362 @_call_count("individuations")
363 def _traces(self, coloring, stats=None, depth=[0]):
364 if stats is not None and 'prunings' not in stats:
365 stats['prunings'] = 0
366 depth[0] += 1
367 candidates = self._get_candidates(coloring)
368 best = []
369 best_score = None
370 best_experimental = None
371 best_experimental_score = None
372 last_coloring = None
373 generator = defaultdict(set)
374 visited = set()
375 for candidate, color in candidates:
376 if candidate in generator:
377 v = generator[candidate] & visited
378 if len(v) > 0:
379 visited.add(candidate)
380 continue
381 visited.add(candidate)
382 coloring_copy = []
383 color_copy = None
384 for c in coloring:
385 c_copy = c.copy()
386 coloring_copy.append(c_copy)
387 if c == color:
388 color_copy = c_copy
389 new_color = self._individuate(color_copy, candidate)
390 coloring_copy.append(new_color)
391 refined_coloring = self._refine(coloring_copy, [new_color])
392 color_score = tuple([c.key() for c in refined_coloring])
393 experimental = self._experimental_path(coloring_copy)
394 experimental_score = set([c.key() for c in experimental])
395 if last_coloring:
396 generator = self._create_generator(
397 [last_coloring, experimental],
398 generator)
399 last_coloring = experimental
400 if best_score is None or best_score < color_score:
401 best = [refined_coloring]
402 best_score = color_score
403 best_experimental = experimental
404 best_experimental_score = experimental_score
405 elif best_score > color_score:
406 # prune this branch.
407 if stats is not None:
408 stats['prunings'] += 1
409 elif experimental_score != best_experimental_score:
410 best.append(refined_coloring)
411 else:
412 # prune this branch.
413 if stats is not None:
414 stats['prunings'] += 1
415 discrete = [x for x in best if self._discrete(x)]
416 if len(discrete) == 0:
417 best_score = None
418 best_depth = None
419 for coloring in best:
420 d = [depth[0]]
421 new_color = self._traces(coloring, stats=stats, depth=d)
422 color_score = tuple([c.key() for c in refined_coloring])
423 if best_score is None or color_score > best_score:
424 discrete = [new_color]
425 best_score = color_score
426 best_depth = d[0]
427 depth[0] = best_depth
428 return discrete[0]
429
430 def canonical_triples(self, stats=None):
431 if stats is not None:
432 start_canonicalization = datetime.now()
433 if stats is not None:
434 start_coloring = datetime.now()
435 coloring = self._initial_color()
436 if stats is not None:
437 stats['triple_count'] = len(self.graph)
438 stats['adjacent_nodes'] = max(0, len(coloring) - 1)
439 coloring = self._refine(coloring, coloring[:])
440 if stats is not None:
441 stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring)
442 stats['initial_color_count'] = len(coloring)
443
444 if not self._discrete(coloring):
445 depth = [0]
446 coloring = self._traces(coloring, stats=stats, depth=depth)
447 if stats is not None:
448 stats['tree_depth'] = depth[0]
449 elif stats is not None:
450 stats['individuations'] = 0
451 stats['tree_depth'] = 0
452 if stats is not None:
453 stats['color_count'] = len(coloring)
454
455 bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring])
456 if stats is not None:
457 stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring)
458 for triple in self.graph:
459 result = tuple(self._canonicalize_bnodes(triple, bnode_labels))
460 yield result
461
462 def _canonicalize_bnodes(self, triple, labels):
463 for term in triple:
464 if isinstance(term, BNode):
465 yield BNode(value="cb%s" % labels[term])
466 else:
467 yield term
468
469
470 def to_isomorphic(graph):
471 if isinstance(graph, IsomorphicGraph):
472 return graph
473 return IsomorphicGraph(store=graph.store)
474
475
476 def isomorphic(graph1, graph2):
477 """Compare graph for equality.
478
479 Uses an algorithm to compute unique hashes which takes bnodes into account.
480
481 Examples::
482
483 >>> g1 = Graph().parse(format='n3', data='''
484 ... @prefix : <http://example.org/ns#> .
485 ... <http://example.org> :rel <http://example.org/a> .
486 ... <http://example.org> :rel <http://example.org/b> .
487 ... <http://example.org> :rel [ :label "A bnode." ] .
488 ... ''')
489 >>> g2 = Graph().parse(format='n3', data='''
490 ... @prefix ns: <http://example.org/ns#> .
491 ... <http://example.org> ns:rel [ ns:label "A bnode." ] .
492 ... <http://example.org> ns:rel <http://example.org/b>,
493 ... <http://example.org/a> .
494 ... ''')
495 >>> isomorphic(g1, g2)
496 True
497
498 >>> g3 = Graph().parse(format='n3', data='''
499 ... @prefix : <http://example.org/ns#> .
500 ... <http://example.org> :rel <http://example.org/a> .
501 ... <http://example.org> :rel <http://example.org/b> .
502 ... <http://example.org> :rel <http://example.org/c> .
503 ... ''')
504 >>> isomorphic(g1, g3)
505 False
506 """
507 gd1 = _TripleCanonicalizer(graph1).to_hash()
508 gd2 = _TripleCanonicalizer(graph2).to_hash()
509 return gd1 == gd2
510
511
512
513 def to_canonical_graph(g1):
514 """Creates a canonical, read-only graph.
515
516 Creates a canonical, read-only graph where all bnode id:s are based on
517 deterministical SHA-256 checksums, correlated with the graph contents.
518 """
519 graph = Graph()
520 graph += _TripleCanonicalizer(g1).canonical_triples()
521 return ReadOnlyGraphAggregate([graph])
522
523
524 def graph_diff(g1, g2):
525 """Returns three sets of triples: "in both", "in first" and "in second"."""
526 # bnodes have deterministic values in canonical graphs:
527 cg1 = to_canonical_graph(g1)
528 cg2 = to_canonical_graph(g2)
529 in_both = cg1 * cg2
530 in_first = cg1 - cg2
531 in_second = cg2 - cg1
532 return (in_both, in_first, in_second)
533
534
535
536 _MOCK_BNODE = BNode()
537
538
539 def similar(g1, g2):
540 """Checks if the two graphs are "similar".
541
542 Checks if the two graphs are "similar", by comparing sorted triples where
543 all bnodes have been replaced by a singular mock bnode (the
544 ``_MOCK_BNODE``).
545
546 This is a much cheaper, but less reliable, alternative to the comparison
547 algorithm in ``isomorphic``.
548 """
549 return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2))
550
551
552 def _squashed_graphs_triples(g1, g2):
553 for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))):
554 yield t1, t2
555
556
557 def _squash_graph(graph):
558 return (_squash_bnodes(triple) for triple in graph)
559
560
561 def _squash_bnodes(triple):
562 return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple)