Mercurial > repos > shellac > sam_consensus_v3
comparison env/lib/python3.9/site-packages/rdflib/compare.py @ 0:4f3585e2f14b draft default tip
"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author | shellac |
---|---|
date | Mon, 22 Mar 2021 18:12:50 +0000 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:4f3585e2f14b |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 A collection of utilities for canonicalizing and inspecting graphs. | |
4 | |
5 Among other things, they solve of the problem of deterministic bnode | |
6 comparisons. | |
7 | |
8 Warning: the time to canonicalize bnodes may increase exponentially on | |
9 degenerate larger graphs. Use with care! | |
10 | |
11 Example of comparing two graphs:: | |
12 | |
13 >>> g1 = Graph().parse(format='n3', data=''' | |
14 ... @prefix : <http://example.org/ns#> . | |
15 ... <http://example.org> :rel | |
16 ... <http://example.org/same>, | |
17 ... [ :label "Same" ], | |
18 ... <http://example.org/a>, | |
19 ... [ :label "A" ] . | |
20 ... ''') | |
21 >>> g2 = Graph().parse(format='n3', data=''' | |
22 ... @prefix : <http://example.org/ns#> . | |
23 ... <http://example.org> :rel | |
24 ... <http://example.org/same>, | |
25 ... [ :label "Same" ], | |
26 ... <http://example.org/b>, | |
27 ... [ :label "B" ] . | |
28 ... ''') | |
29 >>> | |
30 >>> iso1 = to_isomorphic(g1) | |
31 >>> iso2 = to_isomorphic(g2) | |
32 | |
33 These are not isomorphic:: | |
34 | |
35 >>> iso1 == iso2 | |
36 False | |
37 | |
38 Diff the two graphs:: | |
39 | |
40 >>> in_both, in_first, in_second = graph_diff(iso1, iso2) | |
41 | |
42 Present in both:: | |
43 | |
44 >>> def dump_nt_sorted(g): | |
45 ... for l in sorted(g.serialize(format='nt').splitlines()): | |
46 ... if l: print(l.decode('ascii')) | |
47 | |
48 >>> dump_nt_sorted(in_both) #doctest: +SKIP | |
49 <http://example.org> | |
50 <http://example.org/ns#rel> <http://example.org/same> . | |
51 <http://example.org> | |
52 <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e . | |
53 _:cbcaabaaba17fecbc304a64f8edee4335e | |
54 <http://example.org/ns#label> "Same" . | |
55 | |
56 Only in first:: | |
57 | |
58 >>> dump_nt_sorted(in_first) #doctest: +SKIP | |
59 <http://example.org> | |
60 <http://example.org/ns#rel> <http://example.org/a> . | |
61 <http://example.org> | |
62 <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 . | |
63 _:cb124e4c6da0579f810c0ffe4eff485bd9 | |
64 <http://example.org/ns#label> "A" . | |
65 | |
66 Only in second:: | |
67 | |
68 >>> dump_nt_sorted(in_second) #doctest: +SKIP | |
69 <http://example.org> | |
70 <http://example.org/ns#rel> <http://example.org/b> . | |
71 <http://example.org> | |
72 <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 . | |
73 _:cb558f30e21ddfc05ca53108348338ade8 | |
74 <http://example.org/ns#label> "B" . | |
75 """ | |
76 from __future__ import absolute_import | |
77 from __future__ import division | |
78 from __future__ import print_function | |
79 | |
80 | |
81 # TODO: | |
82 # - Doesn't handle quads. | |
83 # - Add warning and/or safety mechanism before working on large graphs? | |
84 # - use this in existing Graph.isomorphic? | |
85 | |
86 __all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic', | |
87 'to_canonical_graph', 'graph_diff', 'similar'] | |
88 | |
89 from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate | |
90 from rdflib.term import BNode, Node | |
91 from hashlib import sha256 | |
92 | |
93 from datetime import datetime | |
94 from collections import defaultdict | |
95 | |
96 from six import text_type | |
97 | |
98 | |
99 def _total_seconds(td): | |
100 result = td.days * 24 * 60 * 60 | |
101 result += td.seconds | |
102 result += td.microseconds / 1000000.0 | |
103 return result | |
104 | |
105 | |
106 class _runtime(object): | |
107 def __init__(self, label): | |
108 self.label = label | |
109 | |
110 def __call__(self, f): | |
111 if self.label is None: | |
112 self.label = f.__name__ + "_runtime" | |
113 | |
114 def wrapped_f(*args, **kwargs): | |
115 start = datetime.now() | |
116 result = f(*args, **kwargs) | |
117 if 'stats' in kwargs and kwargs['stats'] is not None: | |
118 stats = kwargs['stats'] | |
119 stats[self.label] = _total_seconds(datetime.now() - start) | |
120 return result | |
121 return wrapped_f | |
122 | |
123 | |
124 class _call_count(object): | |
125 def __init__(self, label): | |
126 self.label = label | |
127 | |
128 def __call__(self, f): | |
129 if self.label is None: | |
130 self.label = f.__name__ + "_runtime" | |
131 | |
132 def wrapped_f(*args, **kwargs): | |
133 if 'stats' in kwargs and kwargs['stats'] is not None: | |
134 stats = kwargs['stats'] | |
135 if self.label not in stats: | |
136 stats[self.label] = 0 | |
137 stats[self.label] += 1 | |
138 return f(*args, **kwargs) | |
139 return wrapped_f | |
140 | |
141 | |
142 class IsomorphicGraph(ConjunctiveGraph): | |
143 """An implementation of the RGDA1 graph digest algorithm. | |
144 | |
145 An implementation of RGDA1 (publication below), | |
146 a combination of Sayers & Karp's graph digest algorithm using | |
147 sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf> | |
148 and traces <http://pallini.di.uniroma1.it>, an average case | |
149 polynomial time algorithm for graph canonicalization. | |
150 | |
151 McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web. | |
152 Rensselaer Polytechnic Institute, Troy, NY. | |
153 http://gradworks.umi.com/3727015.pdf | |
154 """ | |
155 | |
156 def __init__(self, **kwargs): | |
157 super(IsomorphicGraph, self).__init__(**kwargs) | |
158 | |
159 def __eq__(self, other): | |
160 """Graph isomorphism testing.""" | |
161 if not isinstance(other, IsomorphicGraph): | |
162 return False | |
163 elif len(self) != len(other): | |
164 return False | |
165 return self.internal_hash() == other.internal_hash() | |
166 | |
167 def __ne__(self, other): | |
168 """Negative graph isomorphism testing.""" | |
169 return not self.__eq__(other) | |
170 | |
171 def __hash__(self): | |
172 return super(IsomorphicGraph, self).__hash__() | |
173 | |
174 def graph_digest(self, stats=None): | |
175 """Synonym for IsomorphicGraph.internal_hash.""" | |
176 return self.internal_hash(stats=stats) | |
177 | |
178 def internal_hash(self, stats=None): | |
179 """ | |
180 This is defined instead of __hash__ to avoid a circular recursion | |
181 scenario with the Memory store for rdflib which requires a hash lookup | |
182 in order to return a generator of triples. | |
183 """ | |
184 return _TripleCanonicalizer(self).to_hash(stats=stats) | |
185 | |
186 | |
187 class Color: | |
188 def __init__(self, nodes, hashfunc, color=(), hash_cache=None): | |
189 if hash_cache is None: | |
190 hash_cache = {} | |
191 self._hash_cache = hash_cache | |
192 self.color = color | |
193 self.nodes = nodes | |
194 self.hashfunc = hashfunc | |
195 self._hash_color = None | |
196 | |
197 def __str__(self): | |
198 nodes, color = self.key() | |
199 return "Color %s (%s nodes)" % (color, nodes) | |
200 | |
201 def key(self): | |
202 return (len(self.nodes), self.hash_color()) | |
203 | |
204 def hash_color(self, color=None): | |
205 if color is None: | |
206 color = self.color | |
207 if color in self._hash_cache: | |
208 return self._hash_cache[color] | |
209 | |
210 def stringify(x): | |
211 if isinstance(x, Node): | |
212 return x.n3() | |
213 else: | |
214 return text_type(x) | |
215 if isinstance(color, Node): | |
216 return stringify(color) | |
217 value = 0 | |
218 for triple in color: | |
219 value += self.hashfunc(' '.join([stringify(x) for x in triple])) | |
220 val = u"%x" % value | |
221 self._hash_cache[color] = val | |
222 return val | |
223 | |
224 def distinguish(self, W, graph): | |
225 colors = {} | |
226 for n in self.nodes: | |
227 new_color = list(self.color) | |
228 for node in W.nodes: | |
229 new_color += [ | |
230 (1, p, W.hash_color()) | |
231 for s, p, o in graph.triples((n, None, node))] | |
232 new_color += [ | |
233 (W.hash_color(), p, 3) | |
234 for s, p, o in graph.triples((node, None, n))] | |
235 new_color = tuple(new_color) | |
236 new_hash_color = self.hash_color(new_color) | |
237 | |
238 if new_hash_color not in colors: | |
239 c = Color( | |
240 [], self.hashfunc, new_color, | |
241 hash_cache=self._hash_cache) | |
242 colors[new_hash_color] = c | |
243 colors[new_hash_color].nodes.append(n) | |
244 return colors.values() | |
245 | |
246 def discrete(self): | |
247 return len(self.nodes) == 1 | |
248 | |
249 def copy(self): | |
250 return Color( | |
251 self.nodes[:], self.hashfunc, self.color, | |
252 hash_cache=self._hash_cache) | |
253 | |
254 | |
255 class _TripleCanonicalizer(object): | |
256 | |
257 def __init__(self, graph, hashfunc=sha256): | |
258 self.graph = graph | |
259 | |
260 def _hashfunc(s): | |
261 h = hashfunc() | |
262 h.update(text_type(s).encode("utf8")) | |
263 return int(h.hexdigest(), 16) | |
264 self._hash_cache = {} | |
265 self.hashfunc = _hashfunc | |
266 | |
267 def _discrete(self, coloring): | |
268 return len([c for c in coloring if not c.discrete()]) == 0 | |
269 | |
270 def _initial_color(self): | |
271 """Finds an initial color for the graph. | |
272 | |
273 Finds an initial color fo the graph by finding all blank nodes and | |
274 non-blank nodes that are adjacent. Nodes that are not adjacent to blank | |
275 nodes are not included, as they are a) already colored (by URI or literal) | |
276 and b) do not factor into the color of any blank node. | |
277 """ | |
278 bnodes = set() | |
279 others = set() | |
280 self._neighbors = defaultdict(set) | |
281 for s, p, o in self.graph: | |
282 nodes = set([s, p, o]) | |
283 b = set([x for x in nodes if isinstance(x, BNode)]) | |
284 if len(b) > 0: | |
285 others |= nodes - b | |
286 bnodes |= b | |
287 if isinstance(s, BNode): | |
288 self._neighbors[s].add(o) | |
289 if isinstance(o, BNode): | |
290 self._neighbors[o].add(s) | |
291 if isinstance(p, BNode): | |
292 self._neighbors[p].add(s) | |
293 self._neighbors[p].add(p) | |
294 if len(bnodes) > 0: | |
295 return [ | |
296 Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache) | |
297 ] + [ | |
298 Color([x], self.hashfunc, x, hash_cache=self._hash_cache) | |
299 for x in others | |
300 ] | |
301 else: | |
302 return [] | |
303 | |
304 def _individuate(self, color, individual): | |
305 new_color = list(color.color) | |
306 new_color.append((len(color.nodes),)) | |
307 | |
308 color.nodes.remove(individual) | |
309 c = Color([individual], self.hashfunc, tuple(new_color), | |
310 hash_cache=self._hash_cache) | |
311 return c | |
312 | |
313 def _get_candidates(self, coloring): | |
314 candidates = [c for c in coloring if not c.discrete()] | |
315 for c in [c for c in coloring if not c.discrete()]: | |
316 for node in c.nodes: | |
317 yield node, c | |
318 | |
319 def _refine(self, coloring, sequence): | |
320 sequence = sorted(sequence, key=lambda x: x.key(), reverse=True) | |
321 coloring = coloring[:] | |
322 while len(sequence) > 0 and not self._discrete(coloring): | |
323 W = sequence.pop() | |
324 for c in coloring[:]: | |
325 if len(c.nodes) > 1 or isinstance(c.nodes[0], BNode): | |
326 colors = sorted(c.distinguish(W, self.graph), | |
327 key=lambda x: x.key(), | |
328 reverse=True) | |
329 coloring.remove(c) | |
330 coloring.extend(colors) | |
331 try: | |
332 si = sequence.index(c) | |
333 sequence = sequence[:si] + colors + sequence[si+1:] | |
334 except ValueError: | |
335 sequence = colors[1:] + sequence | |
336 combined_colors = [] | |
337 combined_color_map = dict() | |
338 for color in coloring: | |
339 color_hash = color.hash_color() | |
340 # This is a hash collision, and be combined into a single color for individuation. | |
341 if color_hash in combined_color_map: | |
342 combined_color_map[color_hash].nodes.extend(color.nodes) | |
343 else: | |
344 combined_colors.append(color) | |
345 combined_color_map[color_hash] = color | |
346 return combined_colors | |
347 | |
348 @_runtime("to_hash_runtime") | |
349 def to_hash(self, stats=None): | |
350 result = 0 | |
351 for triple in self.canonical_triples(stats=stats): | |
352 result += self.hashfunc(' '.join([x.n3() for x in triple])) | |
353 if stats is not None: | |
354 stats['graph_digest'] = "%x" % result | |
355 return result | |
356 | |
357 def _experimental_path(self, coloring): | |
358 coloring = [c.copy() for c in coloring] | |
359 while not self._discrete(coloring): | |
360 color = [x for x in coloring if not x.discrete()][0] | |
361 node = color.nodes[0] | |
362 new_color = self._individuate(color, node) | |
363 coloring.append(new_color) | |
364 coloring = self._refine(coloring, [new_color]) | |
365 return coloring | |
366 | |
367 def _create_generator(self, colorings, groupings=None): | |
368 if not groupings: | |
369 groupings = defaultdict(set) | |
370 for group in zip(*colorings): | |
371 g = set([c.nodes[0] for c in group]) | |
372 for n in group: | |
373 g |= groupings[n] | |
374 for n in g: | |
375 groupings[n] = g | |
376 return groupings | |
377 | |
378 @_call_count("individuations") | |
379 def _traces(self, coloring, stats=None, depth=[0]): | |
380 if stats is not None and 'prunings' not in stats: | |
381 stats['prunings'] = 0 | |
382 depth[0] += 1 | |
383 candidates = self._get_candidates(coloring) | |
384 best = [] | |
385 best_score = None | |
386 best_experimental = None | |
387 best_experimental_score = None | |
388 last_coloring = None | |
389 generator = defaultdict(set) | |
390 visited = set() | |
391 for candidate, color in candidates: | |
392 if candidate in generator: | |
393 v = generator[candidate] & visited | |
394 if len(v) > 0: | |
395 visited.add(candidate) | |
396 continue | |
397 visited.add(candidate) | |
398 coloring_copy = [] | |
399 color_copy = None | |
400 for c in coloring: | |
401 c_copy = c.copy() | |
402 coloring_copy.append(c_copy) | |
403 if c == color: | |
404 color_copy = c_copy | |
405 new_color = self._individuate(color_copy, candidate) | |
406 coloring_copy.append(new_color) | |
407 refined_coloring = self._refine(coloring_copy, [new_color]) | |
408 color_score = tuple([c.key() for c in refined_coloring]) | |
409 experimental = self._experimental_path(coloring_copy) | |
410 experimental_score = set([c.key() for c in experimental]) | |
411 if last_coloring: | |
412 generator = self._create_generator( | |
413 [last_coloring, experimental], | |
414 generator) | |
415 last_coloring = experimental | |
416 if best_score is None or best_score < color_score: | |
417 best = [refined_coloring] | |
418 best_score = color_score | |
419 best_experimental = experimental | |
420 best_experimental_score = experimental_score | |
421 elif best_score > color_score: | |
422 # prune this branch. | |
423 if stats is not None: | |
424 stats['prunings'] += 1 | |
425 elif experimental_score != best_experimental_score: | |
426 best.append(refined_coloring) | |
427 else: | |
428 # prune this branch. | |
429 if stats is not None: | |
430 stats['prunings'] += 1 | |
431 discrete = [x for x in best if self._discrete(x)] | |
432 if len(discrete) == 0: | |
433 best_score = None | |
434 best_depth = None | |
435 for coloring in best: | |
436 d = [depth[0]] | |
437 new_color = self._traces(coloring, stats=stats, depth=d) | |
438 color_score = tuple([c.key() for c in refined_coloring]) | |
439 if best_score is None or color_score > best_score: | |
440 discrete = [new_color] | |
441 best_score = color_score | |
442 best_depth = d[0] | |
443 depth[0] = best_depth | |
444 return discrete[0] | |
445 | |
446 def canonical_triples(self, stats=None): | |
447 if stats is not None: | |
448 start_canonicalization = datetime.now() | |
449 if stats is not None: | |
450 start_coloring = datetime.now() | |
451 coloring = self._initial_color() | |
452 if stats is not None: | |
453 stats['triple_count'] = len(self.graph) | |
454 stats['adjacent_nodes'] = max(0, len(coloring) - 1) | |
455 coloring = self._refine(coloring, coloring[:]) | |
456 if stats is not None: | |
457 stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring) | |
458 stats['initial_color_count'] = len(coloring) | |
459 | |
460 if not self._discrete(coloring): | |
461 depth = [0] | |
462 coloring = self._traces(coloring, stats=stats, depth=depth) | |
463 if stats is not None: | |
464 stats['tree_depth'] = depth[0] | |
465 elif stats is not None: | |
466 stats['individuations'] = 0 | |
467 stats['tree_depth'] = 0 | |
468 if stats is not None: | |
469 stats['color_count'] = len(coloring) | |
470 | |
471 bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring]) | |
472 if stats is not None: | |
473 stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring) | |
474 for triple in self.graph: | |
475 result = tuple(self._canonicalize_bnodes(triple, bnode_labels)) | |
476 yield result | |
477 | |
478 def _canonicalize_bnodes(self, triple, labels): | |
479 for term in triple: | |
480 if isinstance(term, BNode): | |
481 yield BNode(value="cb%s" % labels[term]) | |
482 else: | |
483 yield term | |
484 | |
485 | |
486 def to_isomorphic(graph): | |
487 if isinstance(graph, IsomorphicGraph): | |
488 return graph | |
489 result = IsomorphicGraph() | |
490 if hasattr(graph, "identifier"): | |
491 result = IsomorphicGraph(identifier=graph.identifier) | |
492 result += graph | |
493 return result | |
494 | |
495 | |
496 def isomorphic(graph1, graph2): | |
497 """Compare graph for equality. | |
498 | |
499 Uses an algorithm to compute unique hashes which takes bnodes into account. | |
500 | |
501 Examples:: | |
502 | |
503 >>> g1 = Graph().parse(format='n3', data=''' | |
504 ... @prefix : <http://example.org/ns#> . | |
505 ... <http://example.org> :rel <http://example.org/a> . | |
506 ... <http://example.org> :rel <http://example.org/b> . | |
507 ... <http://example.org> :rel [ :label "A bnode." ] . | |
508 ... ''') | |
509 >>> g2 = Graph().parse(format='n3', data=''' | |
510 ... @prefix ns: <http://example.org/ns#> . | |
511 ... <http://example.org> ns:rel [ ns:label "A bnode." ] . | |
512 ... <http://example.org> ns:rel <http://example.org/b>, | |
513 ... <http://example.org/a> . | |
514 ... ''') | |
515 >>> isomorphic(g1, g2) | |
516 True | |
517 | |
518 >>> g3 = Graph().parse(format='n3', data=''' | |
519 ... @prefix : <http://example.org/ns#> . | |
520 ... <http://example.org> :rel <http://example.org/a> . | |
521 ... <http://example.org> :rel <http://example.org/b> . | |
522 ... <http://example.org> :rel <http://example.org/c> . | |
523 ... ''') | |
524 >>> isomorphic(g1, g3) | |
525 False | |
526 """ | |
527 gd1 = _TripleCanonicalizer(graph1).to_hash() | |
528 gd2 = _TripleCanonicalizer(graph2).to_hash() | |
529 return gd1 == gd2 | |
530 | |
531 | |
532 def to_canonical_graph(g1, stats=None): | |
533 """Creates a canonical, read-only graph. | |
534 | |
535 Creates a canonical, read-only graph where all bnode id:s are based on | |
536 deterministical SHA-256 checksums, correlated with the graph contents. | |
537 """ | |
538 graph = Graph() | |
539 graph += _TripleCanonicalizer(g1).canonical_triples(stats=stats) | |
540 return ReadOnlyGraphAggregate([graph]) | |
541 | |
542 | |
543 def graph_diff(g1, g2): | |
544 """Returns three sets of triples: "in both", "in first" and "in second".""" | |
545 # bnodes have deterministic values in canonical graphs: | |
546 cg1 = to_canonical_graph(g1) | |
547 cg2 = to_canonical_graph(g2) | |
548 in_both = cg1 * cg2 | |
549 in_first = cg1 - cg2 | |
550 in_second = cg2 - cg1 | |
551 return (in_both, in_first, in_second) | |
552 | |
553 | |
554 _MOCK_BNODE = BNode() | |
555 | |
556 | |
557 def similar(g1, g2): | |
558 """Checks if the two graphs are "similar". | |
559 | |
560 Checks if the two graphs are "similar", by comparing sorted triples where | |
561 all bnodes have been replaced by a singular mock bnode (the | |
562 ``_MOCK_BNODE``). | |
563 | |
564 This is a much cheaper, but less reliable, alternative to the comparison | |
565 algorithm in ``isomorphic``. | |
566 """ | |
567 return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2)) | |
568 | |
569 | |
570 def _squashed_graphs_triples(g1, g2): | |
571 for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))): | |
572 yield t1, t2 | |
573 | |
574 | |
575 def _squash_graph(graph): | |
576 return (_squash_bnodes(triple) for triple in graph) | |
577 | |
578 | |
579 def _squash_bnodes(triple): | |
580 return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple) |