Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/rdflib/compare.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 """ | |
3 A collection of utilities for canonicalizing and inspecting graphs. | |
4 | |
5 Among other things, they solve of the problem of deterministic bnode | |
6 comparisons. | |
7 | |
8 Warning: the time to canonicalize bnodes may increase exponentially on | |
9 degenerate larger graphs. Use with care! | |
10 | |
11 Example of comparing two graphs:: | |
12 | |
13 >>> g1 = Graph().parse(format='n3', data=''' | |
14 ... @prefix : <http://example.org/ns#> . | |
15 ... <http://example.org> :rel | |
16 ... <http://example.org/same>, | |
17 ... [ :label "Same" ], | |
18 ... <http://example.org/a>, | |
19 ... [ :label "A" ] . | |
20 ... ''') | |
21 >>> g2 = Graph().parse(format='n3', data=''' | |
22 ... @prefix : <http://example.org/ns#> . | |
23 ... <http://example.org> :rel | |
24 ... <http://example.org/same>, | |
25 ... [ :label "Same" ], | |
26 ... <http://example.org/b>, | |
27 ... [ :label "B" ] . | |
28 ... ''') | |
29 >>> | |
30 >>> iso1 = to_isomorphic(g1) | |
31 >>> iso2 = to_isomorphic(g2) | |
32 | |
33 These are not isomorphic:: | |
34 | |
35 >>> iso1 == iso2 | |
36 False | |
37 | |
38 Diff the two graphs:: | |
39 | |
40 >>> in_both, in_first, in_second = graph_diff(iso1, iso2) | |
41 | |
42 Present in both:: | |
43 | |
44 >>> def dump_nt_sorted(g): | |
45 ... for l in sorted(g.serialize(format='nt').splitlines()): | |
46 ... if l: print(l.decode('ascii')) | |
47 | |
48 >>> dump_nt_sorted(in_both) #doctest: +SKIP | |
49 <http://example.org> | |
50 <http://example.org/ns#rel> <http://example.org/same> . | |
51 <http://example.org> | |
52 <http://example.org/ns#rel> _:cbcaabaaba17fecbc304a64f8edee4335e . | |
53 _:cbcaabaaba17fecbc304a64f8edee4335e | |
54 <http://example.org/ns#label> "Same" . | |
55 | |
56 Only in first:: | |
57 | |
58 >>> dump_nt_sorted(in_first) #doctest: +SKIP | |
59 <http://example.org> | |
60 <http://example.org/ns#rel> <http://example.org/a> . | |
61 <http://example.org> | |
62 <http://example.org/ns#rel> _:cb124e4c6da0579f810c0ffe4eff485bd9 . | |
63 _:cb124e4c6da0579f810c0ffe4eff485bd9 | |
64 <http://example.org/ns#label> "A" . | |
65 | |
66 Only in second:: | |
67 | |
68 >>> dump_nt_sorted(in_second) #doctest: +SKIP | |
69 <http://example.org> | |
70 <http://example.org/ns#rel> <http://example.org/b> . | |
71 <http://example.org> | |
72 <http://example.org/ns#rel> _:cb558f30e21ddfc05ca53108348338ade8 . | |
73 _:cb558f30e21ddfc05ca53108348338ade8 | |
74 <http://example.org/ns#label> "B" . | |
75 """ | |
76 | |
77 | |
78 # TODO: | |
79 # - Doesn't handle quads. | |
80 # - Add warning and/or safety mechanism before working on large graphs? | |
81 # - use this in existing Graph.isomorphic? | |
82 | |
83 __all__ = ['IsomorphicGraph', 'to_isomorphic', 'isomorphic', | |
84 'to_canonical_graph', 'graph_diff', 'similar'] | |
85 | |
86 from rdflib.graph import Graph, ConjunctiveGraph, ReadOnlyGraphAggregate | |
87 from rdflib.term import BNode, Node | |
88 try: | |
89 import hashlib | |
90 sha256 = hashlib.sha256 | |
91 except ImportError: | |
92 # for Python << 2.5 | |
93 import sha256 | |
94 sha256 = sha256.new | |
95 | |
96 from datetime import datetime | |
97 from collections import defaultdict | |
98 | |
99 | |
100 def _total_seconds(td): | |
101 result = td.days * 24 * 60 * 60 | |
102 result += td.seconds | |
103 result += td.microseconds / 1000000.0 | |
104 return result | |
105 | |
106 | |
107 class _runtime(object): | |
108 def __init__(self, label): | |
109 self.label = label | |
110 | |
111 def __call__(self, f): | |
112 if self.label is None: | |
113 self.label = f.__name__ + "_runtime" | |
114 | |
115 def wrapped_f(*args, **kwargs): | |
116 start = datetime.now() | |
117 result = f(*args, **kwargs) | |
118 if 'stats' in kwargs and kwargs['stats'] is not None: | |
119 stats = kwargs['stats'] | |
120 stats[self.label] = _total_seconds(datetime.now() - start) | |
121 return result | |
122 return wrapped_f | |
123 | |
124 | |
125 class _call_count(object): | |
126 def __init__(self, label): | |
127 self.label = label | |
128 | |
129 def __call__(self, f): | |
130 if self.label is None: | |
131 self.label = f.__name__ + "_runtime" | |
132 | |
133 def wrapped_f(*args, **kwargs): | |
134 if 'stats' in kwargs and kwargs['stats'] is not None: | |
135 stats = kwargs['stats'] | |
136 if self.label not in stats: | |
137 stats[self.label] = 0 | |
138 stats[self.label] += 1 | |
139 return f(*args, **kwargs) | |
140 return wrapped_f | |
141 | |
142 | |
143 class IsomorphicGraph(ConjunctiveGraph): | |
144 """An implementation of the RGDA1 graph digest algorithm. | |
145 | |
146 An implementation of RGDA1 (publication below), | |
147 a combination of Sayers & Karp's graph digest algorithm using | |
148 sum and SHA-256 <http://www.hpl.hp.com/techreports/2003/HPL-2003-235R1.pdf> | |
149 and traces <http://pallini.di.uniroma1.it>, an average case | |
150 polynomial time algorithm for graph canonicalization. | |
151 | |
152 McCusker, J. P. (2015). WebSig: A Digital Signature Framework for the Web. | |
153 Rensselaer Polytechnic Institute, Troy, NY. | |
154 http://gradworks.umi.com/3727015.pdf | |
155 """ | |
156 | |
157 def __init__(self, **kwargs): | |
158 super(IsomorphicGraph, self).__init__(**kwargs) | |
159 | |
160 def __eq__(self, other): | |
161 """Graph isomorphism testing.""" | |
162 if not isinstance(other, IsomorphicGraph): | |
163 return False | |
164 elif len(self) != len(other): | |
165 return False | |
166 return self.internal_hash() == other.internal_hash() | |
167 | |
168 def __ne__(self, other): | |
169 """Negative graph isomorphism testing.""" | |
170 return not self.__eq__(other) | |
171 | |
172 def graph_digest(self, stats=None): | |
173 """Synonym for IsomorphicGraph.internal_hash.""" | |
174 return self.internal_hash(stats=stats) | |
175 | |
176 def internal_hash(self, stats=None): | |
177 """ | |
178 This is defined instead of __hash__ to avoid a circular recursion | |
179 scenario with the Memory store for rdflib which requires a hash lookup | |
180 in order to return a generator of triples. | |
181 """ | |
182 return _TripleCanonicalizer(self).to_hash(stats=stats) | |
183 | |
184 | |
185 class Color: | |
186 def __init__(self, nodes, hashfunc, color=(), hash_cache=None): | |
187 if hash_cache is None: | |
188 hash_cache = {} | |
189 self._hash_cache = hash_cache | |
190 self.color = color | |
191 self.nodes = nodes | |
192 self.hashfunc = hashfunc | |
193 self._hash_color = None | |
194 | |
195 def key(self): | |
196 return (len(self.nodes), self.hash_color()) | |
197 | |
198 def hash_color(self, color=None): | |
199 if color is None: | |
200 color = self.color | |
201 if color in self._hash_cache: | |
202 return self._hash_cache[color] | |
203 | |
204 def stringify(x): | |
205 if isinstance(x, Node): | |
206 return x.n3() | |
207 else: | |
208 return str(x) | |
209 if isinstance(color, Node): | |
210 return stringify(color) | |
211 value = 0 | |
212 for triple in color: | |
213 value += self.hashfunc(' '.join([stringify(x) for x in triple])) | |
214 val = "%x" % value | |
215 self._hash_cache[color] = val | |
216 return val | |
217 | |
218 def distinguish(self, W, graph): | |
219 colors = {} | |
220 for n in self.nodes: | |
221 new_color = list(self.color) | |
222 for node in W.nodes: | |
223 new_color += [ | |
224 (1, p, W.hash_color()) | |
225 for s, p, o in graph.triples((n, None, node))] | |
226 new_color += [ | |
227 (W.hash_color(), p, 3) | |
228 for s, p, o in graph.triples((node, None, n))] | |
229 new_color = tuple(new_color) | |
230 new_hash_color = self.hash_color(new_color) | |
231 | |
232 if new_hash_color not in colors: | |
233 c = Color( | |
234 [], self.hashfunc, new_color, | |
235 hash_cache=self._hash_cache) | |
236 colors[new_hash_color] = c | |
237 colors[new_hash_color].nodes.append(n) | |
238 return list(colors.values()) | |
239 | |
240 def discrete(self): | |
241 return len(self.nodes) == 1 | |
242 | |
243 def copy(self): | |
244 return Color( | |
245 self.nodes[:], self.hashfunc, self.color, | |
246 hash_cache=self._hash_cache) | |
247 | |
248 | |
249 class _TripleCanonicalizer(object): | |
250 | |
251 def __init__(self, graph, hashfunc=sha256): | |
252 self.graph = graph | |
253 | |
254 def _hashfunc(s): | |
255 h = hashfunc() | |
256 h.update(str(s).encode("utf8")) | |
257 return int(h.hexdigest(), 16) | |
258 self._hash_cache = {} | |
259 self.hashfunc = _hashfunc | |
260 | |
261 | |
262 def _discrete(self, coloring): | |
263 return len([c for c in coloring if not c.discrete()]) == 0 | |
264 | |
265 | |
266 def _initial_color(self): | |
267 """Finds an initial color for the graph. | |
268 | |
269 Finds an initial color fo the graph by finding all blank nodes and | |
270 non-blank nodes that are adjacent. Nodes that are not adjacent to blank | |
271 nodes are not included, as they are a) already colored (by URI or literal) | |
272 and b) do not factor into the color of any blank node. | |
273 """ | |
274 bnodes = set() | |
275 others = set() | |
276 self._neighbors = defaultdict(set) | |
277 for s, p, o in self.graph: | |
278 nodes = set([s, o]) | |
279 b = set([x for x in nodes if isinstance(x, BNode)]) | |
280 if len(b) > 0: | |
281 others |= nodes - b | |
282 bnodes |= b | |
283 if isinstance(s, BNode): | |
284 self._neighbors[s].add(o) | |
285 if isinstance(o, BNode): | |
286 self._neighbors[o].add(s) | |
287 if len(bnodes) > 0: | |
288 return [ | |
289 Color(list(bnodes), self.hashfunc, hash_cache=self._hash_cache) | |
290 ] + [ | |
291 Color([x], self.hashfunc, x, hash_cache=self._hash_cache) | |
292 for x in others | |
293 ] | |
294 else: | |
295 return [] | |
296 | |
297 def _individuate(self, color, individual): | |
298 new_color = list(color.color) | |
299 new_color.append((len(color.nodes),)) | |
300 | |
301 color.nodes.remove(individual) | |
302 c = Color([individual], self.hashfunc, tuple(new_color), | |
303 hash_cache=self._hash_cache) | |
304 return c | |
305 | |
306 def _get_candidates(self, coloring): | |
307 candidates = [c for c in coloring if not c.discrete()] | |
308 for c in [c for c in coloring if not c.discrete()]: | |
309 for node in c.nodes: | |
310 yield node, c | |
311 | |
312 def _refine(self, coloring, sequence): | |
313 sequence = sorted(sequence, key=lambda x: x.key(), reverse=True) | |
314 coloring = coloring[:] | |
315 while len(sequence) > 0 and not self._discrete(coloring): | |
316 W = sequence.pop() | |
317 for c in coloring[:]: | |
318 if len(c.nodes) > 1: | |
319 colors = sorted(c.distinguish(W, self.graph), | |
320 key=lambda x: x.key(), | |
321 reverse=True) | |
322 coloring.remove(c) | |
323 coloring.extend(colors) | |
324 try: | |
325 si = sequence.index(c) | |
326 sequence = sequence[:si] + colors + sequence[si+1:] | |
327 except ValueError: | |
328 sequence = colors[1:] + sequence | |
329 | |
330 return coloring | |
331 | |
332 @_runtime("to_hash_runtime") | |
333 def to_hash(self, stats=None): | |
334 result = 0 | |
335 for triple in self.canonical_triples(stats=stats): | |
336 result += self.hashfunc(' '.join([x.n3() for x in triple])) | |
337 if stats is not None: | |
338 stats['graph_digest'] = "%x" % result | |
339 return result | |
340 | |
341 def _experimental_path(self, coloring): | |
342 coloring = [c.copy() for c in coloring] | |
343 while not self._discrete(coloring): | |
344 color = [x for x in coloring if not x.discrete()][0] | |
345 node = color.nodes[0] | |
346 new_color = self._individuate(color, node) | |
347 coloring.append(new_color) | |
348 coloring = self._refine(coloring, [new_color]) | |
349 return coloring | |
350 | |
351 def _create_generator(self, colorings, groupings=None): | |
352 if not groupings: | |
353 groupings = defaultdict(set) | |
354 for group in zip(*colorings): | |
355 g = set([c.nodes[0] for c in group]) | |
356 for n in group: | |
357 g |= groupings[n] | |
358 for n in g: | |
359 groupings[n] = g | |
360 return groupings | |
361 | |
362 @_call_count("individuations") | |
363 def _traces(self, coloring, stats=None, depth=[0]): | |
364 if stats is not None and 'prunings' not in stats: | |
365 stats['prunings'] = 0 | |
366 depth[0] += 1 | |
367 candidates = self._get_candidates(coloring) | |
368 best = [] | |
369 best_score = None | |
370 best_experimental = None | |
371 best_experimental_score = None | |
372 last_coloring = None | |
373 generator = defaultdict(set) | |
374 visited = set() | |
375 for candidate, color in candidates: | |
376 if candidate in generator: | |
377 v = generator[candidate] & visited | |
378 if len(v) > 0: | |
379 visited.add(candidate) | |
380 continue | |
381 visited.add(candidate) | |
382 coloring_copy = [] | |
383 color_copy = None | |
384 for c in coloring: | |
385 c_copy = c.copy() | |
386 coloring_copy.append(c_copy) | |
387 if c == color: | |
388 color_copy = c_copy | |
389 new_color = self._individuate(color_copy, candidate) | |
390 coloring_copy.append(new_color) | |
391 refined_coloring = self._refine(coloring_copy, [new_color]) | |
392 color_score = tuple([c.key() for c in refined_coloring]) | |
393 experimental = self._experimental_path(coloring_copy) | |
394 experimental_score = set([c.key() for c in experimental]) | |
395 if last_coloring: | |
396 generator = self._create_generator( | |
397 [last_coloring, experimental], | |
398 generator) | |
399 last_coloring = experimental | |
400 if best_score is None or best_score < color_score: | |
401 best = [refined_coloring] | |
402 best_score = color_score | |
403 best_experimental = experimental | |
404 best_experimental_score = experimental_score | |
405 elif best_score > color_score: | |
406 # prune this branch. | |
407 if stats is not None: | |
408 stats['prunings'] += 1 | |
409 elif experimental_score != best_experimental_score: | |
410 best.append(refined_coloring) | |
411 else: | |
412 # prune this branch. | |
413 if stats is not None: | |
414 stats['prunings'] += 1 | |
415 discrete = [x for x in best if self._discrete(x)] | |
416 if len(discrete) == 0: | |
417 best_score = None | |
418 best_depth = None | |
419 for coloring in best: | |
420 d = [depth[0]] | |
421 new_color = self._traces(coloring, stats=stats, depth=d) | |
422 color_score = tuple([c.key() for c in refined_coloring]) | |
423 if best_score is None or color_score > best_score: | |
424 discrete = [new_color] | |
425 best_score = color_score | |
426 best_depth = d[0] | |
427 depth[0] = best_depth | |
428 return discrete[0] | |
429 | |
430 def canonical_triples(self, stats=None): | |
431 if stats is not None: | |
432 start_canonicalization = datetime.now() | |
433 if stats is not None: | |
434 start_coloring = datetime.now() | |
435 coloring = self._initial_color() | |
436 if stats is not None: | |
437 stats['triple_count'] = len(self.graph) | |
438 stats['adjacent_nodes'] = max(0, len(coloring) - 1) | |
439 coloring = self._refine(coloring, coloring[:]) | |
440 if stats is not None: | |
441 stats['initial_coloring_runtime'] = _total_seconds(datetime.now() - start_coloring) | |
442 stats['initial_color_count'] = len(coloring) | |
443 | |
444 if not self._discrete(coloring): | |
445 depth = [0] | |
446 coloring = self._traces(coloring, stats=stats, depth=depth) | |
447 if stats is not None: | |
448 stats['tree_depth'] = depth[0] | |
449 elif stats is not None: | |
450 stats['individuations'] = 0 | |
451 stats['tree_depth'] = 0 | |
452 if stats is not None: | |
453 stats['color_count'] = len(coloring) | |
454 | |
455 bnode_labels = dict([(c.nodes[0], c.hash_color()) for c in coloring]) | |
456 if stats is not None: | |
457 stats["canonicalize_triples_runtime"] = _total_seconds(datetime.now() - start_coloring) | |
458 for triple in self.graph: | |
459 result = tuple(self._canonicalize_bnodes(triple, bnode_labels)) | |
460 yield result | |
461 | |
462 def _canonicalize_bnodes(self, triple, labels): | |
463 for term in triple: | |
464 if isinstance(term, BNode): | |
465 yield BNode(value="cb%s" % labels[term]) | |
466 else: | |
467 yield term | |
468 | |
469 | |
470 def to_isomorphic(graph): | |
471 if isinstance(graph, IsomorphicGraph): | |
472 return graph | |
473 return IsomorphicGraph(store=graph.store) | |
474 | |
475 | |
476 def isomorphic(graph1, graph2): | |
477 """Compare graph for equality. | |
478 | |
479 Uses an algorithm to compute unique hashes which takes bnodes into account. | |
480 | |
481 Examples:: | |
482 | |
483 >>> g1 = Graph().parse(format='n3', data=''' | |
484 ... @prefix : <http://example.org/ns#> . | |
485 ... <http://example.org> :rel <http://example.org/a> . | |
486 ... <http://example.org> :rel <http://example.org/b> . | |
487 ... <http://example.org> :rel [ :label "A bnode." ] . | |
488 ... ''') | |
489 >>> g2 = Graph().parse(format='n3', data=''' | |
490 ... @prefix ns: <http://example.org/ns#> . | |
491 ... <http://example.org> ns:rel [ ns:label "A bnode." ] . | |
492 ... <http://example.org> ns:rel <http://example.org/b>, | |
493 ... <http://example.org/a> . | |
494 ... ''') | |
495 >>> isomorphic(g1, g2) | |
496 True | |
497 | |
498 >>> g3 = Graph().parse(format='n3', data=''' | |
499 ... @prefix : <http://example.org/ns#> . | |
500 ... <http://example.org> :rel <http://example.org/a> . | |
501 ... <http://example.org> :rel <http://example.org/b> . | |
502 ... <http://example.org> :rel <http://example.org/c> . | |
503 ... ''') | |
504 >>> isomorphic(g1, g3) | |
505 False | |
506 """ | |
507 gd1 = _TripleCanonicalizer(graph1).to_hash() | |
508 gd2 = _TripleCanonicalizer(graph2).to_hash() | |
509 return gd1 == gd2 | |
510 | |
511 | |
512 | |
513 def to_canonical_graph(g1): | |
514 """Creates a canonical, read-only graph. | |
515 | |
516 Creates a canonical, read-only graph where all bnode id:s are based on | |
517 deterministical SHA-256 checksums, correlated with the graph contents. | |
518 """ | |
519 graph = Graph() | |
520 graph += _TripleCanonicalizer(g1).canonical_triples() | |
521 return ReadOnlyGraphAggregate([graph]) | |
522 | |
523 | |
524 def graph_diff(g1, g2): | |
525 """Returns three sets of triples: "in both", "in first" and "in second".""" | |
526 # bnodes have deterministic values in canonical graphs: | |
527 cg1 = to_canonical_graph(g1) | |
528 cg2 = to_canonical_graph(g2) | |
529 in_both = cg1 * cg2 | |
530 in_first = cg1 - cg2 | |
531 in_second = cg2 - cg1 | |
532 return (in_both, in_first, in_second) | |
533 | |
534 | |
535 | |
536 _MOCK_BNODE = BNode() | |
537 | |
538 | |
539 def similar(g1, g2): | |
540 """Checks if the two graphs are "similar". | |
541 | |
542 Checks if the two graphs are "similar", by comparing sorted triples where | |
543 all bnodes have been replaced by a singular mock bnode (the | |
544 ``_MOCK_BNODE``). | |
545 | |
546 This is a much cheaper, but less reliable, alternative to the comparison | |
547 algorithm in ``isomorphic``. | |
548 """ | |
549 return all(t1 == t2 for (t1, t2) in _squashed_graphs_triples(g1, g2)) | |
550 | |
551 | |
552 def _squashed_graphs_triples(g1, g2): | |
553 for (t1, t2) in zip(sorted(_squash_graph(g1)), sorted(_squash_graph(g2))): | |
554 yield t1, t2 | |
555 | |
556 | |
557 def _squash_graph(graph): | |
558 return (_squash_bnodes(triple) for triple in graph) | |
559 | |
560 | |
561 def _squash_bnodes(triple): | |
562 return tuple((isinstance(t, BNode) and _MOCK_BNODE) or t for t in triple) |