Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/networkx/algorithms/similarity.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author | guerler |
---|---|
date | Fri, 31 Jul 2020 00:32:28 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:d30785e31577 | 1:56ad4e20f292 |
---|---|
1 # -*- coding: utf-8 -*- | |
2 # Copyright (C) 2010 by | |
3 # Aric Hagberg <hagberg@lanl.gov> | |
4 # Dan Schult <dschult@colgate.edu> | |
5 # Pieter Swart <swart@lanl.gov> | |
6 # All rights reserved. | |
7 # BSD license. | |
8 # | |
9 # Author: Andrey Paramonov <paramon@acdlabs.ru> | |
10 """ Functions measuring similarity using graph edit distance. | |
11 | |
12 The graph edit distance is the number of edge/node changes needed | |
13 to make two graphs isomorphic. | |
14 | |
15 The default algorithm/implementation is sub-optimal for some graphs. | |
16 The problem of finding the exact Graph Edit Distance (GED) is NP-hard | |
17 so it is often slow. If the simple interface `graph_edit_distance` | |
18 takes too long for your graph, try `optimize_graph_edit_distance` | |
19 and/or `optimize_edit_paths`. | |
20 | |
21 At the same time, I encourage capable people to investigate | |
22 alternative GED algorithms, in order to improve the choices available. | |
23 """ | |
24 from itertools import product | |
25 import math | |
26 import networkx as nx | |
27 from operator import * | |
28 import sys | |
29 | |
30 __author__ = 'Andrey Paramonov <paramon@acdlabs.ru>' | |
31 | |
32 __all__ = [ | |
33 'graph_edit_distance', | |
34 'optimal_edit_paths', | |
35 'optimize_graph_edit_distance', | |
36 'optimize_edit_paths', | |
37 'simrank_similarity', | |
38 'simrank_similarity_numpy', | |
39 ] | |
40 | |
41 | |
42 def debug_print(*args, **kwargs): | |
43 print(*args, **kwargs) | |
44 | |
45 | |
46 def graph_edit_distance(G1, G2, node_match=None, edge_match=None, | |
47 node_subst_cost=None, node_del_cost=None, | |
48 node_ins_cost=None, | |
49 edge_subst_cost=None, edge_del_cost=None, | |
50 edge_ins_cost=None, | |
51 upper_bound=None): | |
52 """Returns GED (graph edit distance) between graphs G1 and G2. | |
53 | |
54 Graph edit distance is a graph similarity measure analogous to | |
55 Levenshtein distance for strings. It is defined as minimum cost | |
56 of edit path (sequence of node and edge edit operations) | |
57 transforming graph G1 to graph isomorphic to G2. | |
58 | |
59 Parameters | |
60 ---------- | |
61 G1, G2: graphs | |
62 The two graphs G1 and G2 must be of the same type. | |
63 | |
64 node_match : callable | |
65 A function that returns True if node n1 in G1 and n2 in G2 | |
66 should be considered equal during matching. | |
67 | |
68 The function will be called like | |
69 | |
70 node_match(G1.nodes[n1], G2.nodes[n2]). | |
71 | |
72 That is, the function will receive the node attribute | |
73 dictionaries for n1 and n2 as inputs. | |
74 | |
75 Ignored if node_subst_cost is specified. If neither | |
76 node_match nor node_subst_cost are specified then node | |
77 attributes are not considered. | |
78 | |
79 edge_match : callable | |
80 A function that returns True if the edge attribute dictionaries | |
81 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
82 be considered equal during matching. | |
83 | |
84 The function will be called like | |
85 | |
86 edge_match(G1[u1][v1], G2[u2][v2]). | |
87 | |
88 That is, the function will receive the edge attribute | |
89 dictionaries of the edges under consideration. | |
90 | |
91 Ignored if edge_subst_cost is specified. If neither | |
92 edge_match nor edge_subst_cost are specified then edge | |
93 attributes are not considered. | |
94 | |
95 node_subst_cost, node_del_cost, node_ins_cost : callable | |
96 Functions that return the costs of node substitution, node | |
97 deletion, and node insertion, respectively. | |
98 | |
99 The functions will be called like | |
100 | |
101 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
102 node_del_cost(G1.nodes[n1]), | |
103 node_ins_cost(G2.nodes[n2]). | |
104 | |
105 That is, the functions will receive the node attribute | |
106 dictionaries as inputs. The functions are expected to return | |
107 positive numeric values. | |
108 | |
109 Function node_subst_cost overrides node_match if specified. | |
110 If neither node_match nor node_subst_cost are specified then | |
111 default node substitution cost of 0 is used (node attributes | |
112 are not considered during matching). | |
113 | |
114 If node_del_cost is not specified then default node deletion | |
115 cost of 1 is used. If node_ins_cost is not specified then | |
116 default node insertion cost of 1 is used. | |
117 | |
118 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
119 Functions that return the costs of edge substitution, edge | |
120 deletion, and edge insertion, respectively. | |
121 | |
122 The functions will be called like | |
123 | |
124 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
125 edge_del_cost(G1[u1][v1]), | |
126 edge_ins_cost(G2[u2][v2]). | |
127 | |
128 That is, the functions will receive the edge attribute | |
129 dictionaries as inputs. The functions are expected to return | |
130 positive numeric values. | |
131 | |
132 Function edge_subst_cost overrides edge_match if specified. | |
133 If neither edge_match nor edge_subst_cost are specified then | |
134 default edge substitution cost of 0 is used (edge attributes | |
135 are not considered during matching). | |
136 | |
137 If edge_del_cost is not specified then default edge deletion | |
138 cost of 1 is used. If edge_ins_cost is not specified then | |
139 default edge insertion cost of 1 is used. | |
140 | |
141 upper_bound : numeric | |
142 Maximum edit distance to consider. Return None if no edit | |
143 distance under or equal to upper_bound exists. | |
144 | |
145 Examples | |
146 -------- | |
147 >>> G1 = nx.cycle_graph(6) | |
148 >>> G2 = nx.wheel_graph(7) | |
149 >>> nx.graph_edit_distance(G1, G2) | |
150 7.0 | |
151 | |
152 See Also | |
153 -------- | |
154 optimal_edit_paths, optimize_graph_edit_distance, | |
155 | |
156 is_isomorphic (test for graph edit distance of 0) | |
157 | |
158 References | |
159 ---------- | |
160 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
161 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
162 Pattern Recognition Problems. 4th International Conference on | |
163 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
164 Lisbon, Portugal. 2015, | |
165 <10.5220/0005209202710278>. <hal-01168816> | |
166 https://hal.archives-ouvertes.fr/hal-01168816 | |
167 | |
168 """ | |
169 bestcost = None | |
170 for vertex_path, edge_path, cost in \ | |
171 optimize_edit_paths(G1, G2, node_match, edge_match, | |
172 node_subst_cost, node_del_cost, node_ins_cost, | |
173 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
174 upper_bound, True): | |
175 #assert bestcost is None or cost < bestcost | |
176 bestcost = cost | |
177 return bestcost | |
178 | |
179 | |
180 def optimal_edit_paths(G1, G2, node_match=None, edge_match=None, | |
181 node_subst_cost=None, node_del_cost=None, | |
182 node_ins_cost=None, | |
183 edge_subst_cost=None, edge_del_cost=None, | |
184 edge_ins_cost=None, | |
185 upper_bound=None): | |
186 """Returns all minimum-cost edit paths transforming G1 to G2. | |
187 | |
188 Graph edit path is a sequence of node and edge edit operations | |
189 transforming graph G1 to graph isomorphic to G2. Edit operations | |
190 include substitutions, deletions, and insertions. | |
191 | |
192 Parameters | |
193 ---------- | |
194 G1, G2: graphs | |
195 The two graphs G1 and G2 must be of the same type. | |
196 | |
197 node_match : callable | |
198 A function that returns True if node n1 in G1 and n2 in G2 | |
199 should be considered equal during matching. | |
200 | |
201 The function will be called like | |
202 | |
203 node_match(G1.nodes[n1], G2.nodes[n2]). | |
204 | |
205 That is, the function will receive the node attribute | |
206 dictionaries for n1 and n2 as inputs. | |
207 | |
208 Ignored if node_subst_cost is specified. If neither | |
209 node_match nor node_subst_cost are specified then node | |
210 attributes are not considered. | |
211 | |
212 edge_match : callable | |
213 A function that returns True if the edge attribute dictionaries | |
214 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
215 be considered equal during matching. | |
216 | |
217 The function will be called like | |
218 | |
219 edge_match(G1[u1][v1], G2[u2][v2]). | |
220 | |
221 That is, the function will receive the edge attribute | |
222 dictionaries of the edges under consideration. | |
223 | |
224 Ignored if edge_subst_cost is specified. If neither | |
225 edge_match nor edge_subst_cost are specified then edge | |
226 attributes are not considered. | |
227 | |
228 node_subst_cost, node_del_cost, node_ins_cost : callable | |
229 Functions that return the costs of node substitution, node | |
230 deletion, and node insertion, respectively. | |
231 | |
232 The functions will be called like | |
233 | |
234 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
235 node_del_cost(G1.nodes[n1]), | |
236 node_ins_cost(G2.nodes[n2]). | |
237 | |
238 That is, the functions will receive the node attribute | |
239 dictionaries as inputs. The functions are expected to return | |
240 positive numeric values. | |
241 | |
242 Function node_subst_cost overrides node_match if specified. | |
243 If neither node_match nor node_subst_cost are specified then | |
244 default node substitution cost of 0 is used (node attributes | |
245 are not considered during matching). | |
246 | |
247 If node_del_cost is not specified then default node deletion | |
248 cost of 1 is used. If node_ins_cost is not specified then | |
249 default node insertion cost of 1 is used. | |
250 | |
251 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
252 Functions that return the costs of edge substitution, edge | |
253 deletion, and edge insertion, respectively. | |
254 | |
255 The functions will be called like | |
256 | |
257 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
258 edge_del_cost(G1[u1][v1]), | |
259 edge_ins_cost(G2[u2][v2]). | |
260 | |
261 That is, the functions will receive the edge attribute | |
262 dictionaries as inputs. The functions are expected to return | |
263 positive numeric values. | |
264 | |
265 Function edge_subst_cost overrides edge_match if specified. | |
266 If neither edge_match nor edge_subst_cost are specified then | |
267 default edge substitution cost of 0 is used (edge attributes | |
268 are not considered during matching). | |
269 | |
270 If edge_del_cost is not specified then default edge deletion | |
271 cost of 1 is used. If edge_ins_cost is not specified then | |
272 default edge insertion cost of 1 is used. | |
273 | |
274 upper_bound : numeric | |
275 Maximum edit distance to consider. | |
276 | |
277 Returns | |
278 ------- | |
279 edit_paths : list of tuples (node_edit_path, edge_edit_path) | |
280 node_edit_path : list of tuples (u, v) | |
281 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) | |
282 | |
283 cost : numeric | |
284 Optimal edit path cost (graph edit distance). | |
285 | |
286 Examples | |
287 -------- | |
288 >>> G1 = nx.cycle_graph(6) | |
289 >>> G2 = nx.wheel_graph(7) | |
290 >>> paths, cost = nx.optimal_edit_paths(G1, G2) | |
291 >>> len(paths) | |
292 84 | |
293 >>> cost | |
294 7.0 | |
295 | |
296 See Also | |
297 -------- | |
298 graph_edit_distance, optimize_edit_paths | |
299 | |
300 References | |
301 ---------- | |
302 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
303 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
304 Pattern Recognition Problems. 4th International Conference on | |
305 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
306 Lisbon, Portugal. 2015, | |
307 <10.5220/0005209202710278>. <hal-01168816> | |
308 https://hal.archives-ouvertes.fr/hal-01168816 | |
309 | |
310 """ | |
311 paths = list() | |
312 bestcost = None | |
313 for vertex_path, edge_path, cost in \ | |
314 optimize_edit_paths(G1, G2, node_match, edge_match, | |
315 node_subst_cost, node_del_cost, node_ins_cost, | |
316 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
317 upper_bound, False): | |
318 #assert bestcost is None or cost <= bestcost | |
319 if bestcost is not None and cost < bestcost: | |
320 paths = list() | |
321 paths.append((vertex_path, edge_path)) | |
322 bestcost = cost | |
323 return paths, bestcost | |
324 | |
325 | |
326 def optimize_graph_edit_distance(G1, G2, node_match=None, edge_match=None, | |
327 node_subst_cost=None, node_del_cost=None, | |
328 node_ins_cost=None, | |
329 edge_subst_cost=None, edge_del_cost=None, | |
330 edge_ins_cost=None, | |
331 upper_bound=None): | |
332 """Returns consecutive approximations of GED (graph edit distance) | |
333 between graphs G1 and G2. | |
334 | |
335 Graph edit distance is a graph similarity measure analogous to | |
336 Levenshtein distance for strings. It is defined as minimum cost | |
337 of edit path (sequence of node and edge edit operations) | |
338 transforming graph G1 to graph isomorphic to G2. | |
339 | |
340 Parameters | |
341 ---------- | |
342 G1, G2: graphs | |
343 The two graphs G1 and G2 must be of the same type. | |
344 | |
345 node_match : callable | |
346 A function that returns True if node n1 in G1 and n2 in G2 | |
347 should be considered equal during matching. | |
348 | |
349 The function will be called like | |
350 | |
351 node_match(G1.nodes[n1], G2.nodes[n2]). | |
352 | |
353 That is, the function will receive the node attribute | |
354 dictionaries for n1 and n2 as inputs. | |
355 | |
356 Ignored if node_subst_cost is specified. If neither | |
357 node_match nor node_subst_cost are specified then node | |
358 attributes are not considered. | |
359 | |
360 edge_match : callable | |
361 A function that returns True if the edge attribute dictionaries | |
362 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
363 be considered equal during matching. | |
364 | |
365 The function will be called like | |
366 | |
367 edge_match(G1[u1][v1], G2[u2][v2]). | |
368 | |
369 That is, the function will receive the edge attribute | |
370 dictionaries of the edges under consideration. | |
371 | |
372 Ignored if edge_subst_cost is specified. If neither | |
373 edge_match nor edge_subst_cost are specified then edge | |
374 attributes are not considered. | |
375 | |
376 node_subst_cost, node_del_cost, node_ins_cost : callable | |
377 Functions that return the costs of node substitution, node | |
378 deletion, and node insertion, respectively. | |
379 | |
380 The functions will be called like | |
381 | |
382 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
383 node_del_cost(G1.nodes[n1]), | |
384 node_ins_cost(G2.nodes[n2]). | |
385 | |
386 That is, the functions will receive the node attribute | |
387 dictionaries as inputs. The functions are expected to return | |
388 positive numeric values. | |
389 | |
390 Function node_subst_cost overrides node_match if specified. | |
391 If neither node_match nor node_subst_cost are specified then | |
392 default node substitution cost of 0 is used (node attributes | |
393 are not considered during matching). | |
394 | |
395 If node_del_cost is not specified then default node deletion | |
396 cost of 1 is used. If node_ins_cost is not specified then | |
397 default node insertion cost of 1 is used. | |
398 | |
399 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
400 Functions that return the costs of edge substitution, edge | |
401 deletion, and edge insertion, respectively. | |
402 | |
403 The functions will be called like | |
404 | |
405 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
406 edge_del_cost(G1[u1][v1]), | |
407 edge_ins_cost(G2[u2][v2]). | |
408 | |
409 That is, the functions will receive the edge attribute | |
410 dictionaries as inputs. The functions are expected to return | |
411 positive numeric values. | |
412 | |
413 Function edge_subst_cost overrides edge_match if specified. | |
414 If neither edge_match nor edge_subst_cost are specified then | |
415 default edge substitution cost of 0 is used (edge attributes | |
416 are not considered during matching). | |
417 | |
418 If edge_del_cost is not specified then default edge deletion | |
419 cost of 1 is used. If edge_ins_cost is not specified then | |
420 default edge insertion cost of 1 is used. | |
421 | |
422 upper_bound : numeric | |
423 Maximum edit distance to consider. | |
424 | |
425 Returns | |
426 ------- | |
427 Generator of consecutive approximations of graph edit distance. | |
428 | |
429 Examples | |
430 -------- | |
431 >>> G1 = nx.cycle_graph(6) | |
432 >>> G2 = nx.wheel_graph(7) | |
433 >>> for v in nx.optimize_graph_edit_distance(G1, G2): | |
434 ... minv = v | |
435 >>> minv | |
436 7.0 | |
437 | |
438 See Also | |
439 -------- | |
440 graph_edit_distance, optimize_edit_paths | |
441 | |
442 References | |
443 ---------- | |
444 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
445 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
446 Pattern Recognition Problems. 4th International Conference on | |
447 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
448 Lisbon, Portugal. 2015, | |
449 <10.5220/0005209202710278>. <hal-01168816> | |
450 https://hal.archives-ouvertes.fr/hal-01168816 | |
451 """ | |
452 for vertex_path, edge_path, cost in \ | |
453 optimize_edit_paths(G1, G2, node_match, edge_match, | |
454 node_subst_cost, node_del_cost, node_ins_cost, | |
455 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
456 upper_bound, True): | |
457 yield cost | |
458 | |
459 | |
460 def optimize_edit_paths(G1, G2, node_match=None, edge_match=None, | |
461 node_subst_cost=None, node_del_cost=None, | |
462 node_ins_cost=None, | |
463 edge_subst_cost=None, edge_del_cost=None, | |
464 edge_ins_cost=None, | |
465 upper_bound=None, strictly_decreasing=True): | |
466 """GED (graph edit distance) calculation: advanced interface. | |
467 | |
468 Graph edit path is a sequence of node and edge edit operations | |
469 transforming graph G1 to graph isomorphic to G2. Edit operations | |
470 include substitutions, deletions, and insertions. | |
471 | |
472 Graph edit distance is defined as minimum cost of edit path. | |
473 | |
474 Parameters | |
475 ---------- | |
476 G1, G2: graphs | |
477 The two graphs G1 and G2 must be of the same type. | |
478 | |
479 node_match : callable | |
480 A function that returns True if node n1 in G1 and n2 in G2 | |
481 should be considered equal during matching. | |
482 | |
483 The function will be called like | |
484 | |
485 node_match(G1.nodes[n1], G2.nodes[n2]). | |
486 | |
487 That is, the function will receive the node attribute | |
488 dictionaries for n1 and n2 as inputs. | |
489 | |
490 Ignored if node_subst_cost is specified. If neither | |
491 node_match nor node_subst_cost are specified then node | |
492 attributes are not considered. | |
493 | |
494 edge_match : callable | |
495 A function that returns True if the edge attribute dictionaries | |
496 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
497 be considered equal during matching. | |
498 | |
499 The function will be called like | |
500 | |
501 edge_match(G1[u1][v1], G2[u2][v2]). | |
502 | |
503 That is, the function will receive the edge attribute | |
504 dictionaries of the edges under consideration. | |
505 | |
506 Ignored if edge_subst_cost is specified. If neither | |
507 edge_match nor edge_subst_cost are specified then edge | |
508 attributes are not considered. | |
509 | |
510 node_subst_cost, node_del_cost, node_ins_cost : callable | |
511 Functions that return the costs of node substitution, node | |
512 deletion, and node insertion, respectively. | |
513 | |
514 The functions will be called like | |
515 | |
516 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
517 node_del_cost(G1.nodes[n1]), | |
518 node_ins_cost(G2.nodes[n2]). | |
519 | |
520 That is, the functions will receive the node attribute | |
521 dictionaries as inputs. The functions are expected to return | |
522 positive numeric values. | |
523 | |
524 Function node_subst_cost overrides node_match if specified. | |
525 If neither node_match nor node_subst_cost are specified then | |
526 default node substitution cost of 0 is used (node attributes | |
527 are not considered during matching). | |
528 | |
529 If node_del_cost is not specified then default node deletion | |
530 cost of 1 is used. If node_ins_cost is not specified then | |
531 default node insertion cost of 1 is used. | |
532 | |
533 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
534 Functions that return the costs of edge substitution, edge | |
535 deletion, and edge insertion, respectively. | |
536 | |
537 The functions will be called like | |
538 | |
539 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
540 edge_del_cost(G1[u1][v1]), | |
541 edge_ins_cost(G2[u2][v2]). | |
542 | |
543 That is, the functions will receive the edge attribute | |
544 dictionaries as inputs. The functions are expected to return | |
545 positive numeric values. | |
546 | |
547 Function edge_subst_cost overrides edge_match if specified. | |
548 If neither edge_match nor edge_subst_cost are specified then | |
549 default edge substitution cost of 0 is used (edge attributes | |
550 are not considered during matching). | |
551 | |
552 If edge_del_cost is not specified then default edge deletion | |
553 cost of 1 is used. If edge_ins_cost is not specified then | |
554 default edge insertion cost of 1 is used. | |
555 | |
556 upper_bound : numeric | |
557 Maximum edit distance to consider. | |
558 | |
559 strictly_decreasing : bool | |
560 If True, return consecutive approximations of strictly | |
561 decreasing cost. Otherwise, return all edit paths of cost | |
562 less than or equal to the previous minimum cost. | |
563 | |
564 Returns | |
565 ------- | |
566 Generator of tuples (node_edit_path, edge_edit_path, cost) | |
567 node_edit_path : list of tuples (u, v) | |
568 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) | |
569 cost : numeric | |
570 | |
571 See Also | |
572 -------- | |
573 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths | |
574 | |
575 References | |
576 ---------- | |
577 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
578 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
579 Pattern Recognition Problems. 4th International Conference on | |
580 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
581 Lisbon, Portugal. 2015, | |
582 <10.5220/0005209202710278>. <hal-01168816> | |
583 https://hal.archives-ouvertes.fr/hal-01168816 | |
584 | |
585 """ | |
586 # TODO: support DiGraph | |
587 | |
588 import numpy as np | |
589 from scipy.optimize import linear_sum_assignment | |
590 | |
591 class CostMatrix: | |
592 def __init__(self, C, lsa_row_ind, lsa_col_ind, ls): | |
593 #assert C.shape[0] == len(lsa_row_ind) | |
594 #assert C.shape[1] == len(lsa_col_ind) | |
595 #assert len(lsa_row_ind) == len(lsa_col_ind) | |
596 #assert set(lsa_row_ind) == set(range(len(lsa_row_ind))) | |
597 #assert set(lsa_col_ind) == set(range(len(lsa_col_ind))) | |
598 #assert ls == C[lsa_row_ind, lsa_col_ind].sum() | |
599 self.C = C | |
600 self.lsa_row_ind = lsa_row_ind | |
601 self.lsa_col_ind = lsa_col_ind | |
602 self.ls = ls | |
603 | |
604 def make_CostMatrix(C, m, n): | |
605 #assert(C.shape == (m + n, m + n)) | |
606 lsa_row_ind, lsa_col_ind = linear_sum_assignment(C) | |
607 | |
608 # Fixup dummy assignments: | |
609 # each substitution i<->j should have dummy assignment m+j<->n+i | |
610 # NOTE: fast reduce of Cv relies on it | |
611 #assert len(lsa_row_ind) == len(lsa_col_ind) | |
612 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) | |
613 subst_ind = list(k for k, i, j in indexes if i < m and j < n) | |
614 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) | |
615 dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n) | |
616 #assert len(subst_ind) == len(dummy_ind) | |
617 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m | |
618 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n | |
619 | |
620 return CostMatrix(C, lsa_row_ind, lsa_col_ind, | |
621 C[lsa_row_ind, lsa_col_ind].sum()) | |
622 | |
623 def extract_C(C, i, j, m, n): | |
624 #assert(C.shape == (m + n, m + n)) | |
625 row_ind = [k in i or k - m in j for k in range(m + n)] | |
626 col_ind = [k in j or k - n in i for k in range(m + n)] | |
627 return C[row_ind, :][:, col_ind] | |
628 | |
629 def reduce_C(C, i, j, m, n): | |
630 #assert(C.shape == (m + n, m + n)) | |
631 row_ind = [k not in i and k - m not in j for k in range(m + n)] | |
632 col_ind = [k not in j and k - n not in i for k in range(m + n)] | |
633 return C[row_ind, :][:, col_ind] | |
634 | |
635 def reduce_ind(ind, i): | |
636 #assert set(ind) == set(range(len(ind))) | |
637 rind = ind[[k not in i for k in ind]] | |
638 for k in set(i): | |
639 rind[rind >= k] -= 1 | |
640 return rind | |
641 | |
642 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]): | |
643 """ | |
644 Parameters: | |
645 u, v: matched vertices, u=None or v=None for | |
646 deletion/insertion | |
647 pending_g, pending_h: lists of edges not yet mapped | |
648 Ce: CostMatrix of pending edge mappings | |
649 matched_uv: partial vertex edit path | |
650 list of tuples (u, v) of previously matched vertex | |
651 mappings u<->v, u=None or v=None for | |
652 deletion/insertion | |
653 | |
654 Returns: | |
655 list of (i, j): indices of edge mappings g<->h | |
656 localCe: local CostMatrix of edge mappings | |
657 (basically submatrix of Ce at cross of rows i, cols j) | |
658 """ | |
659 M = len(pending_g) | |
660 N = len(pending_h) | |
661 #assert Ce.C.shape == (M + N, M + N) | |
662 | |
663 g_ind = [i for i in range(M) if pending_g[i][:2] == (u, u) or | |
664 any(pending_g[i][:2] in ((p, u), (u, p)) | |
665 for p, q in matched_uv)] | |
666 h_ind = [j for j in range(N) if pending_h[j][:2] == (v, v) or | |
667 any(pending_h[j][:2] in ((q, v), (v, q)) | |
668 for p, q in matched_uv)] | |
669 m = len(g_ind) | |
670 n = len(h_ind) | |
671 | |
672 if m or n: | |
673 C = extract_C(Ce.C, g_ind, h_ind, M, N) | |
674 #assert C.shape == (m + n, m + n) | |
675 | |
676 # Forbid structurally invalid matches | |
677 # NOTE: inf remembered from Ce construction | |
678 for k, i in zip(range(m), g_ind): | |
679 g = pending_g[i][:2] | |
680 for l, j in zip(range(n), h_ind): | |
681 h = pending_h[j][:2] | |
682 if nx.is_directed(G1) or nx.is_directed(G2): | |
683 if any(g == (p, u) and h == (q, v) or | |
684 g == (u, p) and h == (v, q) | |
685 for p, q in matched_uv): | |
686 continue | |
687 else: | |
688 if any(g in ((p, u), (u, p)) and h in ((q, v), (v, q)) | |
689 for p, q in matched_uv): | |
690 continue | |
691 if g == (u, u): | |
692 continue | |
693 if h == (v, v): | |
694 continue | |
695 C[k, l] = inf | |
696 | |
697 localCe = make_CostMatrix(C, m, n) | |
698 ij = list((g_ind[k] if k < m else M + h_ind[l], | |
699 h_ind[l] if l < n else N + g_ind[k]) | |
700 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) | |
701 if k < m or l < n) | |
702 | |
703 else: | |
704 ij = [] | |
705 localCe = CostMatrix(np.empty((0, 0)), [], [], 0) | |
706 | |
707 return ij, localCe | |
708 | |
709 def reduce_Ce(Ce, ij, m, n): | |
710 if len(ij): | |
711 i, j = zip(*ij) | |
712 m_i = m - sum(1 for t in i if t < m) | |
713 n_j = n - sum(1 for t in j if t < n) | |
714 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) | |
715 else: | |
716 return Ce | |
717 | |
718 def get_edit_ops(matched_uv, pending_u, pending_v, Cv, | |
719 pending_g, pending_h, Ce, matched_cost): | |
720 """ | |
721 Parameters: | |
722 matched_uv: partial vertex edit path | |
723 list of tuples (u, v) of vertex mappings u<->v, | |
724 u=None or v=None for deletion/insertion | |
725 pending_u, pending_v: lists of vertices not yet mapped | |
726 Cv: CostMatrix of pending vertex mappings | |
727 pending_g, pending_h: lists of edges not yet mapped | |
728 Ce: CostMatrix of pending edge mappings | |
729 matched_cost: cost of partial edit path | |
730 | |
731 Returns: | |
732 sequence of | |
733 (i, j): indices of vertex mapping u<->v | |
734 Cv_ij: reduced CostMatrix of pending vertex mappings | |
735 (basically Cv with row i, col j removed) | |
736 list of (x, y): indices of edge mappings g<->h | |
737 Ce_xy: reduced CostMatrix of pending edge mappings | |
738 (basically Ce with rows x, cols y removed) | |
739 cost: total cost of edit operation | |
740 NOTE: most promising ops first | |
741 """ | |
742 m = len(pending_u) | |
743 n = len(pending_v) | |
744 #assert Cv.C.shape == (m + n, m + n) | |
745 | |
746 # 1) a vertex mapping from optimal linear sum assignment | |
747 i, j = min((k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) | |
748 if k < m or l < n) | |
749 xy, localCe = match_edges(pending_u[i] if i < m else None, | |
750 pending_v[j] if j < n else None, | |
751 pending_g, pending_h, Ce, matched_uv) | |
752 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) | |
753 #assert Ce.ls <= localCe.ls + Ce_xy.ls | |
754 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): | |
755 pass | |
756 else: | |
757 # get reduced Cv efficiently | |
758 Cv_ij = CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), | |
759 reduce_ind(Cv.lsa_row_ind, (i, m + j)), | |
760 reduce_ind(Cv.lsa_col_ind, (j, n + i)), | |
761 Cv.ls - Cv.C[i, j]) | |
762 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls | |
763 | |
764 # 2) other candidates, sorted by lower-bound cost estimate | |
765 other = list() | |
766 fixed_i, fixed_j = i, j | |
767 if m <= n: | |
768 candidates = ((t, fixed_j) for t in range(m + n) | |
769 if t != fixed_i and (t < m or t == m + fixed_j)) | |
770 else: | |
771 candidates = ((fixed_i, t) for t in range(m + n) | |
772 if t != fixed_j and (t < n or t == n + fixed_i)) | |
773 for i, j in candidates: | |
774 if prune(matched_cost + Cv.C[i, j] + Ce.ls): | |
775 continue | |
776 Cv_ij = make_CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), | |
777 m - 1 if i < m else m, | |
778 n - 1 if j < n else n) | |
779 #assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls | |
780 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): | |
781 continue | |
782 xy, localCe = match_edges(pending_u[i] if i < m else None, | |
783 pending_v[j] if j < n else None, | |
784 pending_g, pending_h, Ce, matched_uv) | |
785 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls): | |
786 continue | |
787 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) | |
788 #assert Ce.ls <= localCe.ls + Ce_xy.ls | |
789 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + | |
790 Ce_xy.ls): | |
791 continue | |
792 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) | |
793 | |
794 # yield from | |
795 for t in sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls): | |
796 yield t | |
797 | |
798 def get_edit_paths(matched_uv, pending_u, pending_v, Cv, | |
799 matched_gh, pending_g, pending_h, Ce, matched_cost): | |
800 """ | |
801 Parameters: | |
802 matched_uv: partial vertex edit path | |
803 list of tuples (u, v) of vertex mappings u<->v, | |
804 u=None or v=None for deletion/insertion | |
805 pending_u, pending_v: lists of vertices not yet mapped | |
806 Cv: CostMatrix of pending vertex mappings | |
807 matched_gh: partial edge edit path | |
808 list of tuples (g, h) of edge mappings g<->h, | |
809 g=None or h=None for deletion/insertion | |
810 pending_g, pending_h: lists of edges not yet mapped | |
811 Ce: CostMatrix of pending edge mappings | |
812 matched_cost: cost of partial edit path | |
813 | |
814 Returns: | |
815 sequence of (vertex_path, edge_path, cost) | |
816 vertex_path: complete vertex edit path | |
817 list of tuples (u, v) of vertex mappings u<->v, | |
818 u=None or v=None for deletion/insertion | |
819 edge_path: complete edge edit path | |
820 list of tuples (g, h) of edge mappings g<->h, | |
821 g=None or h=None for deletion/insertion | |
822 cost: total cost of edit path | |
823 NOTE: path costs are non-increasing | |
824 """ | |
825 #debug_print('matched-uv:', matched_uv) | |
826 #debug_print('matched-gh:', matched_gh) | |
827 #debug_print('matched-cost:', matched_cost) | |
828 #debug_print('pending-u:', pending_u) | |
829 #debug_print('pending-v:', pending_v) | |
830 #debug_print(Cv.C) | |
831 #assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u)) | |
832 #assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v)) | |
833 #debug_print('pending-g:', pending_g) | |
834 #debug_print('pending-h:', pending_h) | |
835 #debug_print(Ce.C) | |
836 #assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g)) | |
837 #assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h)) | |
838 #debug_print() | |
839 | |
840 if prune(matched_cost + Cv.ls + Ce.ls): | |
841 return | |
842 | |
843 if not max(len(pending_u), len(pending_v)): | |
844 #assert not len(pending_g) | |
845 #assert not len(pending_h) | |
846 # path completed! | |
847 #assert matched_cost <= maxcost.value | |
848 maxcost.value = min(maxcost.value, matched_cost) | |
849 yield matched_uv, matched_gh, matched_cost | |
850 | |
851 else: | |
852 edit_ops = get_edit_ops(matched_uv, pending_u, pending_v, Cv, | |
853 pending_g, pending_h, Ce, matched_cost) | |
854 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops: | |
855 i, j = ij | |
856 #assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost | |
857 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls): | |
858 continue | |
859 | |
860 # dive deeper | |
861 u = pending_u.pop(i) if i < len(pending_u) else None | |
862 v = pending_v.pop(j) if j < len(pending_v) else None | |
863 matched_uv.append((u, v)) | |
864 for x, y in xy: | |
865 len_g = len(pending_g) | |
866 len_h = len(pending_h) | |
867 matched_gh.append((pending_g[x] if x < len_g else None, | |
868 pending_h[y] if y < len_h else None)) | |
869 sortedx = list(sorted(x for x, y in xy)) | |
870 sortedy = list(sorted(y for x, y in xy)) | |
871 G = list((pending_g.pop(x) if x < len(pending_g) else None) | |
872 for x in reversed(sortedx)) | |
873 H = list((pending_h.pop(y) if y < len(pending_h) else None) | |
874 for y in reversed(sortedy)) | |
875 | |
876 # yield from | |
877 for t in get_edit_paths(matched_uv, pending_u, pending_v, | |
878 Cv_ij, | |
879 matched_gh, pending_g, pending_h, | |
880 Ce_xy, | |
881 matched_cost + edit_cost): | |
882 yield t | |
883 | |
884 # backtrack | |
885 if u is not None: | |
886 pending_u.insert(i, u) | |
887 if v is not None: | |
888 pending_v.insert(j, v) | |
889 matched_uv.pop() | |
890 for x, g in zip(sortedx, reversed(G)): | |
891 if g is not None: | |
892 pending_g.insert(x, g) | |
893 for y, h in zip(sortedy, reversed(H)): | |
894 if h is not None: | |
895 pending_h.insert(y, h) | |
896 for t in xy: | |
897 matched_gh.pop() | |
898 | |
899 # Initialization | |
900 | |
901 pending_u = list(G1.nodes) | |
902 pending_v = list(G2.nodes) | |
903 | |
904 # cost matrix of vertex mappings | |
905 m = len(pending_u) | |
906 n = len(pending_v) | |
907 C = np.zeros((m + n, m + n)) | |
908 if node_subst_cost: | |
909 C[0:m, 0:n] = np.array([node_subst_cost(G1.nodes[u], G2.nodes[v]) | |
910 for u in pending_u for v in pending_v] | |
911 ).reshape(m, n) | |
912 elif node_match: | |
913 C[0:m, 0:n] = np.array([1 - int(node_match(G1.nodes[u], G2.nodes[v])) | |
914 for u in pending_u for v in pending_v] | |
915 ).reshape(m, n) | |
916 else: | |
917 # all zeroes | |
918 pass | |
919 #assert not min(m, n) or C[0:m, 0:n].min() >= 0 | |
920 if node_del_cost: | |
921 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] | |
922 else: | |
923 del_costs = [1] * len(pending_u) | |
924 #assert not m or min(del_costs) >= 0 | |
925 if node_ins_cost: | |
926 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] | |
927 else: | |
928 ins_costs = [1] * len(pending_v) | |
929 #assert not n or min(ins_costs) >= 0 | |
930 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 | |
931 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf | |
932 for i in range(m) for j in range(m)] | |
933 ).reshape(m, m) | |
934 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf | |
935 for i in range(n) for j in range(n)] | |
936 ).reshape(n, n) | |
937 Cv = make_CostMatrix(C, m, n) | |
938 #debug_print('Cv: {} x {}'.format(m, n)) | |
939 #debug_print(Cv.C) | |
940 | |
941 pending_g = list(G1.edges) | |
942 pending_h = list(G2.edges) | |
943 | |
944 # cost matrix of edge mappings | |
945 m = len(pending_g) | |
946 n = len(pending_h) | |
947 C = np.zeros((m + n, m + n)) | |
948 if edge_subst_cost: | |
949 C[0:m, 0:n] = np.array([edge_subst_cost(G1.edges[g], G2.edges[h]) | |
950 for g in pending_g for h in pending_h] | |
951 ).reshape(m, n) | |
952 elif edge_match: | |
953 C[0:m, 0:n] = np.array([1 - int(edge_match(G1.edges[g], G2.edges[h])) | |
954 for g in pending_g for h in pending_h] | |
955 ).reshape(m, n) | |
956 else: | |
957 # all zeroes | |
958 pass | |
959 #assert not min(m, n) or C[0:m, 0:n].min() >= 0 | |
960 if edge_del_cost: | |
961 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] | |
962 else: | |
963 del_costs = [1] * len(pending_g) | |
964 #assert not m or min(del_costs) >= 0 | |
965 if edge_ins_cost: | |
966 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] | |
967 else: | |
968 ins_costs = [1] * len(pending_h) | |
969 #assert not n or min(ins_costs) >= 0 | |
970 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 | |
971 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf | |
972 for i in range(m) for j in range(m)] | |
973 ).reshape(m, m) | |
974 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf | |
975 for i in range(n) for j in range(n)] | |
976 ).reshape(n, n) | |
977 Ce = make_CostMatrix(C, m, n) | |
978 #debug_print('Ce: {} x {}'.format(m, n)) | |
979 #debug_print(Ce.C) | |
980 #debug_print() | |
981 | |
982 class MaxCost: | |
983 def __init__(self): | |
984 # initial upper-bound estimate | |
985 # NOTE: should work for empty graph | |
986 self.value = Cv.C.sum() + Ce.C.sum() + 1 | |
987 maxcost = MaxCost() | |
988 | |
989 def prune(cost): | |
990 if upper_bound is not None: | |
991 if cost > upper_bound: | |
992 return True | |
993 if cost > maxcost.value: | |
994 return True | |
995 elif strictly_decreasing and cost >= maxcost.value: | |
996 return True | |
997 | |
998 # Now go! | |
999 | |
1000 for vertex_path, edge_path, cost in \ | |
1001 get_edit_paths([], pending_u, pending_v, Cv, | |
1002 [], pending_g, pending_h, Ce, 0): | |
1003 #assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None) | |
1004 #assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None) | |
1005 #assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None) | |
1006 #assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None) | |
1007 #print(vertex_path, edge_path, cost, file = sys.stderr) | |
1008 #assert cost == maxcost.value | |
1009 yield list(vertex_path), list(edge_path), cost | |
1010 | |
1011 | |
1012 def _is_close(d1, d2, atolerance=0, rtolerance=0): | |
1013 """Determines whether two adjacency matrices are within | |
1014 a provided tolerance. | |
1015 | |
1016 Parameters | |
1017 ---------- | |
1018 d1 : dict | |
1019 Adjacency dictionary | |
1020 | |
1021 d2 : dict | |
1022 Adjacency dictionary | |
1023 | |
1024 atolerance : float | |
1025 Some scalar tolerance value to determine closeness | |
1026 | |
1027 rtolerance : float | |
1028 A scalar tolerance value that will be some proportion | |
1029 of ``d2``'s value | |
1030 | |
1031 Returns | |
1032 ------- | |
1033 closeness : bool | |
1034 If all of the nodes within ``d1`` and ``d2`` are within | |
1035 a predefined tolerance, they are considered "close" and | |
1036 this method will return True. Otherwise, this method will | |
1037 return False. | |
1038 | |
1039 """ | |
1040 # Pre-condition: d1 and d2 have the same keys at each level if they | |
1041 # are dictionaries. | |
1042 if not isinstance(d1, dict) and not isinstance(d2, dict): | |
1043 return abs(d1 - d2) <= atolerance + rtolerance * abs(d2) | |
1044 return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1) | |
1045 | |
1046 | |
1047 def simrank_similarity(G, source=None, target=None, importance_factor=0.9, | |
1048 max_iterations=100, tolerance=1e-4): | |
1049 """Returns the SimRank similarity of nodes in the graph ``G``. | |
1050 | |
1051 SimRank is a similarity metric that says "two objects are considered | |
1052 to be similar if they are referenced by similar objects." [1]_. | |
1053 | |
1054 The pseudo-code definition from the paper is:: | |
1055 | |
1056 def simrank(G, u, v): | |
1057 in_neighbors_u = G.predecessors(u) | |
1058 in_neighbors_v = G.predecessors(v) | |
1059 scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) | |
1060 return scale * sum(simrank(G, w, x) | |
1061 for w, x in product(in_neighbors_u, | |
1062 in_neighbors_v)) | |
1063 | |
1064 where ``G`` is the graph, ``u`` is the source, ``v`` is the target, | |
1065 and ``C`` is a float decay or importance factor between 0 and 1. | |
1066 | |
1067 The SimRank algorithm for determining node similarity is defined in | |
1068 [2]_. | |
1069 | |
1070 Parameters | |
1071 ---------- | |
1072 G : NetworkX graph | |
1073 A NetworkX graph | |
1074 | |
1075 source : node | |
1076 If this is specified, the returned dictionary maps each node | |
1077 ``v`` in the graph to the similarity between ``source`` and | |
1078 ``v``. | |
1079 | |
1080 target : node | |
1081 If both ``source`` and ``target`` are specified, the similarity | |
1082 value between ``source`` and ``target`` is returned. If | |
1083 ``target`` is specified but ``source`` is not, this argument is | |
1084 ignored. | |
1085 | |
1086 importance_factor : float | |
1087 The relative importance of indirect neighbors with respect to | |
1088 direct neighbors. | |
1089 | |
1090 max_iterations : integer | |
1091 Maximum number of iterations. | |
1092 | |
1093 tolerance : float | |
1094 Error tolerance used to check convergence. When an iteration of | |
1095 the algorithm finds that no similarity value changes more than | |
1096 this amount, the algorithm halts. | |
1097 | |
1098 Returns | |
1099 ------- | |
1100 similarity : dictionary or float | |
1101 If ``source`` and ``target`` are both ``None``, this returns a | |
1102 dictionary of dictionaries, where keys are node pairs and value | |
1103 are similarity of the pair of nodes. | |
1104 | |
1105 If ``source`` is not ``None`` but ``target`` is, this returns a | |
1106 dictionary mapping node to the similarity of ``source`` and that | |
1107 node. | |
1108 | |
1109 If neither ``source`` nor ``target`` is ``None``, this returns | |
1110 the similarity value for the given pair of nodes. | |
1111 | |
1112 Examples | |
1113 -------- | |
1114 If the nodes of the graph are numbered from zero to *n - 1*, where *n* | |
1115 is the number of nodes in the graph, you can create a SimRank matrix | |
1116 from the return value of this function where the node numbers are | |
1117 the row and column indices of the matrix:: | |
1118 | |
1119 >>> import networkx as nx | |
1120 >>> from numpy import array | |
1121 >>> G = nx.cycle_graph(4) | |
1122 >>> sim = nx.simrank_similarity(G) | |
1123 >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)] | |
1124 >>> sim_array = array(lol) | |
1125 | |
1126 References | |
1127 ---------- | |
1128 .. [1] https://en.wikipedia.org/wiki/SimRank | |
1129 .. [2] G. Jeh and J. Widom. | |
1130 "SimRank: a measure of structural-context similarity", | |
1131 In KDD'02: Proceedings of the Eighth ACM SIGKDD | |
1132 International Conference on Knowledge Discovery and Data Mining, | |
1133 pp. 538--543. ACM Press, 2002. | |
1134 """ | |
1135 prevsim = None | |
1136 | |
1137 # build up our similarity adjacency dictionary output | |
1138 newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} | |
1139 | |
1140 # These functions compute the update to the similarity value of the nodes | |
1141 # `u` and `v` with respect to the previous similarity values. | |
1142 avg_sim = lambda s: sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 | |
1143 sim = lambda u, v: importance_factor * avg_sim(list(product(G[u], G[v]))) | |
1144 | |
1145 for _ in range(max_iterations): | |
1146 if prevsim and _is_close(prevsim, newsim, tolerance): | |
1147 break | |
1148 prevsim = newsim | |
1149 newsim = {u: {v: sim(u, v) if u is not v else 1 | |
1150 for v in newsim[u]} for u in newsim} | |
1151 | |
1152 if source is not None and target is not None: | |
1153 return newsim[source][target] | |
1154 if source is not None: | |
1155 return newsim[source] | |
1156 return newsim | |
1157 | |
1158 | |
1159 def simrank_similarity_numpy(G, source=None, target=None, importance_factor=0.9, | |
1160 max_iterations=100, tolerance=1e-4): | |
1161 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. | |
1162 | |
1163 The SimRank algorithm for determining node similarity is defined in | |
1164 [1]_. | |
1165 | |
1166 Parameters | |
1167 ---------- | |
1168 G : NetworkX graph | |
1169 A NetworkX graph | |
1170 | |
1171 source : node | |
1172 If this is specified, the returned dictionary maps each node | |
1173 ``v`` in the graph to the similarity between ``source`` and | |
1174 ``v``. | |
1175 | |
1176 target : node | |
1177 If both ``source`` and ``target`` are specified, the similarity | |
1178 value between ``source`` and ``target`` is returned. If | |
1179 ``target`` is specified but ``source`` is not, this argument is | |
1180 ignored. | |
1181 | |
1182 importance_factor : float | |
1183 The relative importance of indirect neighbors with respect to | |
1184 direct neighbors. | |
1185 | |
1186 max_iterations : integer | |
1187 Maximum number of iterations. | |
1188 | |
1189 tolerance : float | |
1190 Error tolerance used to check convergence. When an iteration of | |
1191 the algorithm finds that no similarity value changes more than | |
1192 this amount, the algorithm halts. | |
1193 | |
1194 Returns | |
1195 ------- | |
1196 similarity : dictionary or float | |
1197 If ``source`` and ``target`` are both ``None``, this returns a | |
1198 dictionary of dictionaries, where keys are node pairs and value | |
1199 are similarity of the pair of nodes. | |
1200 | |
1201 If ``source`` is not ``None`` but ``target`` is, this returns a | |
1202 dictionary mapping node to the similarity of ``source`` and that | |
1203 node. | |
1204 | |
1205 If neither ``source`` nor ``target`` is ``None``, this returns | |
1206 the similarity value for the given pair of nodes. | |
1207 | |
1208 Examples | |
1209 -------- | |
1210 >>> import networkx as nx | |
1211 >>> from numpy import array | |
1212 >>> G = nx.cycle_graph(4) | |
1213 >>> sim = nx.simrank_similarity_numpy(G) | |
1214 | |
1215 References | |
1216 ---------- | |
1217 .. [1] G. Jeh and J. Widom. | |
1218 "SimRank: a measure of structural-context similarity", | |
1219 In KDD'02: Proceedings of the Eighth ACM SIGKDD | |
1220 International Conference on Knowledge Discovery and Data Mining, | |
1221 pp. 538--543. ACM Press, 2002. | |
1222 """ | |
1223 # This algorithm follows roughly | |
1224 # | |
1225 # S = max{C * (A.T * S * A), I} | |
1226 # | |
1227 # where C is the importance factor, A is the column normalized | |
1228 # adjacency matrix, and I is the identity matrix. | |
1229 import numpy as np | |
1230 adjacency_matrix = nx.to_numpy_array(G) | |
1231 | |
1232 # column-normalize the ``adjacency_matrix`` | |
1233 adjacency_matrix /= adjacency_matrix.sum(axis=0) | |
1234 | |
1235 newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64) | |
1236 for _ in range(max_iterations): | |
1237 prevsim = np.copy(newsim) | |
1238 newsim = importance_factor * np.matmul( | |
1239 np.matmul(adjacency_matrix.T, prevsim), adjacency_matrix) | |
1240 np.fill_diagonal(newsim, 1.0) | |
1241 | |
1242 if np.allclose(prevsim, newsim, atol=tolerance): | |
1243 break | |
1244 | |
1245 if source is not None and target is not None: | |
1246 return newsim[source, target] | |
1247 if source is not None: | |
1248 return newsim[source] | |
1249 return newsim | |
1250 | |
1251 | |
1252 # fixture for pytest | |
1253 def setup_module(module): | |
1254 import pytest | |
1255 numpy = pytest.importorskip('numpy') | |
1256 scipy = pytest.importorskip('scipy') |