comparison planemo/lib/python3.7/site-packages/networkx/algorithms/similarity.py @ 1:56ad4e20f292 draft

"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author guerler
date Fri, 31 Jul 2020 00:32:28 -0400
parents
children
comparison
equal deleted inserted replaced
0:d30785e31577 1:56ad4e20f292
1 # -*- coding: utf-8 -*-
2 # Copyright (C) 2010 by
3 # Aric Hagberg <hagberg@lanl.gov>
4 # Dan Schult <dschult@colgate.edu>
5 # Pieter Swart <swart@lanl.gov>
6 # All rights reserved.
7 # BSD license.
8 #
9 # Author: Andrey Paramonov <paramon@acdlabs.ru>
10 """ Functions measuring similarity using graph edit distance.
11
12 The graph edit distance is the number of edge/node changes needed
13 to make two graphs isomorphic.
14
15 The default algorithm/implementation is sub-optimal for some graphs.
16 The problem of finding the exact Graph Edit Distance (GED) is NP-hard
17 so it is often slow. If the simple interface `graph_edit_distance`
18 takes too long for your graph, try `optimize_graph_edit_distance`
19 and/or `optimize_edit_paths`.
20
21 At the same time, I encourage capable people to investigate
22 alternative GED algorithms, in order to improve the choices available.
23 """
24 from itertools import product
25 import math
26 import networkx as nx
27 from operator import *
28 import sys
29
30 __author__ = 'Andrey Paramonov <paramon@acdlabs.ru>'
31
32 __all__ = [
33 'graph_edit_distance',
34 'optimal_edit_paths',
35 'optimize_graph_edit_distance',
36 'optimize_edit_paths',
37 'simrank_similarity',
38 'simrank_similarity_numpy',
39 ]
40
41
42 def debug_print(*args, **kwargs):
43 print(*args, **kwargs)
44
45
46 def graph_edit_distance(G1, G2, node_match=None, edge_match=None,
47 node_subst_cost=None, node_del_cost=None,
48 node_ins_cost=None,
49 edge_subst_cost=None, edge_del_cost=None,
50 edge_ins_cost=None,
51 upper_bound=None):
52 """Returns GED (graph edit distance) between graphs G1 and G2.
53
54 Graph edit distance is a graph similarity measure analogous to
55 Levenshtein distance for strings. It is defined as minimum cost
56 of edit path (sequence of node and edge edit operations)
57 transforming graph G1 to graph isomorphic to G2.
58
59 Parameters
60 ----------
61 G1, G2: graphs
62 The two graphs G1 and G2 must be of the same type.
63
64 node_match : callable
65 A function that returns True if node n1 in G1 and n2 in G2
66 should be considered equal during matching.
67
68 The function will be called like
69
70 node_match(G1.nodes[n1], G2.nodes[n2]).
71
72 That is, the function will receive the node attribute
73 dictionaries for n1 and n2 as inputs.
74
75 Ignored if node_subst_cost is specified. If neither
76 node_match nor node_subst_cost are specified then node
77 attributes are not considered.
78
79 edge_match : callable
80 A function that returns True if the edge attribute dictionaries
81 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
82 be considered equal during matching.
83
84 The function will be called like
85
86 edge_match(G1[u1][v1], G2[u2][v2]).
87
88 That is, the function will receive the edge attribute
89 dictionaries of the edges under consideration.
90
91 Ignored if edge_subst_cost is specified. If neither
92 edge_match nor edge_subst_cost are specified then edge
93 attributes are not considered.
94
95 node_subst_cost, node_del_cost, node_ins_cost : callable
96 Functions that return the costs of node substitution, node
97 deletion, and node insertion, respectively.
98
99 The functions will be called like
100
101 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
102 node_del_cost(G1.nodes[n1]),
103 node_ins_cost(G2.nodes[n2]).
104
105 That is, the functions will receive the node attribute
106 dictionaries as inputs. The functions are expected to return
107 positive numeric values.
108
109 Function node_subst_cost overrides node_match if specified.
110 If neither node_match nor node_subst_cost are specified then
111 default node substitution cost of 0 is used (node attributes
112 are not considered during matching).
113
114 If node_del_cost is not specified then default node deletion
115 cost of 1 is used. If node_ins_cost is not specified then
116 default node insertion cost of 1 is used.
117
118 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
119 Functions that return the costs of edge substitution, edge
120 deletion, and edge insertion, respectively.
121
122 The functions will be called like
123
124 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
125 edge_del_cost(G1[u1][v1]),
126 edge_ins_cost(G2[u2][v2]).
127
128 That is, the functions will receive the edge attribute
129 dictionaries as inputs. The functions are expected to return
130 positive numeric values.
131
132 Function edge_subst_cost overrides edge_match if specified.
133 If neither edge_match nor edge_subst_cost are specified then
134 default edge substitution cost of 0 is used (edge attributes
135 are not considered during matching).
136
137 If edge_del_cost is not specified then default edge deletion
138 cost of 1 is used. If edge_ins_cost is not specified then
139 default edge insertion cost of 1 is used.
140
141 upper_bound : numeric
142 Maximum edit distance to consider. Return None if no edit
143 distance under or equal to upper_bound exists.
144
145 Examples
146 --------
147 >>> G1 = nx.cycle_graph(6)
148 >>> G2 = nx.wheel_graph(7)
149 >>> nx.graph_edit_distance(G1, G2)
150 7.0
151
152 See Also
153 --------
154 optimal_edit_paths, optimize_graph_edit_distance,
155
156 is_isomorphic (test for graph edit distance of 0)
157
158 References
159 ----------
160 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
161 Martineau. An Exact Graph Edit Distance Algorithm for Solving
162 Pattern Recognition Problems. 4th International Conference on
163 Pattern Recognition Applications and Methods 2015, Jan 2015,
164 Lisbon, Portugal. 2015,
165 <10.5220/0005209202710278>. <hal-01168816>
166 https://hal.archives-ouvertes.fr/hal-01168816
167
168 """
169 bestcost = None
170 for vertex_path, edge_path, cost in \
171 optimize_edit_paths(G1, G2, node_match, edge_match,
172 node_subst_cost, node_del_cost, node_ins_cost,
173 edge_subst_cost, edge_del_cost, edge_ins_cost,
174 upper_bound, True):
175 #assert bestcost is None or cost < bestcost
176 bestcost = cost
177 return bestcost
178
179
180 def optimal_edit_paths(G1, G2, node_match=None, edge_match=None,
181 node_subst_cost=None, node_del_cost=None,
182 node_ins_cost=None,
183 edge_subst_cost=None, edge_del_cost=None,
184 edge_ins_cost=None,
185 upper_bound=None):
186 """Returns all minimum-cost edit paths transforming G1 to G2.
187
188 Graph edit path is a sequence of node and edge edit operations
189 transforming graph G1 to graph isomorphic to G2. Edit operations
190 include substitutions, deletions, and insertions.
191
192 Parameters
193 ----------
194 G1, G2: graphs
195 The two graphs G1 and G2 must be of the same type.
196
197 node_match : callable
198 A function that returns True if node n1 in G1 and n2 in G2
199 should be considered equal during matching.
200
201 The function will be called like
202
203 node_match(G1.nodes[n1], G2.nodes[n2]).
204
205 That is, the function will receive the node attribute
206 dictionaries for n1 and n2 as inputs.
207
208 Ignored if node_subst_cost is specified. If neither
209 node_match nor node_subst_cost are specified then node
210 attributes are not considered.
211
212 edge_match : callable
213 A function that returns True if the edge attribute dictionaries
214 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
215 be considered equal during matching.
216
217 The function will be called like
218
219 edge_match(G1[u1][v1], G2[u2][v2]).
220
221 That is, the function will receive the edge attribute
222 dictionaries of the edges under consideration.
223
224 Ignored if edge_subst_cost is specified. If neither
225 edge_match nor edge_subst_cost are specified then edge
226 attributes are not considered.
227
228 node_subst_cost, node_del_cost, node_ins_cost : callable
229 Functions that return the costs of node substitution, node
230 deletion, and node insertion, respectively.
231
232 The functions will be called like
233
234 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
235 node_del_cost(G1.nodes[n1]),
236 node_ins_cost(G2.nodes[n2]).
237
238 That is, the functions will receive the node attribute
239 dictionaries as inputs. The functions are expected to return
240 positive numeric values.
241
242 Function node_subst_cost overrides node_match if specified.
243 If neither node_match nor node_subst_cost are specified then
244 default node substitution cost of 0 is used (node attributes
245 are not considered during matching).
246
247 If node_del_cost is not specified then default node deletion
248 cost of 1 is used. If node_ins_cost is not specified then
249 default node insertion cost of 1 is used.
250
251 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
252 Functions that return the costs of edge substitution, edge
253 deletion, and edge insertion, respectively.
254
255 The functions will be called like
256
257 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
258 edge_del_cost(G1[u1][v1]),
259 edge_ins_cost(G2[u2][v2]).
260
261 That is, the functions will receive the edge attribute
262 dictionaries as inputs. The functions are expected to return
263 positive numeric values.
264
265 Function edge_subst_cost overrides edge_match if specified.
266 If neither edge_match nor edge_subst_cost are specified then
267 default edge substitution cost of 0 is used (edge attributes
268 are not considered during matching).
269
270 If edge_del_cost is not specified then default edge deletion
271 cost of 1 is used. If edge_ins_cost is not specified then
272 default edge insertion cost of 1 is used.
273
274 upper_bound : numeric
275 Maximum edit distance to consider.
276
277 Returns
278 -------
279 edit_paths : list of tuples (node_edit_path, edge_edit_path)
280 node_edit_path : list of tuples (u, v)
281 edge_edit_path : list of tuples ((u1, v1), (u2, v2))
282
283 cost : numeric
284 Optimal edit path cost (graph edit distance).
285
286 Examples
287 --------
288 >>> G1 = nx.cycle_graph(6)
289 >>> G2 = nx.wheel_graph(7)
290 >>> paths, cost = nx.optimal_edit_paths(G1, G2)
291 >>> len(paths)
292 84
293 >>> cost
294 7.0
295
296 See Also
297 --------
298 graph_edit_distance, optimize_edit_paths
299
300 References
301 ----------
302 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
303 Martineau. An Exact Graph Edit Distance Algorithm for Solving
304 Pattern Recognition Problems. 4th International Conference on
305 Pattern Recognition Applications and Methods 2015, Jan 2015,
306 Lisbon, Portugal. 2015,
307 <10.5220/0005209202710278>. <hal-01168816>
308 https://hal.archives-ouvertes.fr/hal-01168816
309
310 """
311 paths = list()
312 bestcost = None
313 for vertex_path, edge_path, cost in \
314 optimize_edit_paths(G1, G2, node_match, edge_match,
315 node_subst_cost, node_del_cost, node_ins_cost,
316 edge_subst_cost, edge_del_cost, edge_ins_cost,
317 upper_bound, False):
318 #assert bestcost is None or cost <= bestcost
319 if bestcost is not None and cost < bestcost:
320 paths = list()
321 paths.append((vertex_path, edge_path))
322 bestcost = cost
323 return paths, bestcost
324
325
326 def optimize_graph_edit_distance(G1, G2, node_match=None, edge_match=None,
327 node_subst_cost=None, node_del_cost=None,
328 node_ins_cost=None,
329 edge_subst_cost=None, edge_del_cost=None,
330 edge_ins_cost=None,
331 upper_bound=None):
332 """Returns consecutive approximations of GED (graph edit distance)
333 between graphs G1 and G2.
334
335 Graph edit distance is a graph similarity measure analogous to
336 Levenshtein distance for strings. It is defined as minimum cost
337 of edit path (sequence of node and edge edit operations)
338 transforming graph G1 to graph isomorphic to G2.
339
340 Parameters
341 ----------
342 G1, G2: graphs
343 The two graphs G1 and G2 must be of the same type.
344
345 node_match : callable
346 A function that returns True if node n1 in G1 and n2 in G2
347 should be considered equal during matching.
348
349 The function will be called like
350
351 node_match(G1.nodes[n1], G2.nodes[n2]).
352
353 That is, the function will receive the node attribute
354 dictionaries for n1 and n2 as inputs.
355
356 Ignored if node_subst_cost is specified. If neither
357 node_match nor node_subst_cost are specified then node
358 attributes are not considered.
359
360 edge_match : callable
361 A function that returns True if the edge attribute dictionaries
362 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
363 be considered equal during matching.
364
365 The function will be called like
366
367 edge_match(G1[u1][v1], G2[u2][v2]).
368
369 That is, the function will receive the edge attribute
370 dictionaries of the edges under consideration.
371
372 Ignored if edge_subst_cost is specified. If neither
373 edge_match nor edge_subst_cost are specified then edge
374 attributes are not considered.
375
376 node_subst_cost, node_del_cost, node_ins_cost : callable
377 Functions that return the costs of node substitution, node
378 deletion, and node insertion, respectively.
379
380 The functions will be called like
381
382 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
383 node_del_cost(G1.nodes[n1]),
384 node_ins_cost(G2.nodes[n2]).
385
386 That is, the functions will receive the node attribute
387 dictionaries as inputs. The functions are expected to return
388 positive numeric values.
389
390 Function node_subst_cost overrides node_match if specified.
391 If neither node_match nor node_subst_cost are specified then
392 default node substitution cost of 0 is used (node attributes
393 are not considered during matching).
394
395 If node_del_cost is not specified then default node deletion
396 cost of 1 is used. If node_ins_cost is not specified then
397 default node insertion cost of 1 is used.
398
399 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
400 Functions that return the costs of edge substitution, edge
401 deletion, and edge insertion, respectively.
402
403 The functions will be called like
404
405 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
406 edge_del_cost(G1[u1][v1]),
407 edge_ins_cost(G2[u2][v2]).
408
409 That is, the functions will receive the edge attribute
410 dictionaries as inputs. The functions are expected to return
411 positive numeric values.
412
413 Function edge_subst_cost overrides edge_match if specified.
414 If neither edge_match nor edge_subst_cost are specified then
415 default edge substitution cost of 0 is used (edge attributes
416 are not considered during matching).
417
418 If edge_del_cost is not specified then default edge deletion
419 cost of 1 is used. If edge_ins_cost is not specified then
420 default edge insertion cost of 1 is used.
421
422 upper_bound : numeric
423 Maximum edit distance to consider.
424
425 Returns
426 -------
427 Generator of consecutive approximations of graph edit distance.
428
429 Examples
430 --------
431 >>> G1 = nx.cycle_graph(6)
432 >>> G2 = nx.wheel_graph(7)
433 >>> for v in nx.optimize_graph_edit_distance(G1, G2):
434 ... minv = v
435 >>> minv
436 7.0
437
438 See Also
439 --------
440 graph_edit_distance, optimize_edit_paths
441
442 References
443 ----------
444 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
445 Martineau. An Exact Graph Edit Distance Algorithm for Solving
446 Pattern Recognition Problems. 4th International Conference on
447 Pattern Recognition Applications and Methods 2015, Jan 2015,
448 Lisbon, Portugal. 2015,
449 <10.5220/0005209202710278>. <hal-01168816>
450 https://hal.archives-ouvertes.fr/hal-01168816
451 """
452 for vertex_path, edge_path, cost in \
453 optimize_edit_paths(G1, G2, node_match, edge_match,
454 node_subst_cost, node_del_cost, node_ins_cost,
455 edge_subst_cost, edge_del_cost, edge_ins_cost,
456 upper_bound, True):
457 yield cost
458
459
460 def optimize_edit_paths(G1, G2, node_match=None, edge_match=None,
461 node_subst_cost=None, node_del_cost=None,
462 node_ins_cost=None,
463 edge_subst_cost=None, edge_del_cost=None,
464 edge_ins_cost=None,
465 upper_bound=None, strictly_decreasing=True):
466 """GED (graph edit distance) calculation: advanced interface.
467
468 Graph edit path is a sequence of node and edge edit operations
469 transforming graph G1 to graph isomorphic to G2. Edit operations
470 include substitutions, deletions, and insertions.
471
472 Graph edit distance is defined as minimum cost of edit path.
473
474 Parameters
475 ----------
476 G1, G2: graphs
477 The two graphs G1 and G2 must be of the same type.
478
479 node_match : callable
480 A function that returns True if node n1 in G1 and n2 in G2
481 should be considered equal during matching.
482
483 The function will be called like
484
485 node_match(G1.nodes[n1], G2.nodes[n2]).
486
487 That is, the function will receive the node attribute
488 dictionaries for n1 and n2 as inputs.
489
490 Ignored if node_subst_cost is specified. If neither
491 node_match nor node_subst_cost are specified then node
492 attributes are not considered.
493
494 edge_match : callable
495 A function that returns True if the edge attribute dictionaries
496 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
497 be considered equal during matching.
498
499 The function will be called like
500
501 edge_match(G1[u1][v1], G2[u2][v2]).
502
503 That is, the function will receive the edge attribute
504 dictionaries of the edges under consideration.
505
506 Ignored if edge_subst_cost is specified. If neither
507 edge_match nor edge_subst_cost are specified then edge
508 attributes are not considered.
509
510 node_subst_cost, node_del_cost, node_ins_cost : callable
511 Functions that return the costs of node substitution, node
512 deletion, and node insertion, respectively.
513
514 The functions will be called like
515
516 node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
517 node_del_cost(G1.nodes[n1]),
518 node_ins_cost(G2.nodes[n2]).
519
520 That is, the functions will receive the node attribute
521 dictionaries as inputs. The functions are expected to return
522 positive numeric values.
523
524 Function node_subst_cost overrides node_match if specified.
525 If neither node_match nor node_subst_cost are specified then
526 default node substitution cost of 0 is used (node attributes
527 are not considered during matching).
528
529 If node_del_cost is not specified then default node deletion
530 cost of 1 is used. If node_ins_cost is not specified then
531 default node insertion cost of 1 is used.
532
533 edge_subst_cost, edge_del_cost, edge_ins_cost : callable
534 Functions that return the costs of edge substitution, edge
535 deletion, and edge insertion, respectively.
536
537 The functions will be called like
538
539 edge_subst_cost(G1[u1][v1], G2[u2][v2]),
540 edge_del_cost(G1[u1][v1]),
541 edge_ins_cost(G2[u2][v2]).
542
543 That is, the functions will receive the edge attribute
544 dictionaries as inputs. The functions are expected to return
545 positive numeric values.
546
547 Function edge_subst_cost overrides edge_match if specified.
548 If neither edge_match nor edge_subst_cost are specified then
549 default edge substitution cost of 0 is used (edge attributes
550 are not considered during matching).
551
552 If edge_del_cost is not specified then default edge deletion
553 cost of 1 is used. If edge_ins_cost is not specified then
554 default edge insertion cost of 1 is used.
555
556 upper_bound : numeric
557 Maximum edit distance to consider.
558
559 strictly_decreasing : bool
560 If True, return consecutive approximations of strictly
561 decreasing cost. Otherwise, return all edit paths of cost
562 less than or equal to the previous minimum cost.
563
564 Returns
565 -------
566 Generator of tuples (node_edit_path, edge_edit_path, cost)
567 node_edit_path : list of tuples (u, v)
568 edge_edit_path : list of tuples ((u1, v1), (u2, v2))
569 cost : numeric
570
571 See Also
572 --------
573 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths
574
575 References
576 ----------
577 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
578 Martineau. An Exact Graph Edit Distance Algorithm for Solving
579 Pattern Recognition Problems. 4th International Conference on
580 Pattern Recognition Applications and Methods 2015, Jan 2015,
581 Lisbon, Portugal. 2015,
582 <10.5220/0005209202710278>. <hal-01168816>
583 https://hal.archives-ouvertes.fr/hal-01168816
584
585 """
586 # TODO: support DiGraph
587
588 import numpy as np
589 from scipy.optimize import linear_sum_assignment
590
591 class CostMatrix:
592 def __init__(self, C, lsa_row_ind, lsa_col_ind, ls):
593 #assert C.shape[0] == len(lsa_row_ind)
594 #assert C.shape[1] == len(lsa_col_ind)
595 #assert len(lsa_row_ind) == len(lsa_col_ind)
596 #assert set(lsa_row_ind) == set(range(len(lsa_row_ind)))
597 #assert set(lsa_col_ind) == set(range(len(lsa_col_ind)))
598 #assert ls == C[lsa_row_ind, lsa_col_ind].sum()
599 self.C = C
600 self.lsa_row_ind = lsa_row_ind
601 self.lsa_col_ind = lsa_col_ind
602 self.ls = ls
603
604 def make_CostMatrix(C, m, n):
605 #assert(C.shape == (m + n, m + n))
606 lsa_row_ind, lsa_col_ind = linear_sum_assignment(C)
607
608 # Fixup dummy assignments:
609 # each substitution i<->j should have dummy assignment m+j<->n+i
610 # NOTE: fast reduce of Cv relies on it
611 #assert len(lsa_row_ind) == len(lsa_col_ind)
612 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
613 subst_ind = list(k for k, i, j in indexes if i < m and j < n)
614 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
615 dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n)
616 #assert len(subst_ind) == len(dummy_ind)
617 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
618 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n
619
620 return CostMatrix(C, lsa_row_ind, lsa_col_ind,
621 C[lsa_row_ind, lsa_col_ind].sum())
622
623 def extract_C(C, i, j, m, n):
624 #assert(C.shape == (m + n, m + n))
625 row_ind = [k in i or k - m in j for k in range(m + n)]
626 col_ind = [k in j or k - n in i for k in range(m + n)]
627 return C[row_ind, :][:, col_ind]
628
629 def reduce_C(C, i, j, m, n):
630 #assert(C.shape == (m + n, m + n))
631 row_ind = [k not in i and k - m not in j for k in range(m + n)]
632 col_ind = [k not in j and k - n not in i for k in range(m + n)]
633 return C[row_ind, :][:, col_ind]
634
635 def reduce_ind(ind, i):
636 #assert set(ind) == set(range(len(ind)))
637 rind = ind[[k not in i for k in ind]]
638 for k in set(i):
639 rind[rind >= k] -= 1
640 return rind
641
642 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]):
643 """
644 Parameters:
645 u, v: matched vertices, u=None or v=None for
646 deletion/insertion
647 pending_g, pending_h: lists of edges not yet mapped
648 Ce: CostMatrix of pending edge mappings
649 matched_uv: partial vertex edit path
650 list of tuples (u, v) of previously matched vertex
651 mappings u<->v, u=None or v=None for
652 deletion/insertion
653
654 Returns:
655 list of (i, j): indices of edge mappings g<->h
656 localCe: local CostMatrix of edge mappings
657 (basically submatrix of Ce at cross of rows i, cols j)
658 """
659 M = len(pending_g)
660 N = len(pending_h)
661 #assert Ce.C.shape == (M + N, M + N)
662
663 g_ind = [i for i in range(M) if pending_g[i][:2] == (u, u) or
664 any(pending_g[i][:2] in ((p, u), (u, p))
665 for p, q in matched_uv)]
666 h_ind = [j for j in range(N) if pending_h[j][:2] == (v, v) or
667 any(pending_h[j][:2] in ((q, v), (v, q))
668 for p, q in matched_uv)]
669 m = len(g_ind)
670 n = len(h_ind)
671
672 if m or n:
673 C = extract_C(Ce.C, g_ind, h_ind, M, N)
674 #assert C.shape == (m + n, m + n)
675
676 # Forbid structurally invalid matches
677 # NOTE: inf remembered from Ce construction
678 for k, i in zip(range(m), g_ind):
679 g = pending_g[i][:2]
680 for l, j in zip(range(n), h_ind):
681 h = pending_h[j][:2]
682 if nx.is_directed(G1) or nx.is_directed(G2):
683 if any(g == (p, u) and h == (q, v) or
684 g == (u, p) and h == (v, q)
685 for p, q in matched_uv):
686 continue
687 else:
688 if any(g in ((p, u), (u, p)) and h in ((q, v), (v, q))
689 for p, q in matched_uv):
690 continue
691 if g == (u, u):
692 continue
693 if h == (v, v):
694 continue
695 C[k, l] = inf
696
697 localCe = make_CostMatrix(C, m, n)
698 ij = list((g_ind[k] if k < m else M + h_ind[l],
699 h_ind[l] if l < n else N + g_ind[k])
700 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
701 if k < m or l < n)
702
703 else:
704 ij = []
705 localCe = CostMatrix(np.empty((0, 0)), [], [], 0)
706
707 return ij, localCe
708
709 def reduce_Ce(Ce, ij, m, n):
710 if len(ij):
711 i, j = zip(*ij)
712 m_i = m - sum(1 for t in i if t < m)
713 n_j = n - sum(1 for t in j if t < n)
714 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
715 else:
716 return Ce
717
718 def get_edit_ops(matched_uv, pending_u, pending_v, Cv,
719 pending_g, pending_h, Ce, matched_cost):
720 """
721 Parameters:
722 matched_uv: partial vertex edit path
723 list of tuples (u, v) of vertex mappings u<->v,
724 u=None or v=None for deletion/insertion
725 pending_u, pending_v: lists of vertices not yet mapped
726 Cv: CostMatrix of pending vertex mappings
727 pending_g, pending_h: lists of edges not yet mapped
728 Ce: CostMatrix of pending edge mappings
729 matched_cost: cost of partial edit path
730
731 Returns:
732 sequence of
733 (i, j): indices of vertex mapping u<->v
734 Cv_ij: reduced CostMatrix of pending vertex mappings
735 (basically Cv with row i, col j removed)
736 list of (x, y): indices of edge mappings g<->h
737 Ce_xy: reduced CostMatrix of pending edge mappings
738 (basically Ce with rows x, cols y removed)
739 cost: total cost of edit operation
740 NOTE: most promising ops first
741 """
742 m = len(pending_u)
743 n = len(pending_v)
744 #assert Cv.C.shape == (m + n, m + n)
745
746 # 1) a vertex mapping from optimal linear sum assignment
747 i, j = min((k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind)
748 if k < m or l < n)
749 xy, localCe = match_edges(pending_u[i] if i < m else None,
750 pending_v[j] if j < n else None,
751 pending_g, pending_h, Ce, matched_uv)
752 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
753 #assert Ce.ls <= localCe.ls + Ce_xy.ls
754 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
755 pass
756 else:
757 # get reduced Cv efficiently
758 Cv_ij = CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n),
759 reduce_ind(Cv.lsa_row_ind, (i, m + j)),
760 reduce_ind(Cv.lsa_col_ind, (j, n + i)),
761 Cv.ls - Cv.C[i, j])
762 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls
763
764 # 2) other candidates, sorted by lower-bound cost estimate
765 other = list()
766 fixed_i, fixed_j = i, j
767 if m <= n:
768 candidates = ((t, fixed_j) for t in range(m + n)
769 if t != fixed_i and (t < m or t == m + fixed_j))
770 else:
771 candidates = ((fixed_i, t) for t in range(m + n)
772 if t != fixed_j and (t < n or t == n + fixed_i))
773 for i, j in candidates:
774 if prune(matched_cost + Cv.C[i, j] + Ce.ls):
775 continue
776 Cv_ij = make_CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n),
777 m - 1 if i < m else m,
778 n - 1 if j < n else n)
779 #assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
780 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
781 continue
782 xy, localCe = match_edges(pending_u[i] if i < m else None,
783 pending_v[j] if j < n else None,
784 pending_g, pending_h, Ce, matched_uv)
785 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
786 continue
787 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
788 #assert Ce.ls <= localCe.ls + Ce_xy.ls
789 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls +
790 Ce_xy.ls):
791 continue
792 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))
793
794 # yield from
795 for t in sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls):
796 yield t
797
798 def get_edit_paths(matched_uv, pending_u, pending_v, Cv,
799 matched_gh, pending_g, pending_h, Ce, matched_cost):
800 """
801 Parameters:
802 matched_uv: partial vertex edit path
803 list of tuples (u, v) of vertex mappings u<->v,
804 u=None or v=None for deletion/insertion
805 pending_u, pending_v: lists of vertices not yet mapped
806 Cv: CostMatrix of pending vertex mappings
807 matched_gh: partial edge edit path
808 list of tuples (g, h) of edge mappings g<->h,
809 g=None or h=None for deletion/insertion
810 pending_g, pending_h: lists of edges not yet mapped
811 Ce: CostMatrix of pending edge mappings
812 matched_cost: cost of partial edit path
813
814 Returns:
815 sequence of (vertex_path, edge_path, cost)
816 vertex_path: complete vertex edit path
817 list of tuples (u, v) of vertex mappings u<->v,
818 u=None or v=None for deletion/insertion
819 edge_path: complete edge edit path
820 list of tuples (g, h) of edge mappings g<->h,
821 g=None or h=None for deletion/insertion
822 cost: total cost of edit path
823 NOTE: path costs are non-increasing
824 """
825 #debug_print('matched-uv:', matched_uv)
826 #debug_print('matched-gh:', matched_gh)
827 #debug_print('matched-cost:', matched_cost)
828 #debug_print('pending-u:', pending_u)
829 #debug_print('pending-v:', pending_v)
830 #debug_print(Cv.C)
831 #assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
832 #assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
833 #debug_print('pending-g:', pending_g)
834 #debug_print('pending-h:', pending_h)
835 #debug_print(Ce.C)
836 #assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
837 #assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
838 #debug_print()
839
840 if prune(matched_cost + Cv.ls + Ce.ls):
841 return
842
843 if not max(len(pending_u), len(pending_v)):
844 #assert not len(pending_g)
845 #assert not len(pending_h)
846 # path completed!
847 #assert matched_cost <= maxcost.value
848 maxcost.value = min(maxcost.value, matched_cost)
849 yield matched_uv, matched_gh, matched_cost
850
851 else:
852 edit_ops = get_edit_ops(matched_uv, pending_u, pending_v, Cv,
853 pending_g, pending_h, Ce, matched_cost)
854 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
855 i, j = ij
856 #assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
857 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
858 continue
859
860 # dive deeper
861 u = pending_u.pop(i) if i < len(pending_u) else None
862 v = pending_v.pop(j) if j < len(pending_v) else None
863 matched_uv.append((u, v))
864 for x, y in xy:
865 len_g = len(pending_g)
866 len_h = len(pending_h)
867 matched_gh.append((pending_g[x] if x < len_g else None,
868 pending_h[y] if y < len_h else None))
869 sortedx = list(sorted(x for x, y in xy))
870 sortedy = list(sorted(y for x, y in xy))
871 G = list((pending_g.pop(x) if x < len(pending_g) else None)
872 for x in reversed(sortedx))
873 H = list((pending_h.pop(y) if y < len(pending_h) else None)
874 for y in reversed(sortedy))
875
876 # yield from
877 for t in get_edit_paths(matched_uv, pending_u, pending_v,
878 Cv_ij,
879 matched_gh, pending_g, pending_h,
880 Ce_xy,
881 matched_cost + edit_cost):
882 yield t
883
884 # backtrack
885 if u is not None:
886 pending_u.insert(i, u)
887 if v is not None:
888 pending_v.insert(j, v)
889 matched_uv.pop()
890 for x, g in zip(sortedx, reversed(G)):
891 if g is not None:
892 pending_g.insert(x, g)
893 for y, h in zip(sortedy, reversed(H)):
894 if h is not None:
895 pending_h.insert(y, h)
896 for t in xy:
897 matched_gh.pop()
898
899 # Initialization
900
901 pending_u = list(G1.nodes)
902 pending_v = list(G2.nodes)
903
904 # cost matrix of vertex mappings
905 m = len(pending_u)
906 n = len(pending_v)
907 C = np.zeros((m + n, m + n))
908 if node_subst_cost:
909 C[0:m, 0:n] = np.array([node_subst_cost(G1.nodes[u], G2.nodes[v])
910 for u in pending_u for v in pending_v]
911 ).reshape(m, n)
912 elif node_match:
913 C[0:m, 0:n] = np.array([1 - int(node_match(G1.nodes[u], G2.nodes[v]))
914 for u in pending_u for v in pending_v]
915 ).reshape(m, n)
916 else:
917 # all zeroes
918 pass
919 #assert not min(m, n) or C[0:m, 0:n].min() >= 0
920 if node_del_cost:
921 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
922 else:
923 del_costs = [1] * len(pending_u)
924 #assert not m or min(del_costs) >= 0
925 if node_ins_cost:
926 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
927 else:
928 ins_costs = [1] * len(pending_v)
929 #assert not n or min(ins_costs) >= 0
930 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
931 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf
932 for i in range(m) for j in range(m)]
933 ).reshape(m, m)
934 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf
935 for i in range(n) for j in range(n)]
936 ).reshape(n, n)
937 Cv = make_CostMatrix(C, m, n)
938 #debug_print('Cv: {} x {}'.format(m, n))
939 #debug_print(Cv.C)
940
941 pending_g = list(G1.edges)
942 pending_h = list(G2.edges)
943
944 # cost matrix of edge mappings
945 m = len(pending_g)
946 n = len(pending_h)
947 C = np.zeros((m + n, m + n))
948 if edge_subst_cost:
949 C[0:m, 0:n] = np.array([edge_subst_cost(G1.edges[g], G2.edges[h])
950 for g in pending_g for h in pending_h]
951 ).reshape(m, n)
952 elif edge_match:
953 C[0:m, 0:n] = np.array([1 - int(edge_match(G1.edges[g], G2.edges[h]))
954 for g in pending_g for h in pending_h]
955 ).reshape(m, n)
956 else:
957 # all zeroes
958 pass
959 #assert not min(m, n) or C[0:m, 0:n].min() >= 0
960 if edge_del_cost:
961 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
962 else:
963 del_costs = [1] * len(pending_g)
964 #assert not m or min(del_costs) >= 0
965 if edge_ins_cost:
966 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
967 else:
968 ins_costs = [1] * len(pending_h)
969 #assert not n or min(ins_costs) >= 0
970 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
971 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf
972 for i in range(m) for j in range(m)]
973 ).reshape(m, m)
974 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf
975 for i in range(n) for j in range(n)]
976 ).reshape(n, n)
977 Ce = make_CostMatrix(C, m, n)
978 #debug_print('Ce: {} x {}'.format(m, n))
979 #debug_print(Ce.C)
980 #debug_print()
981
982 class MaxCost:
983 def __init__(self):
984 # initial upper-bound estimate
985 # NOTE: should work for empty graph
986 self.value = Cv.C.sum() + Ce.C.sum() + 1
987 maxcost = MaxCost()
988
989 def prune(cost):
990 if upper_bound is not None:
991 if cost > upper_bound:
992 return True
993 if cost > maxcost.value:
994 return True
995 elif strictly_decreasing and cost >= maxcost.value:
996 return True
997
998 # Now go!
999
1000 for vertex_path, edge_path, cost in \
1001 get_edit_paths([], pending_u, pending_v, Cv,
1002 [], pending_g, pending_h, Ce, 0):
1003 #assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
1004 #assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
1005 #assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
1006 #assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
1007 #print(vertex_path, edge_path, cost, file = sys.stderr)
1008 #assert cost == maxcost.value
1009 yield list(vertex_path), list(edge_path), cost
1010
1011
1012 def _is_close(d1, d2, atolerance=0, rtolerance=0):
1013 """Determines whether two adjacency matrices are within
1014 a provided tolerance.
1015
1016 Parameters
1017 ----------
1018 d1 : dict
1019 Adjacency dictionary
1020
1021 d2 : dict
1022 Adjacency dictionary
1023
1024 atolerance : float
1025 Some scalar tolerance value to determine closeness
1026
1027 rtolerance : float
1028 A scalar tolerance value that will be some proportion
1029 of ``d2``'s value
1030
1031 Returns
1032 -------
1033 closeness : bool
1034 If all of the nodes within ``d1`` and ``d2`` are within
1035 a predefined tolerance, they are considered "close" and
1036 this method will return True. Otherwise, this method will
1037 return False.
1038
1039 """
1040 # Pre-condition: d1 and d2 have the same keys at each level if they
1041 # are dictionaries.
1042 if not isinstance(d1, dict) and not isinstance(d2, dict):
1043 return abs(d1 - d2) <= atolerance + rtolerance * abs(d2)
1044 return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1)
1045
1046
1047 def simrank_similarity(G, source=None, target=None, importance_factor=0.9,
1048 max_iterations=100, tolerance=1e-4):
1049 """Returns the SimRank similarity of nodes in the graph ``G``.
1050
1051 SimRank is a similarity metric that says "two objects are considered
1052 to be similar if they are referenced by similar objects." [1]_.
1053
1054 The pseudo-code definition from the paper is::
1055
1056 def simrank(G, u, v):
1057 in_neighbors_u = G.predecessors(u)
1058 in_neighbors_v = G.predecessors(v)
1059 scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
1060 return scale * sum(simrank(G, w, x)
1061 for w, x in product(in_neighbors_u,
1062 in_neighbors_v))
1063
1064 where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
1065 and ``C`` is a float decay or importance factor between 0 and 1.
1066
1067 The SimRank algorithm for determining node similarity is defined in
1068 [2]_.
1069
1070 Parameters
1071 ----------
1072 G : NetworkX graph
1073 A NetworkX graph
1074
1075 source : node
1076 If this is specified, the returned dictionary maps each node
1077 ``v`` in the graph to the similarity between ``source`` and
1078 ``v``.
1079
1080 target : node
1081 If both ``source`` and ``target`` are specified, the similarity
1082 value between ``source`` and ``target`` is returned. If
1083 ``target`` is specified but ``source`` is not, this argument is
1084 ignored.
1085
1086 importance_factor : float
1087 The relative importance of indirect neighbors with respect to
1088 direct neighbors.
1089
1090 max_iterations : integer
1091 Maximum number of iterations.
1092
1093 tolerance : float
1094 Error tolerance used to check convergence. When an iteration of
1095 the algorithm finds that no similarity value changes more than
1096 this amount, the algorithm halts.
1097
1098 Returns
1099 -------
1100 similarity : dictionary or float
1101 If ``source`` and ``target`` are both ``None``, this returns a
1102 dictionary of dictionaries, where keys are node pairs and value
1103 are similarity of the pair of nodes.
1104
1105 If ``source`` is not ``None`` but ``target`` is, this returns a
1106 dictionary mapping node to the similarity of ``source`` and that
1107 node.
1108
1109 If neither ``source`` nor ``target`` is ``None``, this returns
1110 the similarity value for the given pair of nodes.
1111
1112 Examples
1113 --------
1114 If the nodes of the graph are numbered from zero to *n - 1*, where *n*
1115 is the number of nodes in the graph, you can create a SimRank matrix
1116 from the return value of this function where the node numbers are
1117 the row and column indices of the matrix::
1118
1119 >>> import networkx as nx
1120 >>> from numpy import array
1121 >>> G = nx.cycle_graph(4)
1122 >>> sim = nx.simrank_similarity(G)
1123 >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)]
1124 >>> sim_array = array(lol)
1125
1126 References
1127 ----------
1128 .. [1] https://en.wikipedia.org/wiki/SimRank
1129 .. [2] G. Jeh and J. Widom.
1130 "SimRank: a measure of structural-context similarity",
1131 In KDD'02: Proceedings of the Eighth ACM SIGKDD
1132 International Conference on Knowledge Discovery and Data Mining,
1133 pp. 538--543. ACM Press, 2002.
1134 """
1135 prevsim = None
1136
1137 # build up our similarity adjacency dictionary output
1138 newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}
1139
1140 # These functions compute the update to the similarity value of the nodes
1141 # `u` and `v` with respect to the previous similarity values.
1142 avg_sim = lambda s: sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0
1143 sim = lambda u, v: importance_factor * avg_sim(list(product(G[u], G[v])))
1144
1145 for _ in range(max_iterations):
1146 if prevsim and _is_close(prevsim, newsim, tolerance):
1147 break
1148 prevsim = newsim
1149 newsim = {u: {v: sim(u, v) if u is not v else 1
1150 for v in newsim[u]} for u in newsim}
1151
1152 if source is not None and target is not None:
1153 return newsim[source][target]
1154 if source is not None:
1155 return newsim[source]
1156 return newsim
1157
1158
1159 def simrank_similarity_numpy(G, source=None, target=None, importance_factor=0.9,
1160 max_iterations=100, tolerance=1e-4):
1161 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.
1162
1163 The SimRank algorithm for determining node similarity is defined in
1164 [1]_.
1165
1166 Parameters
1167 ----------
1168 G : NetworkX graph
1169 A NetworkX graph
1170
1171 source : node
1172 If this is specified, the returned dictionary maps each node
1173 ``v`` in the graph to the similarity between ``source`` and
1174 ``v``.
1175
1176 target : node
1177 If both ``source`` and ``target`` are specified, the similarity
1178 value between ``source`` and ``target`` is returned. If
1179 ``target`` is specified but ``source`` is not, this argument is
1180 ignored.
1181
1182 importance_factor : float
1183 The relative importance of indirect neighbors with respect to
1184 direct neighbors.
1185
1186 max_iterations : integer
1187 Maximum number of iterations.
1188
1189 tolerance : float
1190 Error tolerance used to check convergence. When an iteration of
1191 the algorithm finds that no similarity value changes more than
1192 this amount, the algorithm halts.
1193
1194 Returns
1195 -------
1196 similarity : dictionary or float
1197 If ``source`` and ``target`` are both ``None``, this returns a
1198 dictionary of dictionaries, where keys are node pairs and value
1199 are similarity of the pair of nodes.
1200
1201 If ``source`` is not ``None`` but ``target`` is, this returns a
1202 dictionary mapping node to the similarity of ``source`` and that
1203 node.
1204
1205 If neither ``source`` nor ``target`` is ``None``, this returns
1206 the similarity value for the given pair of nodes.
1207
1208 Examples
1209 --------
1210 >>> import networkx as nx
1211 >>> from numpy import array
1212 >>> G = nx.cycle_graph(4)
1213 >>> sim = nx.simrank_similarity_numpy(G)
1214
1215 References
1216 ----------
1217 .. [1] G. Jeh and J. Widom.
1218 "SimRank: a measure of structural-context similarity",
1219 In KDD'02: Proceedings of the Eighth ACM SIGKDD
1220 International Conference on Knowledge Discovery and Data Mining,
1221 pp. 538--543. ACM Press, 2002.
1222 """
1223 # This algorithm follows roughly
1224 #
1225 # S = max{C * (A.T * S * A), I}
1226 #
1227 # where C is the importance factor, A is the column normalized
1228 # adjacency matrix, and I is the identity matrix.
1229 import numpy as np
1230 adjacency_matrix = nx.to_numpy_array(G)
1231
1232 # column-normalize the ``adjacency_matrix``
1233 adjacency_matrix /= adjacency_matrix.sum(axis=0)
1234
1235 newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64)
1236 for _ in range(max_iterations):
1237 prevsim = np.copy(newsim)
1238 newsim = importance_factor * np.matmul(
1239 np.matmul(adjacency_matrix.T, prevsim), adjacency_matrix)
1240 np.fill_diagonal(newsim, 1.0)
1241
1242 if np.allclose(prevsim, newsim, atol=tolerance):
1243 break
1244
1245 if source is not None and target is not None:
1246 return newsim[source, target]
1247 if source is not None:
1248 return newsim[source]
1249 return newsim
1250
1251
1252 # fixture for pytest
1253 def setup_module(module):
1254 import pytest
1255 numpy = pytest.importorskip('numpy')
1256 scipy = pytest.importorskip('scipy')