Mercurial > repos > guerler > springsuite
comparison planemo/lib/python3.7/site-packages/networkx/algorithms/similarity.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
| author | guerler | 
|---|---|
| date | Fri, 31 Jul 2020 00:32:28 -0400 | 
| parents | |
| children | 
   comparison
  equal
  deleted
  inserted
  replaced
| 0:d30785e31577 | 1:56ad4e20f292 | 
|---|---|
| 1 # -*- coding: utf-8 -*- | |
| 2 # Copyright (C) 2010 by | |
| 3 # Aric Hagberg <hagberg@lanl.gov> | |
| 4 # Dan Schult <dschult@colgate.edu> | |
| 5 # Pieter Swart <swart@lanl.gov> | |
| 6 # All rights reserved. | |
| 7 # BSD license. | |
| 8 # | |
| 9 # Author: Andrey Paramonov <paramon@acdlabs.ru> | |
| 10 """ Functions measuring similarity using graph edit distance. | |
| 11 | |
| 12 The graph edit distance is the number of edge/node changes needed | |
| 13 to make two graphs isomorphic. | |
| 14 | |
| 15 The default algorithm/implementation is sub-optimal for some graphs. | |
| 16 The problem of finding the exact Graph Edit Distance (GED) is NP-hard | |
| 17 so it is often slow. If the simple interface `graph_edit_distance` | |
| 18 takes too long for your graph, try `optimize_graph_edit_distance` | |
| 19 and/or `optimize_edit_paths`. | |
| 20 | |
| 21 At the same time, I encourage capable people to investigate | |
| 22 alternative GED algorithms, in order to improve the choices available. | |
| 23 """ | |
| 24 from itertools import product | |
| 25 import math | |
| 26 import networkx as nx | |
| 27 from operator import * | |
| 28 import sys | |
| 29 | |
| 30 __author__ = 'Andrey Paramonov <paramon@acdlabs.ru>' | |
| 31 | |
| 32 __all__ = [ | |
| 33 'graph_edit_distance', | |
| 34 'optimal_edit_paths', | |
| 35 'optimize_graph_edit_distance', | |
| 36 'optimize_edit_paths', | |
| 37 'simrank_similarity', | |
| 38 'simrank_similarity_numpy', | |
| 39 ] | |
| 40 | |
| 41 | |
| 42 def debug_print(*args, **kwargs): | |
| 43 print(*args, **kwargs) | |
| 44 | |
| 45 | |
| 46 def graph_edit_distance(G1, G2, node_match=None, edge_match=None, | |
| 47 node_subst_cost=None, node_del_cost=None, | |
| 48 node_ins_cost=None, | |
| 49 edge_subst_cost=None, edge_del_cost=None, | |
| 50 edge_ins_cost=None, | |
| 51 upper_bound=None): | |
| 52 """Returns GED (graph edit distance) between graphs G1 and G2. | |
| 53 | |
| 54 Graph edit distance is a graph similarity measure analogous to | |
| 55 Levenshtein distance for strings. It is defined as minimum cost | |
| 56 of edit path (sequence of node and edge edit operations) | |
| 57 transforming graph G1 to graph isomorphic to G2. | |
| 58 | |
| 59 Parameters | |
| 60 ---------- | |
| 61 G1, G2: graphs | |
| 62 The two graphs G1 and G2 must be of the same type. | |
| 63 | |
| 64 node_match : callable | |
| 65 A function that returns True if node n1 in G1 and n2 in G2 | |
| 66 should be considered equal during matching. | |
| 67 | |
| 68 The function will be called like | |
| 69 | |
| 70 node_match(G1.nodes[n1], G2.nodes[n2]). | |
| 71 | |
| 72 That is, the function will receive the node attribute | |
| 73 dictionaries for n1 and n2 as inputs. | |
| 74 | |
| 75 Ignored if node_subst_cost is specified. If neither | |
| 76 node_match nor node_subst_cost are specified then node | |
| 77 attributes are not considered. | |
| 78 | |
| 79 edge_match : callable | |
| 80 A function that returns True if the edge attribute dictionaries | |
| 81 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
| 82 be considered equal during matching. | |
| 83 | |
| 84 The function will be called like | |
| 85 | |
| 86 edge_match(G1[u1][v1], G2[u2][v2]). | |
| 87 | |
| 88 That is, the function will receive the edge attribute | |
| 89 dictionaries of the edges under consideration. | |
| 90 | |
| 91 Ignored if edge_subst_cost is specified. If neither | |
| 92 edge_match nor edge_subst_cost are specified then edge | |
| 93 attributes are not considered. | |
| 94 | |
| 95 node_subst_cost, node_del_cost, node_ins_cost : callable | |
| 96 Functions that return the costs of node substitution, node | |
| 97 deletion, and node insertion, respectively. | |
| 98 | |
| 99 The functions will be called like | |
| 100 | |
| 101 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
| 102 node_del_cost(G1.nodes[n1]), | |
| 103 node_ins_cost(G2.nodes[n2]). | |
| 104 | |
| 105 That is, the functions will receive the node attribute | |
| 106 dictionaries as inputs. The functions are expected to return | |
| 107 positive numeric values. | |
| 108 | |
| 109 Function node_subst_cost overrides node_match if specified. | |
| 110 If neither node_match nor node_subst_cost are specified then | |
| 111 default node substitution cost of 0 is used (node attributes | |
| 112 are not considered during matching). | |
| 113 | |
| 114 If node_del_cost is not specified then default node deletion | |
| 115 cost of 1 is used. If node_ins_cost is not specified then | |
| 116 default node insertion cost of 1 is used. | |
| 117 | |
| 118 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
| 119 Functions that return the costs of edge substitution, edge | |
| 120 deletion, and edge insertion, respectively. | |
| 121 | |
| 122 The functions will be called like | |
| 123 | |
| 124 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
| 125 edge_del_cost(G1[u1][v1]), | |
| 126 edge_ins_cost(G2[u2][v2]). | |
| 127 | |
| 128 That is, the functions will receive the edge attribute | |
| 129 dictionaries as inputs. The functions are expected to return | |
| 130 positive numeric values. | |
| 131 | |
| 132 Function edge_subst_cost overrides edge_match if specified. | |
| 133 If neither edge_match nor edge_subst_cost are specified then | |
| 134 default edge substitution cost of 0 is used (edge attributes | |
| 135 are not considered during matching). | |
| 136 | |
| 137 If edge_del_cost is not specified then default edge deletion | |
| 138 cost of 1 is used. If edge_ins_cost is not specified then | |
| 139 default edge insertion cost of 1 is used. | |
| 140 | |
| 141 upper_bound : numeric | |
| 142 Maximum edit distance to consider. Return None if no edit | |
| 143 distance under or equal to upper_bound exists. | |
| 144 | |
| 145 Examples | |
| 146 -------- | |
| 147 >>> G1 = nx.cycle_graph(6) | |
| 148 >>> G2 = nx.wheel_graph(7) | |
| 149 >>> nx.graph_edit_distance(G1, G2) | |
| 150 7.0 | |
| 151 | |
| 152 See Also | |
| 153 -------- | |
| 154 optimal_edit_paths, optimize_graph_edit_distance, | |
| 155 | |
| 156 is_isomorphic (test for graph edit distance of 0) | |
| 157 | |
| 158 References | |
| 159 ---------- | |
| 160 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
| 161 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
| 162 Pattern Recognition Problems. 4th International Conference on | |
| 163 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
| 164 Lisbon, Portugal. 2015, | |
| 165 <10.5220/0005209202710278>. <hal-01168816> | |
| 166 https://hal.archives-ouvertes.fr/hal-01168816 | |
| 167 | |
| 168 """ | |
| 169 bestcost = None | |
| 170 for vertex_path, edge_path, cost in \ | |
| 171 optimize_edit_paths(G1, G2, node_match, edge_match, | |
| 172 node_subst_cost, node_del_cost, node_ins_cost, | |
| 173 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
| 174 upper_bound, True): | |
| 175 #assert bestcost is None or cost < bestcost | |
| 176 bestcost = cost | |
| 177 return bestcost | |
| 178 | |
| 179 | |
| 180 def optimal_edit_paths(G1, G2, node_match=None, edge_match=None, | |
| 181 node_subst_cost=None, node_del_cost=None, | |
| 182 node_ins_cost=None, | |
| 183 edge_subst_cost=None, edge_del_cost=None, | |
| 184 edge_ins_cost=None, | |
| 185 upper_bound=None): | |
| 186 """Returns all minimum-cost edit paths transforming G1 to G2. | |
| 187 | |
| 188 Graph edit path is a sequence of node and edge edit operations | |
| 189 transforming graph G1 to graph isomorphic to G2. Edit operations | |
| 190 include substitutions, deletions, and insertions. | |
| 191 | |
| 192 Parameters | |
| 193 ---------- | |
| 194 G1, G2: graphs | |
| 195 The two graphs G1 and G2 must be of the same type. | |
| 196 | |
| 197 node_match : callable | |
| 198 A function that returns True if node n1 in G1 and n2 in G2 | |
| 199 should be considered equal during matching. | |
| 200 | |
| 201 The function will be called like | |
| 202 | |
| 203 node_match(G1.nodes[n1], G2.nodes[n2]). | |
| 204 | |
| 205 That is, the function will receive the node attribute | |
| 206 dictionaries for n1 and n2 as inputs. | |
| 207 | |
| 208 Ignored if node_subst_cost is specified. If neither | |
| 209 node_match nor node_subst_cost are specified then node | |
| 210 attributes are not considered. | |
| 211 | |
| 212 edge_match : callable | |
| 213 A function that returns True if the edge attribute dictionaries | |
| 214 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
| 215 be considered equal during matching. | |
| 216 | |
| 217 The function will be called like | |
| 218 | |
| 219 edge_match(G1[u1][v1], G2[u2][v2]). | |
| 220 | |
| 221 That is, the function will receive the edge attribute | |
| 222 dictionaries of the edges under consideration. | |
| 223 | |
| 224 Ignored if edge_subst_cost is specified. If neither | |
| 225 edge_match nor edge_subst_cost are specified then edge | |
| 226 attributes are not considered. | |
| 227 | |
| 228 node_subst_cost, node_del_cost, node_ins_cost : callable | |
| 229 Functions that return the costs of node substitution, node | |
| 230 deletion, and node insertion, respectively. | |
| 231 | |
| 232 The functions will be called like | |
| 233 | |
| 234 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
| 235 node_del_cost(G1.nodes[n1]), | |
| 236 node_ins_cost(G2.nodes[n2]). | |
| 237 | |
| 238 That is, the functions will receive the node attribute | |
| 239 dictionaries as inputs. The functions are expected to return | |
| 240 positive numeric values. | |
| 241 | |
| 242 Function node_subst_cost overrides node_match if specified. | |
| 243 If neither node_match nor node_subst_cost are specified then | |
| 244 default node substitution cost of 0 is used (node attributes | |
| 245 are not considered during matching). | |
| 246 | |
| 247 If node_del_cost is not specified then default node deletion | |
| 248 cost of 1 is used. If node_ins_cost is not specified then | |
| 249 default node insertion cost of 1 is used. | |
| 250 | |
| 251 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
| 252 Functions that return the costs of edge substitution, edge | |
| 253 deletion, and edge insertion, respectively. | |
| 254 | |
| 255 The functions will be called like | |
| 256 | |
| 257 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
| 258 edge_del_cost(G1[u1][v1]), | |
| 259 edge_ins_cost(G2[u2][v2]). | |
| 260 | |
| 261 That is, the functions will receive the edge attribute | |
| 262 dictionaries as inputs. The functions are expected to return | |
| 263 positive numeric values. | |
| 264 | |
| 265 Function edge_subst_cost overrides edge_match if specified. | |
| 266 If neither edge_match nor edge_subst_cost are specified then | |
| 267 default edge substitution cost of 0 is used (edge attributes | |
| 268 are not considered during matching). | |
| 269 | |
| 270 If edge_del_cost is not specified then default edge deletion | |
| 271 cost of 1 is used. If edge_ins_cost is not specified then | |
| 272 default edge insertion cost of 1 is used. | |
| 273 | |
| 274 upper_bound : numeric | |
| 275 Maximum edit distance to consider. | |
| 276 | |
| 277 Returns | |
| 278 ------- | |
| 279 edit_paths : list of tuples (node_edit_path, edge_edit_path) | |
| 280 node_edit_path : list of tuples (u, v) | |
| 281 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) | |
| 282 | |
| 283 cost : numeric | |
| 284 Optimal edit path cost (graph edit distance). | |
| 285 | |
| 286 Examples | |
| 287 -------- | |
| 288 >>> G1 = nx.cycle_graph(6) | |
| 289 >>> G2 = nx.wheel_graph(7) | |
| 290 >>> paths, cost = nx.optimal_edit_paths(G1, G2) | |
| 291 >>> len(paths) | |
| 292 84 | |
| 293 >>> cost | |
| 294 7.0 | |
| 295 | |
| 296 See Also | |
| 297 -------- | |
| 298 graph_edit_distance, optimize_edit_paths | |
| 299 | |
| 300 References | |
| 301 ---------- | |
| 302 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
| 303 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
| 304 Pattern Recognition Problems. 4th International Conference on | |
| 305 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
| 306 Lisbon, Portugal. 2015, | |
| 307 <10.5220/0005209202710278>. <hal-01168816> | |
| 308 https://hal.archives-ouvertes.fr/hal-01168816 | |
| 309 | |
| 310 """ | |
| 311 paths = list() | |
| 312 bestcost = None | |
| 313 for vertex_path, edge_path, cost in \ | |
| 314 optimize_edit_paths(G1, G2, node_match, edge_match, | |
| 315 node_subst_cost, node_del_cost, node_ins_cost, | |
| 316 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
| 317 upper_bound, False): | |
| 318 #assert bestcost is None or cost <= bestcost | |
| 319 if bestcost is not None and cost < bestcost: | |
| 320 paths = list() | |
| 321 paths.append((vertex_path, edge_path)) | |
| 322 bestcost = cost | |
| 323 return paths, bestcost | |
| 324 | |
| 325 | |
| 326 def optimize_graph_edit_distance(G1, G2, node_match=None, edge_match=None, | |
| 327 node_subst_cost=None, node_del_cost=None, | |
| 328 node_ins_cost=None, | |
| 329 edge_subst_cost=None, edge_del_cost=None, | |
| 330 edge_ins_cost=None, | |
| 331 upper_bound=None): | |
| 332 """Returns consecutive approximations of GED (graph edit distance) | |
| 333 between graphs G1 and G2. | |
| 334 | |
| 335 Graph edit distance is a graph similarity measure analogous to | |
| 336 Levenshtein distance for strings. It is defined as minimum cost | |
| 337 of edit path (sequence of node and edge edit operations) | |
| 338 transforming graph G1 to graph isomorphic to G2. | |
| 339 | |
| 340 Parameters | |
| 341 ---------- | |
| 342 G1, G2: graphs | |
| 343 The two graphs G1 and G2 must be of the same type. | |
| 344 | |
| 345 node_match : callable | |
| 346 A function that returns True if node n1 in G1 and n2 in G2 | |
| 347 should be considered equal during matching. | |
| 348 | |
| 349 The function will be called like | |
| 350 | |
| 351 node_match(G1.nodes[n1], G2.nodes[n2]). | |
| 352 | |
| 353 That is, the function will receive the node attribute | |
| 354 dictionaries for n1 and n2 as inputs. | |
| 355 | |
| 356 Ignored if node_subst_cost is specified. If neither | |
| 357 node_match nor node_subst_cost are specified then node | |
| 358 attributes are not considered. | |
| 359 | |
| 360 edge_match : callable | |
| 361 A function that returns True if the edge attribute dictionaries | |
| 362 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
| 363 be considered equal during matching. | |
| 364 | |
| 365 The function will be called like | |
| 366 | |
| 367 edge_match(G1[u1][v1], G2[u2][v2]). | |
| 368 | |
| 369 That is, the function will receive the edge attribute | |
| 370 dictionaries of the edges under consideration. | |
| 371 | |
| 372 Ignored if edge_subst_cost is specified. If neither | |
| 373 edge_match nor edge_subst_cost are specified then edge | |
| 374 attributes are not considered. | |
| 375 | |
| 376 node_subst_cost, node_del_cost, node_ins_cost : callable | |
| 377 Functions that return the costs of node substitution, node | |
| 378 deletion, and node insertion, respectively. | |
| 379 | |
| 380 The functions will be called like | |
| 381 | |
| 382 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
| 383 node_del_cost(G1.nodes[n1]), | |
| 384 node_ins_cost(G2.nodes[n2]). | |
| 385 | |
| 386 That is, the functions will receive the node attribute | |
| 387 dictionaries as inputs. The functions are expected to return | |
| 388 positive numeric values. | |
| 389 | |
| 390 Function node_subst_cost overrides node_match if specified. | |
| 391 If neither node_match nor node_subst_cost are specified then | |
| 392 default node substitution cost of 0 is used (node attributes | |
| 393 are not considered during matching). | |
| 394 | |
| 395 If node_del_cost is not specified then default node deletion | |
| 396 cost of 1 is used. If node_ins_cost is not specified then | |
| 397 default node insertion cost of 1 is used. | |
| 398 | |
| 399 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
| 400 Functions that return the costs of edge substitution, edge | |
| 401 deletion, and edge insertion, respectively. | |
| 402 | |
| 403 The functions will be called like | |
| 404 | |
| 405 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
| 406 edge_del_cost(G1[u1][v1]), | |
| 407 edge_ins_cost(G2[u2][v2]). | |
| 408 | |
| 409 That is, the functions will receive the edge attribute | |
| 410 dictionaries as inputs. The functions are expected to return | |
| 411 positive numeric values. | |
| 412 | |
| 413 Function edge_subst_cost overrides edge_match if specified. | |
| 414 If neither edge_match nor edge_subst_cost are specified then | |
| 415 default edge substitution cost of 0 is used (edge attributes | |
| 416 are not considered during matching). | |
| 417 | |
| 418 If edge_del_cost is not specified then default edge deletion | |
| 419 cost of 1 is used. If edge_ins_cost is not specified then | |
| 420 default edge insertion cost of 1 is used. | |
| 421 | |
| 422 upper_bound : numeric | |
| 423 Maximum edit distance to consider. | |
| 424 | |
| 425 Returns | |
| 426 ------- | |
| 427 Generator of consecutive approximations of graph edit distance. | |
| 428 | |
| 429 Examples | |
| 430 -------- | |
| 431 >>> G1 = nx.cycle_graph(6) | |
| 432 >>> G2 = nx.wheel_graph(7) | |
| 433 >>> for v in nx.optimize_graph_edit_distance(G1, G2): | |
| 434 ... minv = v | |
| 435 >>> minv | |
| 436 7.0 | |
| 437 | |
| 438 See Also | |
| 439 -------- | |
| 440 graph_edit_distance, optimize_edit_paths | |
| 441 | |
| 442 References | |
| 443 ---------- | |
| 444 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
| 445 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
| 446 Pattern Recognition Problems. 4th International Conference on | |
| 447 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
| 448 Lisbon, Portugal. 2015, | |
| 449 <10.5220/0005209202710278>. <hal-01168816> | |
| 450 https://hal.archives-ouvertes.fr/hal-01168816 | |
| 451 """ | |
| 452 for vertex_path, edge_path, cost in \ | |
| 453 optimize_edit_paths(G1, G2, node_match, edge_match, | |
| 454 node_subst_cost, node_del_cost, node_ins_cost, | |
| 455 edge_subst_cost, edge_del_cost, edge_ins_cost, | |
| 456 upper_bound, True): | |
| 457 yield cost | |
| 458 | |
| 459 | |
| 460 def optimize_edit_paths(G1, G2, node_match=None, edge_match=None, | |
| 461 node_subst_cost=None, node_del_cost=None, | |
| 462 node_ins_cost=None, | |
| 463 edge_subst_cost=None, edge_del_cost=None, | |
| 464 edge_ins_cost=None, | |
| 465 upper_bound=None, strictly_decreasing=True): | |
| 466 """GED (graph edit distance) calculation: advanced interface. | |
| 467 | |
| 468 Graph edit path is a sequence of node and edge edit operations | |
| 469 transforming graph G1 to graph isomorphic to G2. Edit operations | |
| 470 include substitutions, deletions, and insertions. | |
| 471 | |
| 472 Graph edit distance is defined as minimum cost of edit path. | |
| 473 | |
| 474 Parameters | |
| 475 ---------- | |
| 476 G1, G2: graphs | |
| 477 The two graphs G1 and G2 must be of the same type. | |
| 478 | |
| 479 node_match : callable | |
| 480 A function that returns True if node n1 in G1 and n2 in G2 | |
| 481 should be considered equal during matching. | |
| 482 | |
| 483 The function will be called like | |
| 484 | |
| 485 node_match(G1.nodes[n1], G2.nodes[n2]). | |
| 486 | |
| 487 That is, the function will receive the node attribute | |
| 488 dictionaries for n1 and n2 as inputs. | |
| 489 | |
| 490 Ignored if node_subst_cost is specified. If neither | |
| 491 node_match nor node_subst_cost are specified then node | |
| 492 attributes are not considered. | |
| 493 | |
| 494 edge_match : callable | |
| 495 A function that returns True if the edge attribute dictionaries | |
| 496 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should | |
| 497 be considered equal during matching. | |
| 498 | |
| 499 The function will be called like | |
| 500 | |
| 501 edge_match(G1[u1][v1], G2[u2][v2]). | |
| 502 | |
| 503 That is, the function will receive the edge attribute | |
| 504 dictionaries of the edges under consideration. | |
| 505 | |
| 506 Ignored if edge_subst_cost is specified. If neither | |
| 507 edge_match nor edge_subst_cost are specified then edge | |
| 508 attributes are not considered. | |
| 509 | |
| 510 node_subst_cost, node_del_cost, node_ins_cost : callable | |
| 511 Functions that return the costs of node substitution, node | |
| 512 deletion, and node insertion, respectively. | |
| 513 | |
| 514 The functions will be called like | |
| 515 | |
| 516 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), | |
| 517 node_del_cost(G1.nodes[n1]), | |
| 518 node_ins_cost(G2.nodes[n2]). | |
| 519 | |
| 520 That is, the functions will receive the node attribute | |
| 521 dictionaries as inputs. The functions are expected to return | |
| 522 positive numeric values. | |
| 523 | |
| 524 Function node_subst_cost overrides node_match if specified. | |
| 525 If neither node_match nor node_subst_cost are specified then | |
| 526 default node substitution cost of 0 is used (node attributes | |
| 527 are not considered during matching). | |
| 528 | |
| 529 If node_del_cost is not specified then default node deletion | |
| 530 cost of 1 is used. If node_ins_cost is not specified then | |
| 531 default node insertion cost of 1 is used. | |
| 532 | |
| 533 edge_subst_cost, edge_del_cost, edge_ins_cost : callable | |
| 534 Functions that return the costs of edge substitution, edge | |
| 535 deletion, and edge insertion, respectively. | |
| 536 | |
| 537 The functions will be called like | |
| 538 | |
| 539 edge_subst_cost(G1[u1][v1], G2[u2][v2]), | |
| 540 edge_del_cost(G1[u1][v1]), | |
| 541 edge_ins_cost(G2[u2][v2]). | |
| 542 | |
| 543 That is, the functions will receive the edge attribute | |
| 544 dictionaries as inputs. The functions are expected to return | |
| 545 positive numeric values. | |
| 546 | |
| 547 Function edge_subst_cost overrides edge_match if specified. | |
| 548 If neither edge_match nor edge_subst_cost are specified then | |
| 549 default edge substitution cost of 0 is used (edge attributes | |
| 550 are not considered during matching). | |
| 551 | |
| 552 If edge_del_cost is not specified then default edge deletion | |
| 553 cost of 1 is used. If edge_ins_cost is not specified then | |
| 554 default edge insertion cost of 1 is used. | |
| 555 | |
| 556 upper_bound : numeric | |
| 557 Maximum edit distance to consider. | |
| 558 | |
| 559 strictly_decreasing : bool | |
| 560 If True, return consecutive approximations of strictly | |
| 561 decreasing cost. Otherwise, return all edit paths of cost | |
| 562 less than or equal to the previous minimum cost. | |
| 563 | |
| 564 Returns | |
| 565 ------- | |
| 566 Generator of tuples (node_edit_path, edge_edit_path, cost) | |
| 567 node_edit_path : list of tuples (u, v) | |
| 568 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) | |
| 569 cost : numeric | |
| 570 | |
| 571 See Also | |
| 572 -------- | |
| 573 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths | |
| 574 | |
| 575 References | |
| 576 ---------- | |
| 577 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick | |
| 578 Martineau. An Exact Graph Edit Distance Algorithm for Solving | |
| 579 Pattern Recognition Problems. 4th International Conference on | |
| 580 Pattern Recognition Applications and Methods 2015, Jan 2015, | |
| 581 Lisbon, Portugal. 2015, | |
| 582 <10.5220/0005209202710278>. <hal-01168816> | |
| 583 https://hal.archives-ouvertes.fr/hal-01168816 | |
| 584 | |
| 585 """ | |
| 586 # TODO: support DiGraph | |
| 587 | |
| 588 import numpy as np | |
| 589 from scipy.optimize import linear_sum_assignment | |
| 590 | |
| 591 class CostMatrix: | |
| 592 def __init__(self, C, lsa_row_ind, lsa_col_ind, ls): | |
| 593 #assert C.shape[0] == len(lsa_row_ind) | |
| 594 #assert C.shape[1] == len(lsa_col_ind) | |
| 595 #assert len(lsa_row_ind) == len(lsa_col_ind) | |
| 596 #assert set(lsa_row_ind) == set(range(len(lsa_row_ind))) | |
| 597 #assert set(lsa_col_ind) == set(range(len(lsa_col_ind))) | |
| 598 #assert ls == C[lsa_row_ind, lsa_col_ind].sum() | |
| 599 self.C = C | |
| 600 self.lsa_row_ind = lsa_row_ind | |
| 601 self.lsa_col_ind = lsa_col_ind | |
| 602 self.ls = ls | |
| 603 | |
| 604 def make_CostMatrix(C, m, n): | |
| 605 #assert(C.shape == (m + n, m + n)) | |
| 606 lsa_row_ind, lsa_col_ind = linear_sum_assignment(C) | |
| 607 | |
| 608 # Fixup dummy assignments: | |
| 609 # each substitution i<->j should have dummy assignment m+j<->n+i | |
| 610 # NOTE: fast reduce of Cv relies on it | |
| 611 #assert len(lsa_row_ind) == len(lsa_col_ind) | |
| 612 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) | |
| 613 subst_ind = list(k for k, i, j in indexes if i < m and j < n) | |
| 614 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) | |
| 615 dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n) | |
| 616 #assert len(subst_ind) == len(dummy_ind) | |
| 617 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m | |
| 618 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n | |
| 619 | |
| 620 return CostMatrix(C, lsa_row_ind, lsa_col_ind, | |
| 621 C[lsa_row_ind, lsa_col_ind].sum()) | |
| 622 | |
| 623 def extract_C(C, i, j, m, n): | |
| 624 #assert(C.shape == (m + n, m + n)) | |
| 625 row_ind = [k in i or k - m in j for k in range(m + n)] | |
| 626 col_ind = [k in j or k - n in i for k in range(m + n)] | |
| 627 return C[row_ind, :][:, col_ind] | |
| 628 | |
| 629 def reduce_C(C, i, j, m, n): | |
| 630 #assert(C.shape == (m + n, m + n)) | |
| 631 row_ind = [k not in i and k - m not in j for k in range(m + n)] | |
| 632 col_ind = [k not in j and k - n not in i for k in range(m + n)] | |
| 633 return C[row_ind, :][:, col_ind] | |
| 634 | |
| 635 def reduce_ind(ind, i): | |
| 636 #assert set(ind) == set(range(len(ind))) | |
| 637 rind = ind[[k not in i for k in ind]] | |
| 638 for k in set(i): | |
| 639 rind[rind >= k] -= 1 | |
| 640 return rind | |
| 641 | |
| 642 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]): | |
| 643 """ | |
| 644 Parameters: | |
| 645 u, v: matched vertices, u=None or v=None for | |
| 646 deletion/insertion | |
| 647 pending_g, pending_h: lists of edges not yet mapped | |
| 648 Ce: CostMatrix of pending edge mappings | |
| 649 matched_uv: partial vertex edit path | |
| 650 list of tuples (u, v) of previously matched vertex | |
| 651 mappings u<->v, u=None or v=None for | |
| 652 deletion/insertion | |
| 653 | |
| 654 Returns: | |
| 655 list of (i, j): indices of edge mappings g<->h | |
| 656 localCe: local CostMatrix of edge mappings | |
| 657 (basically submatrix of Ce at cross of rows i, cols j) | |
| 658 """ | |
| 659 M = len(pending_g) | |
| 660 N = len(pending_h) | |
| 661 #assert Ce.C.shape == (M + N, M + N) | |
| 662 | |
| 663 g_ind = [i for i in range(M) if pending_g[i][:2] == (u, u) or | |
| 664 any(pending_g[i][:2] in ((p, u), (u, p)) | |
| 665 for p, q in matched_uv)] | |
| 666 h_ind = [j for j in range(N) if pending_h[j][:2] == (v, v) or | |
| 667 any(pending_h[j][:2] in ((q, v), (v, q)) | |
| 668 for p, q in matched_uv)] | |
| 669 m = len(g_ind) | |
| 670 n = len(h_ind) | |
| 671 | |
| 672 if m or n: | |
| 673 C = extract_C(Ce.C, g_ind, h_ind, M, N) | |
| 674 #assert C.shape == (m + n, m + n) | |
| 675 | |
| 676 # Forbid structurally invalid matches | |
| 677 # NOTE: inf remembered from Ce construction | |
| 678 for k, i in zip(range(m), g_ind): | |
| 679 g = pending_g[i][:2] | |
| 680 for l, j in zip(range(n), h_ind): | |
| 681 h = pending_h[j][:2] | |
| 682 if nx.is_directed(G1) or nx.is_directed(G2): | |
| 683 if any(g == (p, u) and h == (q, v) or | |
| 684 g == (u, p) and h == (v, q) | |
| 685 for p, q in matched_uv): | |
| 686 continue | |
| 687 else: | |
| 688 if any(g in ((p, u), (u, p)) and h in ((q, v), (v, q)) | |
| 689 for p, q in matched_uv): | |
| 690 continue | |
| 691 if g == (u, u): | |
| 692 continue | |
| 693 if h == (v, v): | |
| 694 continue | |
| 695 C[k, l] = inf | |
| 696 | |
| 697 localCe = make_CostMatrix(C, m, n) | |
| 698 ij = list((g_ind[k] if k < m else M + h_ind[l], | |
| 699 h_ind[l] if l < n else N + g_ind[k]) | |
| 700 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) | |
| 701 if k < m or l < n) | |
| 702 | |
| 703 else: | |
| 704 ij = [] | |
| 705 localCe = CostMatrix(np.empty((0, 0)), [], [], 0) | |
| 706 | |
| 707 return ij, localCe | |
| 708 | |
| 709 def reduce_Ce(Ce, ij, m, n): | |
| 710 if len(ij): | |
| 711 i, j = zip(*ij) | |
| 712 m_i = m - sum(1 for t in i if t < m) | |
| 713 n_j = n - sum(1 for t in j if t < n) | |
| 714 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) | |
| 715 else: | |
| 716 return Ce | |
| 717 | |
| 718 def get_edit_ops(matched_uv, pending_u, pending_v, Cv, | |
| 719 pending_g, pending_h, Ce, matched_cost): | |
| 720 """ | |
| 721 Parameters: | |
| 722 matched_uv: partial vertex edit path | |
| 723 list of tuples (u, v) of vertex mappings u<->v, | |
| 724 u=None or v=None for deletion/insertion | |
| 725 pending_u, pending_v: lists of vertices not yet mapped | |
| 726 Cv: CostMatrix of pending vertex mappings | |
| 727 pending_g, pending_h: lists of edges not yet mapped | |
| 728 Ce: CostMatrix of pending edge mappings | |
| 729 matched_cost: cost of partial edit path | |
| 730 | |
| 731 Returns: | |
| 732 sequence of | |
| 733 (i, j): indices of vertex mapping u<->v | |
| 734 Cv_ij: reduced CostMatrix of pending vertex mappings | |
| 735 (basically Cv with row i, col j removed) | |
| 736 list of (x, y): indices of edge mappings g<->h | |
| 737 Ce_xy: reduced CostMatrix of pending edge mappings | |
| 738 (basically Ce with rows x, cols y removed) | |
| 739 cost: total cost of edit operation | |
| 740 NOTE: most promising ops first | |
| 741 """ | |
| 742 m = len(pending_u) | |
| 743 n = len(pending_v) | |
| 744 #assert Cv.C.shape == (m + n, m + n) | |
| 745 | |
| 746 # 1) a vertex mapping from optimal linear sum assignment | |
| 747 i, j = min((k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) | |
| 748 if k < m or l < n) | |
| 749 xy, localCe = match_edges(pending_u[i] if i < m else None, | |
| 750 pending_v[j] if j < n else None, | |
| 751 pending_g, pending_h, Ce, matched_uv) | |
| 752 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) | |
| 753 #assert Ce.ls <= localCe.ls + Ce_xy.ls | |
| 754 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): | |
| 755 pass | |
| 756 else: | |
| 757 # get reduced Cv efficiently | |
| 758 Cv_ij = CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), | |
| 759 reduce_ind(Cv.lsa_row_ind, (i, m + j)), | |
| 760 reduce_ind(Cv.lsa_col_ind, (j, n + i)), | |
| 761 Cv.ls - Cv.C[i, j]) | |
| 762 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls | |
| 763 | |
| 764 # 2) other candidates, sorted by lower-bound cost estimate | |
| 765 other = list() | |
| 766 fixed_i, fixed_j = i, j | |
| 767 if m <= n: | |
| 768 candidates = ((t, fixed_j) for t in range(m + n) | |
| 769 if t != fixed_i and (t < m or t == m + fixed_j)) | |
| 770 else: | |
| 771 candidates = ((fixed_i, t) for t in range(m + n) | |
| 772 if t != fixed_j and (t < n or t == n + fixed_i)) | |
| 773 for i, j in candidates: | |
| 774 if prune(matched_cost + Cv.C[i, j] + Ce.ls): | |
| 775 continue | |
| 776 Cv_ij = make_CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), | |
| 777 m - 1 if i < m else m, | |
| 778 n - 1 if j < n else n) | |
| 779 #assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls | |
| 780 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): | |
| 781 continue | |
| 782 xy, localCe = match_edges(pending_u[i] if i < m else None, | |
| 783 pending_v[j] if j < n else None, | |
| 784 pending_g, pending_h, Ce, matched_uv) | |
| 785 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls): | |
| 786 continue | |
| 787 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) | |
| 788 #assert Ce.ls <= localCe.ls + Ce_xy.ls | |
| 789 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + | |
| 790 Ce_xy.ls): | |
| 791 continue | |
| 792 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) | |
| 793 | |
| 794 # yield from | |
| 795 for t in sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls): | |
| 796 yield t | |
| 797 | |
| 798 def get_edit_paths(matched_uv, pending_u, pending_v, Cv, | |
| 799 matched_gh, pending_g, pending_h, Ce, matched_cost): | |
| 800 """ | |
| 801 Parameters: | |
| 802 matched_uv: partial vertex edit path | |
| 803 list of tuples (u, v) of vertex mappings u<->v, | |
| 804 u=None or v=None for deletion/insertion | |
| 805 pending_u, pending_v: lists of vertices not yet mapped | |
| 806 Cv: CostMatrix of pending vertex mappings | |
| 807 matched_gh: partial edge edit path | |
| 808 list of tuples (g, h) of edge mappings g<->h, | |
| 809 g=None or h=None for deletion/insertion | |
| 810 pending_g, pending_h: lists of edges not yet mapped | |
| 811 Ce: CostMatrix of pending edge mappings | |
| 812 matched_cost: cost of partial edit path | |
| 813 | |
| 814 Returns: | |
| 815 sequence of (vertex_path, edge_path, cost) | |
| 816 vertex_path: complete vertex edit path | |
| 817 list of tuples (u, v) of vertex mappings u<->v, | |
| 818 u=None or v=None for deletion/insertion | |
| 819 edge_path: complete edge edit path | |
| 820 list of tuples (g, h) of edge mappings g<->h, | |
| 821 g=None or h=None for deletion/insertion | |
| 822 cost: total cost of edit path | |
| 823 NOTE: path costs are non-increasing | |
| 824 """ | |
| 825 #debug_print('matched-uv:', matched_uv) | |
| 826 #debug_print('matched-gh:', matched_gh) | |
| 827 #debug_print('matched-cost:', matched_cost) | |
| 828 #debug_print('pending-u:', pending_u) | |
| 829 #debug_print('pending-v:', pending_v) | |
| 830 #debug_print(Cv.C) | |
| 831 #assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u)) | |
| 832 #assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v)) | |
| 833 #debug_print('pending-g:', pending_g) | |
| 834 #debug_print('pending-h:', pending_h) | |
| 835 #debug_print(Ce.C) | |
| 836 #assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g)) | |
| 837 #assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h)) | |
| 838 #debug_print() | |
| 839 | |
| 840 if prune(matched_cost + Cv.ls + Ce.ls): | |
| 841 return | |
| 842 | |
| 843 if not max(len(pending_u), len(pending_v)): | |
| 844 #assert not len(pending_g) | |
| 845 #assert not len(pending_h) | |
| 846 # path completed! | |
| 847 #assert matched_cost <= maxcost.value | |
| 848 maxcost.value = min(maxcost.value, matched_cost) | |
| 849 yield matched_uv, matched_gh, matched_cost | |
| 850 | |
| 851 else: | |
| 852 edit_ops = get_edit_ops(matched_uv, pending_u, pending_v, Cv, | |
| 853 pending_g, pending_h, Ce, matched_cost) | |
| 854 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops: | |
| 855 i, j = ij | |
| 856 #assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost | |
| 857 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls): | |
| 858 continue | |
| 859 | |
| 860 # dive deeper | |
| 861 u = pending_u.pop(i) if i < len(pending_u) else None | |
| 862 v = pending_v.pop(j) if j < len(pending_v) else None | |
| 863 matched_uv.append((u, v)) | |
| 864 for x, y in xy: | |
| 865 len_g = len(pending_g) | |
| 866 len_h = len(pending_h) | |
| 867 matched_gh.append((pending_g[x] if x < len_g else None, | |
| 868 pending_h[y] if y < len_h else None)) | |
| 869 sortedx = list(sorted(x for x, y in xy)) | |
| 870 sortedy = list(sorted(y for x, y in xy)) | |
| 871 G = list((pending_g.pop(x) if x < len(pending_g) else None) | |
| 872 for x in reversed(sortedx)) | |
| 873 H = list((pending_h.pop(y) if y < len(pending_h) else None) | |
| 874 for y in reversed(sortedy)) | |
| 875 | |
| 876 # yield from | |
| 877 for t in get_edit_paths(matched_uv, pending_u, pending_v, | |
| 878 Cv_ij, | |
| 879 matched_gh, pending_g, pending_h, | |
| 880 Ce_xy, | |
| 881 matched_cost + edit_cost): | |
| 882 yield t | |
| 883 | |
| 884 # backtrack | |
| 885 if u is not None: | |
| 886 pending_u.insert(i, u) | |
| 887 if v is not None: | |
| 888 pending_v.insert(j, v) | |
| 889 matched_uv.pop() | |
| 890 for x, g in zip(sortedx, reversed(G)): | |
| 891 if g is not None: | |
| 892 pending_g.insert(x, g) | |
| 893 for y, h in zip(sortedy, reversed(H)): | |
| 894 if h is not None: | |
| 895 pending_h.insert(y, h) | |
| 896 for t in xy: | |
| 897 matched_gh.pop() | |
| 898 | |
| 899 # Initialization | |
| 900 | |
| 901 pending_u = list(G1.nodes) | |
| 902 pending_v = list(G2.nodes) | |
| 903 | |
| 904 # cost matrix of vertex mappings | |
| 905 m = len(pending_u) | |
| 906 n = len(pending_v) | |
| 907 C = np.zeros((m + n, m + n)) | |
| 908 if node_subst_cost: | |
| 909 C[0:m, 0:n] = np.array([node_subst_cost(G1.nodes[u], G2.nodes[v]) | |
| 910 for u in pending_u for v in pending_v] | |
| 911 ).reshape(m, n) | |
| 912 elif node_match: | |
| 913 C[0:m, 0:n] = np.array([1 - int(node_match(G1.nodes[u], G2.nodes[v])) | |
| 914 for u in pending_u for v in pending_v] | |
| 915 ).reshape(m, n) | |
| 916 else: | |
| 917 # all zeroes | |
| 918 pass | |
| 919 #assert not min(m, n) or C[0:m, 0:n].min() >= 0 | |
| 920 if node_del_cost: | |
| 921 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] | |
| 922 else: | |
| 923 del_costs = [1] * len(pending_u) | |
| 924 #assert not m or min(del_costs) >= 0 | |
| 925 if node_ins_cost: | |
| 926 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] | |
| 927 else: | |
| 928 ins_costs = [1] * len(pending_v) | |
| 929 #assert not n or min(ins_costs) >= 0 | |
| 930 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 | |
| 931 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf | |
| 932 for i in range(m) for j in range(m)] | |
| 933 ).reshape(m, m) | |
| 934 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf | |
| 935 for i in range(n) for j in range(n)] | |
| 936 ).reshape(n, n) | |
| 937 Cv = make_CostMatrix(C, m, n) | |
| 938 #debug_print('Cv: {} x {}'.format(m, n)) | |
| 939 #debug_print(Cv.C) | |
| 940 | |
| 941 pending_g = list(G1.edges) | |
| 942 pending_h = list(G2.edges) | |
| 943 | |
| 944 # cost matrix of edge mappings | |
| 945 m = len(pending_g) | |
| 946 n = len(pending_h) | |
| 947 C = np.zeros((m + n, m + n)) | |
| 948 if edge_subst_cost: | |
| 949 C[0:m, 0:n] = np.array([edge_subst_cost(G1.edges[g], G2.edges[h]) | |
| 950 for g in pending_g for h in pending_h] | |
| 951 ).reshape(m, n) | |
| 952 elif edge_match: | |
| 953 C[0:m, 0:n] = np.array([1 - int(edge_match(G1.edges[g], G2.edges[h])) | |
| 954 for g in pending_g for h in pending_h] | |
| 955 ).reshape(m, n) | |
| 956 else: | |
| 957 # all zeroes | |
| 958 pass | |
| 959 #assert not min(m, n) or C[0:m, 0:n].min() >= 0 | |
| 960 if edge_del_cost: | |
| 961 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] | |
| 962 else: | |
| 963 del_costs = [1] * len(pending_g) | |
| 964 #assert not m or min(del_costs) >= 0 | |
| 965 if edge_ins_cost: | |
| 966 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] | |
| 967 else: | |
| 968 ins_costs = [1] * len(pending_h) | |
| 969 #assert not n or min(ins_costs) >= 0 | |
| 970 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 | |
| 971 C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf | |
| 972 for i in range(m) for j in range(m)] | |
| 973 ).reshape(m, m) | |
| 974 C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf | |
| 975 for i in range(n) for j in range(n)] | |
| 976 ).reshape(n, n) | |
| 977 Ce = make_CostMatrix(C, m, n) | |
| 978 #debug_print('Ce: {} x {}'.format(m, n)) | |
| 979 #debug_print(Ce.C) | |
| 980 #debug_print() | |
| 981 | |
| 982 class MaxCost: | |
| 983 def __init__(self): | |
| 984 # initial upper-bound estimate | |
| 985 # NOTE: should work for empty graph | |
| 986 self.value = Cv.C.sum() + Ce.C.sum() + 1 | |
| 987 maxcost = MaxCost() | |
| 988 | |
| 989 def prune(cost): | |
| 990 if upper_bound is not None: | |
| 991 if cost > upper_bound: | |
| 992 return True | |
| 993 if cost > maxcost.value: | |
| 994 return True | |
| 995 elif strictly_decreasing and cost >= maxcost.value: | |
| 996 return True | |
| 997 | |
| 998 # Now go! | |
| 999 | |
| 1000 for vertex_path, edge_path, cost in \ | |
| 1001 get_edit_paths([], pending_u, pending_v, Cv, | |
| 1002 [], pending_g, pending_h, Ce, 0): | |
| 1003 #assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None) | |
| 1004 #assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None) | |
| 1005 #assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None) | |
| 1006 #assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None) | |
| 1007 #print(vertex_path, edge_path, cost, file = sys.stderr) | |
| 1008 #assert cost == maxcost.value | |
| 1009 yield list(vertex_path), list(edge_path), cost | |
| 1010 | |
| 1011 | |
| 1012 def _is_close(d1, d2, atolerance=0, rtolerance=0): | |
| 1013 """Determines whether two adjacency matrices are within | |
| 1014 a provided tolerance. | |
| 1015 | |
| 1016 Parameters | |
| 1017 ---------- | |
| 1018 d1 : dict | |
| 1019 Adjacency dictionary | |
| 1020 | |
| 1021 d2 : dict | |
| 1022 Adjacency dictionary | |
| 1023 | |
| 1024 atolerance : float | |
| 1025 Some scalar tolerance value to determine closeness | |
| 1026 | |
| 1027 rtolerance : float | |
| 1028 A scalar tolerance value that will be some proportion | |
| 1029 of ``d2``'s value | |
| 1030 | |
| 1031 Returns | |
| 1032 ------- | |
| 1033 closeness : bool | |
| 1034 If all of the nodes within ``d1`` and ``d2`` are within | |
| 1035 a predefined tolerance, they are considered "close" and | |
| 1036 this method will return True. Otherwise, this method will | |
| 1037 return False. | |
| 1038 | |
| 1039 """ | |
| 1040 # Pre-condition: d1 and d2 have the same keys at each level if they | |
| 1041 # are dictionaries. | |
| 1042 if not isinstance(d1, dict) and not isinstance(d2, dict): | |
| 1043 return abs(d1 - d2) <= atolerance + rtolerance * abs(d2) | |
| 1044 return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1) | |
| 1045 | |
| 1046 | |
| 1047 def simrank_similarity(G, source=None, target=None, importance_factor=0.9, | |
| 1048 max_iterations=100, tolerance=1e-4): | |
| 1049 """Returns the SimRank similarity of nodes in the graph ``G``. | |
| 1050 | |
| 1051 SimRank is a similarity metric that says "two objects are considered | |
| 1052 to be similar if they are referenced by similar objects." [1]_. | |
| 1053 | |
| 1054 The pseudo-code definition from the paper is:: | |
| 1055 | |
| 1056 def simrank(G, u, v): | |
| 1057 in_neighbors_u = G.predecessors(u) | |
| 1058 in_neighbors_v = G.predecessors(v) | |
| 1059 scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) | |
| 1060 return scale * sum(simrank(G, w, x) | |
| 1061 for w, x in product(in_neighbors_u, | |
| 1062 in_neighbors_v)) | |
| 1063 | |
| 1064 where ``G`` is the graph, ``u`` is the source, ``v`` is the target, | |
| 1065 and ``C`` is a float decay or importance factor between 0 and 1. | |
| 1066 | |
| 1067 The SimRank algorithm for determining node similarity is defined in | |
| 1068 [2]_. | |
| 1069 | |
| 1070 Parameters | |
| 1071 ---------- | |
| 1072 G : NetworkX graph | |
| 1073 A NetworkX graph | |
| 1074 | |
| 1075 source : node | |
| 1076 If this is specified, the returned dictionary maps each node | |
| 1077 ``v`` in the graph to the similarity between ``source`` and | |
| 1078 ``v``. | |
| 1079 | |
| 1080 target : node | |
| 1081 If both ``source`` and ``target`` are specified, the similarity | |
| 1082 value between ``source`` and ``target`` is returned. If | |
| 1083 ``target`` is specified but ``source`` is not, this argument is | |
| 1084 ignored. | |
| 1085 | |
| 1086 importance_factor : float | |
| 1087 The relative importance of indirect neighbors with respect to | |
| 1088 direct neighbors. | |
| 1089 | |
| 1090 max_iterations : integer | |
| 1091 Maximum number of iterations. | |
| 1092 | |
| 1093 tolerance : float | |
| 1094 Error tolerance used to check convergence. When an iteration of | |
| 1095 the algorithm finds that no similarity value changes more than | |
| 1096 this amount, the algorithm halts. | |
| 1097 | |
| 1098 Returns | |
| 1099 ------- | |
| 1100 similarity : dictionary or float | |
| 1101 If ``source`` and ``target`` are both ``None``, this returns a | |
| 1102 dictionary of dictionaries, where keys are node pairs and value | |
| 1103 are similarity of the pair of nodes. | |
| 1104 | |
| 1105 If ``source`` is not ``None`` but ``target`` is, this returns a | |
| 1106 dictionary mapping node to the similarity of ``source`` and that | |
| 1107 node. | |
| 1108 | |
| 1109 If neither ``source`` nor ``target`` is ``None``, this returns | |
| 1110 the similarity value for the given pair of nodes. | |
| 1111 | |
| 1112 Examples | |
| 1113 -------- | |
| 1114 If the nodes of the graph are numbered from zero to *n - 1*, where *n* | |
| 1115 is the number of nodes in the graph, you can create a SimRank matrix | |
| 1116 from the return value of this function where the node numbers are | |
| 1117 the row and column indices of the matrix:: | |
| 1118 | |
| 1119 >>> import networkx as nx | |
| 1120 >>> from numpy import array | |
| 1121 >>> G = nx.cycle_graph(4) | |
| 1122 >>> sim = nx.simrank_similarity(G) | |
| 1123 >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)] | |
| 1124 >>> sim_array = array(lol) | |
| 1125 | |
| 1126 References | |
| 1127 ---------- | |
| 1128 .. [1] https://en.wikipedia.org/wiki/SimRank | |
| 1129 .. [2] G. Jeh and J. Widom. | |
| 1130 "SimRank: a measure of structural-context similarity", | |
| 1131 In KDD'02: Proceedings of the Eighth ACM SIGKDD | |
| 1132 International Conference on Knowledge Discovery and Data Mining, | |
| 1133 pp. 538--543. ACM Press, 2002. | |
| 1134 """ | |
| 1135 prevsim = None | |
| 1136 | |
| 1137 # build up our similarity adjacency dictionary output | |
| 1138 newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} | |
| 1139 | |
| 1140 # These functions compute the update to the similarity value of the nodes | |
| 1141 # `u` and `v` with respect to the previous similarity values. | |
| 1142 avg_sim = lambda s: sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 | |
| 1143 sim = lambda u, v: importance_factor * avg_sim(list(product(G[u], G[v]))) | |
| 1144 | |
| 1145 for _ in range(max_iterations): | |
| 1146 if prevsim and _is_close(prevsim, newsim, tolerance): | |
| 1147 break | |
| 1148 prevsim = newsim | |
| 1149 newsim = {u: {v: sim(u, v) if u is not v else 1 | |
| 1150 for v in newsim[u]} for u in newsim} | |
| 1151 | |
| 1152 if source is not None and target is not None: | |
| 1153 return newsim[source][target] | |
| 1154 if source is not None: | |
| 1155 return newsim[source] | |
| 1156 return newsim | |
| 1157 | |
| 1158 | |
| 1159 def simrank_similarity_numpy(G, source=None, target=None, importance_factor=0.9, | |
| 1160 max_iterations=100, tolerance=1e-4): | |
| 1161 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. | |
| 1162 | |
| 1163 The SimRank algorithm for determining node similarity is defined in | |
| 1164 [1]_. | |
| 1165 | |
| 1166 Parameters | |
| 1167 ---------- | |
| 1168 G : NetworkX graph | |
| 1169 A NetworkX graph | |
| 1170 | |
| 1171 source : node | |
| 1172 If this is specified, the returned dictionary maps each node | |
| 1173 ``v`` in the graph to the similarity between ``source`` and | |
| 1174 ``v``. | |
| 1175 | |
| 1176 target : node | |
| 1177 If both ``source`` and ``target`` are specified, the similarity | |
| 1178 value between ``source`` and ``target`` is returned. If | |
| 1179 ``target`` is specified but ``source`` is not, this argument is | |
| 1180 ignored. | |
| 1181 | |
| 1182 importance_factor : float | |
| 1183 The relative importance of indirect neighbors with respect to | |
| 1184 direct neighbors. | |
| 1185 | |
| 1186 max_iterations : integer | |
| 1187 Maximum number of iterations. | |
| 1188 | |
| 1189 tolerance : float | |
| 1190 Error tolerance used to check convergence. When an iteration of | |
| 1191 the algorithm finds that no similarity value changes more than | |
| 1192 this amount, the algorithm halts. | |
| 1193 | |
| 1194 Returns | |
| 1195 ------- | |
| 1196 similarity : dictionary or float | |
| 1197 If ``source`` and ``target`` are both ``None``, this returns a | |
| 1198 dictionary of dictionaries, where keys are node pairs and value | |
| 1199 are similarity of the pair of nodes. | |
| 1200 | |
| 1201 If ``source`` is not ``None`` but ``target`` is, this returns a | |
| 1202 dictionary mapping node to the similarity of ``source`` and that | |
| 1203 node. | |
| 1204 | |
| 1205 If neither ``source`` nor ``target`` is ``None``, this returns | |
| 1206 the similarity value for the given pair of nodes. | |
| 1207 | |
| 1208 Examples | |
| 1209 -------- | |
| 1210 >>> import networkx as nx | |
| 1211 >>> from numpy import array | |
| 1212 >>> G = nx.cycle_graph(4) | |
| 1213 >>> sim = nx.simrank_similarity_numpy(G) | |
| 1214 | |
| 1215 References | |
| 1216 ---------- | |
| 1217 .. [1] G. Jeh and J. Widom. | |
| 1218 "SimRank: a measure of structural-context similarity", | |
| 1219 In KDD'02: Proceedings of the Eighth ACM SIGKDD | |
| 1220 International Conference on Knowledge Discovery and Data Mining, | |
| 1221 pp. 538--543. ACM Press, 2002. | |
| 1222 """ | |
| 1223 # This algorithm follows roughly | |
| 1224 # | |
| 1225 # S = max{C * (A.T * S * A), I} | |
| 1226 # | |
| 1227 # where C is the importance factor, A is the column normalized | |
| 1228 # adjacency matrix, and I is the identity matrix. | |
| 1229 import numpy as np | |
| 1230 adjacency_matrix = nx.to_numpy_array(G) | |
| 1231 | |
| 1232 # column-normalize the ``adjacency_matrix`` | |
| 1233 adjacency_matrix /= adjacency_matrix.sum(axis=0) | |
| 1234 | |
| 1235 newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64) | |
| 1236 for _ in range(max_iterations): | |
| 1237 prevsim = np.copy(newsim) | |
| 1238 newsim = importance_factor * np.matmul( | |
| 1239 np.matmul(adjacency_matrix.T, prevsim), adjacency_matrix) | |
| 1240 np.fill_diagonal(newsim, 1.0) | |
| 1241 | |
| 1242 if np.allclose(prevsim, newsim, atol=tolerance): | |
| 1243 break | |
| 1244 | |
| 1245 if source is not None and target is not None: | |
| 1246 return newsim[source, target] | |
| 1247 if source is not None: | |
| 1248 return newsim[source] | |
| 1249 return newsim | |
| 1250 | |
| 1251 | |
| 1252 # fixture for pytest | |
| 1253 def setup_module(module): | |
| 1254 import pytest | |
| 1255 numpy = pytest.importorskip('numpy') | |
| 1256 scipy = pytest.importorskip('scipy') | 
