diff planemo/lib/python3.7/site-packages/networkx/algorithms/similarity.py @ 1:56ad4e20f292 draft
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"
author: guerler
date:   Fri, 31 Jul 2020 00:32:28 -0400
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/planemo/lib/python3.7/site-packages/networkx/algorithms/similarity.py Fri Jul 31 00:32:28 2020 -0400 @@ -0,0 +1,1256 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2010 by +# Aric Hagberg <hagberg@lanl.gov> +# Dan Schult <dschult@colgate.edu> +# Pieter Swart <swart@lanl.gov> +# All rights reserved. +# BSD license. +# +# Author: Andrey Paramonov <paramon@acdlabs.ru> +""" Functions measuring similarity using graph edit distance. + +The graph edit distance is the number of edge/node changes needed +to make two graphs isomorphic. + +The default algorithm/implementation is sub-optimal for some graphs. +The problem of finding the exact Graph Edit Distance (GED) is NP-hard +so it is often slow. If the simple interface `graph_edit_distance` +takes too long for your graph, try `optimize_graph_edit_distance` +and/or `optimize_edit_paths`. + +At the same time, I encourage capable people to investigate +alternative GED algorithms, in order to improve the choices available. +""" +from itertools import product +import math +import networkx as nx +from operator import * +import sys + +__author__ = 'Andrey Paramonov <paramon@acdlabs.ru>' + +__all__ = [ + 'graph_edit_distance', + 'optimal_edit_paths', + 'optimize_graph_edit_distance', + 'optimize_edit_paths', + 'simrank_similarity', + 'simrank_similarity_numpy', +] + + +def debug_print(*args, **kwargs): + print(*args, **kwargs) + + +def graph_edit_distance(G1, G2, node_match=None, edge_match=None, + node_subst_cost=None, node_del_cost=None, + node_ins_cost=None, + edge_subst_cost=None, edge_del_cost=None, + edge_ins_cost=None, + upper_bound=None): + """Returns GED (graph edit distance) between graphs G1 and G2. + + Graph edit distance is a graph similarity measure analogous to + Levenshtein distance for strings. It is defined as minimum cost + of edit path (sequence of node and edge edit operations) + transforming graph G1 to graph isomorphic to G2. + + Parameters + ---------- + G1, G2: graphs + The two graphs G1 and G2 must be of the same type. + + node_match : callable + A function that returns True if node n1 in G1 and n2 in G2 + should be considered equal during matching. + + The function will be called like + + node_match(G1.nodes[n1], G2.nodes[n2]). + + That is, the function will receive the node attribute + dictionaries for n1 and n2 as inputs. + + Ignored if node_subst_cost is specified. If neither + node_match nor node_subst_cost are specified then node + attributes are not considered. + + edge_match : callable + A function that returns True if the edge attribute dictionaries + for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should + be considered equal during matching. + + The function will be called like + + edge_match(G1[u1][v1], G2[u2][v2]). + + That is, the function will receive the edge attribute + dictionaries of the edges under consideration. + + Ignored if edge_subst_cost is specified. If neither + edge_match nor edge_subst_cost are specified then edge + attributes are not considered. + + node_subst_cost, node_del_cost, node_ins_cost : callable + Functions that return the costs of node substitution, node + deletion, and node insertion, respectively. + + The functions will be called like + + node_subst_cost(G1.nodes[n1], G2.nodes[n2]), + node_del_cost(G1.nodes[n1]), + node_ins_cost(G2.nodes[n2]). + + That is, the functions will receive the node attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. 
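For illustration, a minimal sketch of supplying such cost callables; the graphs, node labels, and the ``color``/``weight`` attributes below are made up for the example::

    import networkx as nx

    G1 = nx.Graph()
    G1.add_node(1, color="red")
    G1.add_node(2, color="blue")
    G1.add_edge(1, 2, weight=2.0)

    G2 = nx.Graph()
    G2.add_node("a", color="red")
    G2.add_node("b", color="green")
    G2.add_edge("a", "b", weight=3.0)

    def node_subst_cost(n1_attrs, n2_attrs):
        # free if the colors agree, otherwise one unit
        return 0 if n1_attrs.get("color") == n2_attrs.get("color") else 1

    def edge_subst_cost(e1_attrs, e2_attrs):
        # pay the absolute difference in edge weights
        return abs(e1_attrs.get("weight", 1) - e2_attrs.get("weight", 1))

    nx.graph_edit_distance(G1, G2,
                           node_subst_cost=node_subst_cost,
                           edge_subst_cost=edge_subst_cost)
    # 2.0: one color mismatch plus a weight difference of 1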
+ + Function node_subst_cost overrides node_match if specified. + If neither node_match nor node_subst_cost are specified then + default node substitution cost of 0 is used (node attributes + are not considered during matching). + + If node_del_cost is not specified then default node deletion + cost of 1 is used. If node_ins_cost is not specified then + default node insertion cost of 1 is used. + + edge_subst_cost, edge_del_cost, edge_ins_cost : callable + Functions that return the costs of edge substitution, edge + deletion, and edge insertion, respectively. + + The functions will be called like + + edge_subst_cost(G1[u1][v1], G2[u2][v2]), + edge_del_cost(G1[u1][v1]), + edge_ins_cost(G2[u2][v2]). + + That is, the functions will receive the edge attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. + + Function edge_subst_cost overrides edge_match if specified. + If neither edge_match nor edge_subst_cost are specified then + default edge substitution cost of 0 is used (edge attributes + are not considered during matching). + + If edge_del_cost is not specified then default edge deletion + cost of 1 is used. If edge_ins_cost is not specified then + default edge insertion cost of 1 is used. + + upper_bound : numeric + Maximum edit distance to consider. Return None if no edit + distance under or equal to upper_bound exists. + + Examples + -------- + >>> G1 = nx.cycle_graph(6) + >>> G2 = nx.wheel_graph(7) + >>> nx.graph_edit_distance(G1, G2) + 7.0 + + See Also + -------- + optimal_edit_paths, optimize_graph_edit_distance, + + is_isomorphic (test for graph edit distance of 0) + + References + ---------- + .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick + Martineau. An Exact Graph Edit Distance Algorithm for Solving + Pattern Recognition Problems. 4th International Conference on + Pattern Recognition Applications and Methods 2015, Jan 2015, + Lisbon, Portugal. 2015, + <10.5220/0005209202710278>. <hal-01168816> + https://hal.archives-ouvertes.fr/hal-01168816 + + """ + bestcost = None + for vertex_path, edge_path, cost in \ + optimize_edit_paths(G1, G2, node_match, edge_match, + node_subst_cost, node_del_cost, node_ins_cost, + edge_subst_cost, edge_del_cost, edge_ins_cost, + upper_bound, True): + #assert bestcost is None or cost < bestcost + bestcost = cost + return bestcost + + +def optimal_edit_paths(G1, G2, node_match=None, edge_match=None, + node_subst_cost=None, node_del_cost=None, + node_ins_cost=None, + edge_subst_cost=None, edge_del_cost=None, + edge_ins_cost=None, + upper_bound=None): + """Returns all minimum-cost edit paths transforming G1 to G2. + + Graph edit path is a sequence of node and edge edit operations + transforming graph G1 to graph isomorphic to G2. Edit operations + include substitutions, deletions, and insertions. + + Parameters + ---------- + G1, G2: graphs + The two graphs G1 and G2 must be of the same type. + + node_match : callable + A function that returns True if node n1 in G1 and n2 in G2 + should be considered equal during matching. + + The function will be called like + + node_match(G1.nodes[n1], G2.nodes[n2]). + + That is, the function will receive the node attribute + dictionaries for n1 and n2 as inputs. + + Ignored if node_subst_cost is specified. If neither + node_match nor node_subst_cost are specified then node + attributes are not considered. 
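A small sketch of the effect of ``upper_bound``, reusing the cycle/wheel example above whose exact distance is 7.0::

    import networkx as nx

    G1 = nx.cycle_graph(6)
    G2 = nx.wheel_graph(7)

    nx.graph_edit_distance(G1, G2, upper_bound=5)   # None: no edit path of cost <= 5 exists
    nx.graph_edit_distance(G1, G2, upper_bound=10)  # 7.0: the bound does not bind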
+ + edge_match : callable + A function that returns True if the edge attribute dictionaries + for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should + be considered equal during matching. + + The function will be called like + + edge_match(G1[u1][v1], G2[u2][v2]). + + That is, the function will receive the edge attribute + dictionaries of the edges under consideration. + + Ignored if edge_subst_cost is specified. If neither + edge_match nor edge_subst_cost are specified then edge + attributes are not considered. + + node_subst_cost, node_del_cost, node_ins_cost : callable + Functions that return the costs of node substitution, node + deletion, and node insertion, respectively. + + The functions will be called like + + node_subst_cost(G1.nodes[n1], G2.nodes[n2]), + node_del_cost(G1.nodes[n1]), + node_ins_cost(G2.nodes[n2]). + + That is, the functions will receive the node attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. + + Function node_subst_cost overrides node_match if specified. + If neither node_match nor node_subst_cost are specified then + default node substitution cost of 0 is used (node attributes + are not considered during matching). + + If node_del_cost is not specified then default node deletion + cost of 1 is used. If node_ins_cost is not specified then + default node insertion cost of 1 is used. + + edge_subst_cost, edge_del_cost, edge_ins_cost : callable + Functions that return the costs of edge substitution, edge + deletion, and edge insertion, respectively. + + The functions will be called like + + edge_subst_cost(G1[u1][v1], G2[u2][v2]), + edge_del_cost(G1[u1][v1]), + edge_ins_cost(G2[u2][v2]). + + That is, the functions will receive the edge attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. + + Function edge_subst_cost overrides edge_match if specified. + If neither edge_match nor edge_subst_cost are specified then + default edge substitution cost of 0 is used (edge attributes + are not considered during matching). + + If edge_del_cost is not specified then default edge deletion + cost of 1 is used. If edge_ins_cost is not specified then + default edge insertion cost of 1 is used. + + upper_bound : numeric + Maximum edit distance to consider. + + Returns + ------- + edit_paths : list of tuples (node_edit_path, edge_edit_path) + node_edit_path : list of tuples (u, v) + edge_edit_path : list of tuples ((u1, v1), (u2, v2)) + + cost : numeric + Optimal edit path cost (graph edit distance). + + Examples + -------- + >>> G1 = nx.cycle_graph(6) + >>> G2 = nx.wheel_graph(7) + >>> paths, cost = nx.optimal_edit_paths(G1, G2) + >>> len(paths) + 84 + >>> cost + 7.0 + + See Also + -------- + graph_edit_distance, optimize_edit_paths + + References + ---------- + .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick + Martineau. An Exact Graph Edit Distance Algorithm for Solving + Pattern Recognition Problems. 4th International Conference on + Pattern Recognition Applications and Methods 2015, Jan 2015, + Lisbon, Portugal. 2015, + <10.5220/0005209202710278>. 
<hal-01168816> + https://hal.archives-ouvertes.fr/hal-01168816 + + """ + paths = list() + bestcost = None + for vertex_path, edge_path, cost in \ + optimize_edit_paths(G1, G2, node_match, edge_match, + node_subst_cost, node_del_cost, node_ins_cost, + edge_subst_cost, edge_del_cost, edge_ins_cost, + upper_bound, False): + #assert bestcost is None or cost <= bestcost + if bestcost is not None and cost < bestcost: + paths = list() + paths.append((vertex_path, edge_path)) + bestcost = cost + return paths, bestcost + + +def optimize_graph_edit_distance(G1, G2, node_match=None, edge_match=None, + node_subst_cost=None, node_del_cost=None, + node_ins_cost=None, + edge_subst_cost=None, edge_del_cost=None, + edge_ins_cost=None, + upper_bound=None): + """Returns consecutive approximations of GED (graph edit distance) + between graphs G1 and G2. + + Graph edit distance is a graph similarity measure analogous to + Levenshtein distance for strings. It is defined as minimum cost + of edit path (sequence of node and edge edit operations) + transforming graph G1 to graph isomorphic to G2. + + Parameters + ---------- + G1, G2: graphs + The two graphs G1 and G2 must be of the same type. + + node_match : callable + A function that returns True if node n1 in G1 and n2 in G2 + should be considered equal during matching. + + The function will be called like + + node_match(G1.nodes[n1], G2.nodes[n2]). + + That is, the function will receive the node attribute + dictionaries for n1 and n2 as inputs. + + Ignored if node_subst_cost is specified. If neither + node_match nor node_subst_cost are specified then node + attributes are not considered. + + edge_match : callable + A function that returns True if the edge attribute dictionaries + for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should + be considered equal during matching. + + The function will be called like + + edge_match(G1[u1][v1], G2[u2][v2]). + + That is, the function will receive the edge attribute + dictionaries of the edges under consideration. + + Ignored if edge_subst_cost is specified. If neither + edge_match nor edge_subst_cost are specified then edge + attributes are not considered. + + node_subst_cost, node_del_cost, node_ins_cost : callable + Functions that return the costs of node substitution, node + deletion, and node insertion, respectively. + + The functions will be called like + + node_subst_cost(G1.nodes[n1], G2.nodes[n2]), + node_del_cost(G1.nodes[n1]), + node_ins_cost(G2.nodes[n2]). + + That is, the functions will receive the node attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. + + Function node_subst_cost overrides node_match if specified. + If neither node_match nor node_subst_cost are specified then + default node substitution cost of 0 is used (node attributes + are not considered during matching). + + If node_del_cost is not specified then default node deletion + cost of 1 is used. If node_ins_cost is not specified then + default node insertion cost of 1 is used. + + edge_subst_cost, edge_del_cost, edge_ins_cost : callable + Functions that return the costs of edge substitution, edge + deletion, and edge insertion, respectively. + + The functions will be called like + + edge_subst_cost(G1[u1][v1], G2[u2][v2]), + edge_del_cost(G1[u1][v1]), + edge_ins_cost(G2[u2][v2]). + + That is, the functions will receive the edge attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. 
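A sketch of inspecting one of the returned edit paths; ``None`` inside a pair marks a deletion or insertion rather than a substitution::

    import networkx as nx

    G1 = nx.path_graph(3)   # nodes 0, 1, 2 and edges (0, 1), (1, 2)
    G2 = nx.path_graph(2)   # nodes 0, 1 and edge (0, 1)

    paths, cost = nx.optimal_edit_paths(G1, G2)
    node_path, edge_path = paths[0]
    # cost == 2.0: one node and one edge of G1 have no counterpart in G2,
    # so node_path contains one pair of the form (u, None) and edge_path
    # one pair of the form ((u, v), None).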
+ + Function edge_subst_cost overrides edge_match if specified. + If neither edge_match nor edge_subst_cost are specified then + default edge substitution cost of 0 is used (edge attributes + are not considered during matching). + + If edge_del_cost is not specified then default edge deletion + cost of 1 is used. If edge_ins_cost is not specified then + default edge insertion cost of 1 is used. + + upper_bound : numeric + Maximum edit distance to consider. + + Returns + ------- + Generator of consecutive approximations of graph edit distance. + + Examples + -------- + >>> G1 = nx.cycle_graph(6) + >>> G2 = nx.wheel_graph(7) + >>> for v in nx.optimize_graph_edit_distance(G1, G2): + ... minv = v + >>> minv + 7.0 + + See Also + -------- + graph_edit_distance, optimize_edit_paths + + References + ---------- + .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick + Martineau. An Exact Graph Edit Distance Algorithm for Solving + Pattern Recognition Problems. 4th International Conference on + Pattern Recognition Applications and Methods 2015, Jan 2015, + Lisbon, Portugal. 2015, + <10.5220/0005209202710278>. <hal-01168816> + https://hal.archives-ouvertes.fr/hal-01168816 + """ + for vertex_path, edge_path, cost in \ + optimize_edit_paths(G1, G2, node_match, edge_match, + node_subst_cost, node_del_cost, node_ins_cost, + edge_subst_cost, edge_del_cost, edge_ins_cost, + upper_bound, True): + yield cost + + +def optimize_edit_paths(G1, G2, node_match=None, edge_match=None, + node_subst_cost=None, node_del_cost=None, + node_ins_cost=None, + edge_subst_cost=None, edge_del_cost=None, + edge_ins_cost=None, + upper_bound=None, strictly_decreasing=True): + """GED (graph edit distance) calculation: advanced interface. + + Graph edit path is a sequence of node and edge edit operations + transforming graph G1 to graph isomorphic to G2. Edit operations + include substitutions, deletions, and insertions. + + Graph edit distance is defined as minimum cost of edit path. + + Parameters + ---------- + G1, G2: graphs + The two graphs G1 and G2 must be of the same type. + + node_match : callable + A function that returns True if node n1 in G1 and n2 in G2 + should be considered equal during matching. + + The function will be called like + + node_match(G1.nodes[n1], G2.nodes[n2]). + + That is, the function will receive the node attribute + dictionaries for n1 and n2 as inputs. + + Ignored if node_subst_cost is specified. If neither + node_match nor node_subst_cost are specified then node + attributes are not considered. + + edge_match : callable + A function that returns True if the edge attribute dictionaries + for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should + be considered equal during matching. + + The function will be called like + + edge_match(G1[u1][v1], G2[u2][v2]). + + That is, the function will receive the edge attribute + dictionaries of the edges under consideration. + + Ignored if edge_subst_cost is specified. If neither + edge_match nor edge_subst_cost are specified then edge + attributes are not considered. + + node_subst_cost, node_del_cost, node_ins_cost : callable + Functions that return the costs of node substitution, node + deletion, and node insertion, respectively. + + The functions will be called like + + node_subst_cost(G1.nodes[n1], G2.nodes[n2]), + node_del_cost(G1.nodes[n1]), + node_ins_cost(G2.nodes[n2]). + + That is, the functions will receive the node attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. 
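Because the generator yields successively better upper bounds, it can be stopped as soon as an approximation is good enough; a minimal sketch::

    import networkx as nx

    G1 = nx.cycle_graph(6)
    G2 = nx.wheel_graph(7)

    best = None
    for approx in nx.optimize_graph_edit_distance(G1, G2):
        best = approx        # each value is no larger than the previous one
        if best <= 8:        # accept any bound of at most 8 and stop early
            break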
+ + Function node_subst_cost overrides node_match if specified. + If neither node_match nor node_subst_cost are specified then + default node substitution cost of 0 is used (node attributes + are not considered during matching). + + If node_del_cost is not specified then default node deletion + cost of 1 is used. If node_ins_cost is not specified then + default node insertion cost of 1 is used. + + edge_subst_cost, edge_del_cost, edge_ins_cost : callable + Functions that return the costs of edge substitution, edge + deletion, and edge insertion, respectively. + + The functions will be called like + + edge_subst_cost(G1[u1][v1], G2[u2][v2]), + edge_del_cost(G1[u1][v1]), + edge_ins_cost(G2[u2][v2]). + + That is, the functions will receive the edge attribute + dictionaries as inputs. The functions are expected to return + positive numeric values. + + Function edge_subst_cost overrides edge_match if specified. + If neither edge_match nor edge_subst_cost are specified then + default edge substitution cost of 0 is used (edge attributes + are not considered during matching). + + If edge_del_cost is not specified then default edge deletion + cost of 1 is used. If edge_ins_cost is not specified then + default edge insertion cost of 1 is used. + + upper_bound : numeric + Maximum edit distance to consider. + + strictly_decreasing : bool + If True, return consecutive approximations of strictly + decreasing cost. Otherwise, return all edit paths of cost + less than or equal to the previous minimum cost. + + Returns + ------- + Generator of tuples (node_edit_path, edge_edit_path, cost) + node_edit_path : list of tuples (u, v) + edge_edit_path : list of tuples ((u1, v1), (u2, v2)) + cost : numeric + + See Also + -------- + graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths + + References + ---------- + .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick + Martineau. An Exact Graph Edit Distance Algorithm for Solving + Pattern Recognition Problems. 4th International Conference on + Pattern Recognition Applications and Methods 2015, Jan 2015, + Lisbon, Portugal. 2015, + <10.5220/0005209202710278>. 
<hal-01168816> + https://hal.archives-ouvertes.fr/hal-01168816 + + """ + # TODO: support DiGraph + + import numpy as np + from scipy.optimize import linear_sum_assignment + + class CostMatrix: + def __init__(self, C, lsa_row_ind, lsa_col_ind, ls): + #assert C.shape[0] == len(lsa_row_ind) + #assert C.shape[1] == len(lsa_col_ind) + #assert len(lsa_row_ind) == len(lsa_col_ind) + #assert set(lsa_row_ind) == set(range(len(lsa_row_ind))) + #assert set(lsa_col_ind) == set(range(len(lsa_col_ind))) + #assert ls == C[lsa_row_ind, lsa_col_ind].sum() + self.C = C + self.lsa_row_ind = lsa_row_ind + self.lsa_col_ind = lsa_col_ind + self.ls = ls + + def make_CostMatrix(C, m, n): + #assert(C.shape == (m + n, m + n)) + lsa_row_ind, lsa_col_ind = linear_sum_assignment(C) + + # Fixup dummy assignments: + # each substitution i<->j should have dummy assignment m+j<->n+i + # NOTE: fast reduce of Cv relies on it + #assert len(lsa_row_ind) == len(lsa_col_ind) + indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) + subst_ind = list(k for k, i, j in indexes if i < m and j < n) + indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) + dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n) + #assert len(subst_ind) == len(dummy_ind) + lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m + lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n + + return CostMatrix(C, lsa_row_ind, lsa_col_ind, + C[lsa_row_ind, lsa_col_ind].sum()) + + def extract_C(C, i, j, m, n): + #assert(C.shape == (m + n, m + n)) + row_ind = [k in i or k - m in j for k in range(m + n)] + col_ind = [k in j or k - n in i for k in range(m + n)] + return C[row_ind, :][:, col_ind] + + def reduce_C(C, i, j, m, n): + #assert(C.shape == (m + n, m + n)) + row_ind = [k not in i and k - m not in j for k in range(m + n)] + col_ind = [k not in j and k - n not in i for k in range(m + n)] + return C[row_ind, :][:, col_ind] + + def reduce_ind(ind, i): + #assert set(ind) == set(range(len(ind))) + rind = ind[[k not in i for k in ind]] + for k in set(i): + rind[rind >= k] -= 1 + return rind + + def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]): + """ + Parameters: + u, v: matched vertices, u=None or v=None for + deletion/insertion + pending_g, pending_h: lists of edges not yet mapped + Ce: CostMatrix of pending edge mappings + matched_uv: partial vertex edit path + list of tuples (u, v) of previously matched vertex + mappings u<->v, u=None or v=None for + deletion/insertion + + Returns: + list of (i, j): indices of edge mappings g<->h + localCe: local CostMatrix of edge mappings + (basically submatrix of Ce at cross of rows i, cols j) + """ + M = len(pending_g) + N = len(pending_h) + #assert Ce.C.shape == (M + N, M + N) + + g_ind = [i for i in range(M) if pending_g[i][:2] == (u, u) or + any(pending_g[i][:2] in ((p, u), (u, p)) + for p, q in matched_uv)] + h_ind = [j for j in range(N) if pending_h[j][:2] == (v, v) or + any(pending_h[j][:2] in ((q, v), (v, q)) + for p, q in matched_uv)] + m = len(g_ind) + n = len(h_ind) + + if m or n: + C = extract_C(Ce.C, g_ind, h_ind, M, N) + #assert C.shape == (m + n, m + n) + + # Forbid structurally invalid matches + # NOTE: inf remembered from Ce construction + for k, i in zip(range(m), g_ind): + g = pending_g[i][:2] + for l, j in zip(range(n), h_ind): + h = pending_h[j][:2] + if nx.is_directed(G1) or nx.is_directed(G2): + if any(g == (p, u) and h == (q, v) or + g == (u, p) and h == (v, q) + for p, q in matched_uv): + continue + else: + if any(g in ((p, u), (u, p)) and h in ((q, v), (v, q)) 
+ for p, q in matched_uv): + continue + if g == (u, u): + continue + if h == (v, v): + continue + C[k, l] = inf + + localCe = make_CostMatrix(C, m, n) + ij = list((g_ind[k] if k < m else M + h_ind[l], + h_ind[l] if l < n else N + g_ind[k]) + for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) + if k < m or l < n) + + else: + ij = [] + localCe = CostMatrix(np.empty((0, 0)), [], [], 0) + + return ij, localCe + + def reduce_Ce(Ce, ij, m, n): + if len(ij): + i, j = zip(*ij) + m_i = m - sum(1 for t in i if t < m) + n_j = n - sum(1 for t in j if t < n) + return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) + else: + return Ce + + def get_edit_ops(matched_uv, pending_u, pending_v, Cv, + pending_g, pending_h, Ce, matched_cost): + """ + Parameters: + matched_uv: partial vertex edit path + list of tuples (u, v) of vertex mappings u<->v, + u=None or v=None for deletion/insertion + pending_u, pending_v: lists of vertices not yet mapped + Cv: CostMatrix of pending vertex mappings + pending_g, pending_h: lists of edges not yet mapped + Ce: CostMatrix of pending edge mappings + matched_cost: cost of partial edit path + + Returns: + sequence of + (i, j): indices of vertex mapping u<->v + Cv_ij: reduced CostMatrix of pending vertex mappings + (basically Cv with row i, col j removed) + list of (x, y): indices of edge mappings g<->h + Ce_xy: reduced CostMatrix of pending edge mappings + (basically Ce with rows x, cols y removed) + cost: total cost of edit operation + NOTE: most promising ops first + """ + m = len(pending_u) + n = len(pending_v) + #assert Cv.C.shape == (m + n, m + n) + + # 1) a vertex mapping from optimal linear sum assignment + i, j = min((k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) + if k < m or l < n) + xy, localCe = match_edges(pending_u[i] if i < m else None, + pending_v[j] if j < n else None, + pending_g, pending_h, Ce, matched_uv) + Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) + #assert Ce.ls <= localCe.ls + Ce_xy.ls + if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): + pass + else: + # get reduced Cv efficiently + Cv_ij = CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), + reduce_ind(Cv.lsa_row_ind, (i, m + j)), + reduce_ind(Cv.lsa_col_ind, (j, n + i)), + Cv.ls - Cv.C[i, j]) + yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls + + # 2) other candidates, sorted by lower-bound cost estimate + other = list() + fixed_i, fixed_j = i, j + if m <= n: + candidates = ((t, fixed_j) for t in range(m + n) + if t != fixed_i and (t < m or t == m + fixed_j)) + else: + candidates = ((fixed_i, t) for t in range(m + n) + if t != fixed_j and (t < n or t == n + fixed_i)) + for i, j in candidates: + if prune(matched_cost + Cv.C[i, j] + Ce.ls): + continue + Cv_ij = make_CostMatrix(reduce_C(Cv.C, (i,), (j,), m, n), + m - 1 if i < m else m, + n - 1 if j < n else n) + #assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls + if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): + continue + xy, localCe = match_edges(pending_u[i] if i < m else None, + pending_v[j] if j < n else None, + pending_g, pending_h, Ce, matched_uv) + if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls): + continue + Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) + #assert Ce.ls <= localCe.ls + Ce_xy.ls + if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + + Ce_xy.ls): + continue + other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) + + # yield from + for t in sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls): + yield t + + def get_edit_paths(matched_uv, pending_u, 
pending_v, Cv, + matched_gh, pending_g, pending_h, Ce, matched_cost): + """ + Parameters: + matched_uv: partial vertex edit path + list of tuples (u, v) of vertex mappings u<->v, + u=None or v=None for deletion/insertion + pending_u, pending_v: lists of vertices not yet mapped + Cv: CostMatrix of pending vertex mappings + matched_gh: partial edge edit path + list of tuples (g, h) of edge mappings g<->h, + g=None or h=None for deletion/insertion + pending_g, pending_h: lists of edges not yet mapped + Ce: CostMatrix of pending edge mappings + matched_cost: cost of partial edit path + + Returns: + sequence of (vertex_path, edge_path, cost) + vertex_path: complete vertex edit path + list of tuples (u, v) of vertex mappings u<->v, + u=None or v=None for deletion/insertion + edge_path: complete edge edit path + list of tuples (g, h) of edge mappings g<->h, + g=None or h=None for deletion/insertion + cost: total cost of edit path + NOTE: path costs are non-increasing + """ + #debug_print('matched-uv:', matched_uv) + #debug_print('matched-gh:', matched_gh) + #debug_print('matched-cost:', matched_cost) + #debug_print('pending-u:', pending_u) + #debug_print('pending-v:', pending_v) + #debug_print(Cv.C) + #assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u)) + #assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v)) + #debug_print('pending-g:', pending_g) + #debug_print('pending-h:', pending_h) + #debug_print(Ce.C) + #assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g)) + #assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h)) + #debug_print() + + if prune(matched_cost + Cv.ls + Ce.ls): + return + + if not max(len(pending_u), len(pending_v)): + #assert not len(pending_g) + #assert not len(pending_h) + # path completed! 
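# All vertices, and therefore all edges, are mapped at this point, so
# matched_cost is the cost of a complete edit path; maxcost is tightened
# below so that the remaining branches prune against the best complete
# path found so far.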
+ #assert matched_cost <= maxcost.value + maxcost.value = min(maxcost.value, matched_cost) + yield matched_uv, matched_gh, matched_cost + + else: + edit_ops = get_edit_ops(matched_uv, pending_u, pending_v, Cv, + pending_g, pending_h, Ce, matched_cost) + for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops: + i, j = ij + #assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost + if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls): + continue + + # dive deeper + u = pending_u.pop(i) if i < len(pending_u) else None + v = pending_v.pop(j) if j < len(pending_v) else None + matched_uv.append((u, v)) + for x, y in xy: + len_g = len(pending_g) + len_h = len(pending_h) + matched_gh.append((pending_g[x] if x < len_g else None, + pending_h[y] if y < len_h else None)) + sortedx = list(sorted(x for x, y in xy)) + sortedy = list(sorted(y for x, y in xy)) + G = list((pending_g.pop(x) if x < len(pending_g) else None) + for x in reversed(sortedx)) + H = list((pending_h.pop(y) if y < len(pending_h) else None) + for y in reversed(sortedy)) + + # yield from + for t in get_edit_paths(matched_uv, pending_u, pending_v, + Cv_ij, + matched_gh, pending_g, pending_h, + Ce_xy, + matched_cost + edit_cost): + yield t + + # backtrack + if u is not None: + pending_u.insert(i, u) + if v is not None: + pending_v.insert(j, v) + matched_uv.pop() + for x, g in zip(sortedx, reversed(G)): + if g is not None: + pending_g.insert(x, g) + for y, h in zip(sortedy, reversed(H)): + if h is not None: + pending_h.insert(y, h) + for t in xy: + matched_gh.pop() + + # Initialization + + pending_u = list(G1.nodes) + pending_v = list(G2.nodes) + + # cost matrix of vertex mappings + m = len(pending_u) + n = len(pending_v) + C = np.zeros((m + n, m + n)) + if node_subst_cost: + C[0:m, 0:n] = np.array([node_subst_cost(G1.nodes[u], G2.nodes[v]) + for u in pending_u for v in pending_v] + ).reshape(m, n) + elif node_match: + C[0:m, 0:n] = np.array([1 - int(node_match(G1.nodes[u], G2.nodes[v])) + for u in pending_u for v in pending_v] + ).reshape(m, n) + else: + # all zeroes + pass + #assert not min(m, n) or C[0:m, 0:n].min() >= 0 + if node_del_cost: + del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] + else: + del_costs = [1] * len(pending_u) + #assert not m or min(del_costs) >= 0 + if node_ins_cost: + ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] + else: + ins_costs = [1] * len(pending_v) + #assert not n or min(ins_costs) >= 0 + inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 + C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf + for i in range(m) for j in range(m)] + ).reshape(m, m) + C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf + for i in range(n) for j in range(n)] + ).reshape(n, n) + Cv = make_CostMatrix(C, m, n) + #debug_print('Cv: {} x {}'.format(m, n)) + #debug_print(Cv.C) + + pending_g = list(G1.edges) + pending_h = list(G2.edges) + + # cost matrix of edge mappings + m = len(pending_g) + n = len(pending_h) + C = np.zeros((m + n, m + n)) + if edge_subst_cost: + C[0:m, 0:n] = np.array([edge_subst_cost(G1.edges[g], G2.edges[h]) + for g in pending_g for h in pending_h] + ).reshape(m, n) + elif edge_match: + C[0:m, 0:n] = np.array([1 - int(edge_match(G1.edges[g], G2.edges[h])) + for g in pending_g for h in pending_h] + ).reshape(m, n) + else: + # all zeroes + pass + #assert not min(m, n) or C[0:m, 0:n].min() >= 0 + if edge_del_cost: + del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] + else: + del_costs = [1] * len(pending_g) + #assert not m or min(del_costs) >= 0 + if 
edge_ins_cost: + ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] + else: + ins_costs = [1] * len(pending_h) + #assert not n or min(ins_costs) >= 0 + inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 + C[0:m, n:n + m] = np.array([del_costs[i] if i == j else inf + for i in range(m) for j in range(m)] + ).reshape(m, m) + C[m:m + n, 0:n] = np.array([ins_costs[i] if i == j else inf + for i in range(n) for j in range(n)] + ).reshape(n, n) + Ce = make_CostMatrix(C, m, n) + #debug_print('Ce: {} x {}'.format(m, n)) + #debug_print(Ce.C) + #debug_print() + + class MaxCost: + def __init__(self): + # initial upper-bound estimate + # NOTE: should work for empty graph + self.value = Cv.C.sum() + Ce.C.sum() + 1 + maxcost = MaxCost() + + def prune(cost): + if upper_bound is not None: + if cost > upper_bound: + return True + if cost > maxcost.value: + return True + elif strictly_decreasing and cost >= maxcost.value: + return True + + # Now go! + + for vertex_path, edge_path, cost in \ + get_edit_paths([], pending_u, pending_v, Cv, + [], pending_g, pending_h, Ce, 0): + #assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None) + #assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None) + #assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None) + #assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None) + #print(vertex_path, edge_path, cost, file = sys.stderr) + #assert cost == maxcost.value + yield list(vertex_path), list(edge_path), cost + + +def _is_close(d1, d2, atolerance=0, rtolerance=0): + """Determines whether two adjacency matrices are within + a provided tolerance. + + Parameters + ---------- + d1 : dict + Adjacency dictionary + + d2 : dict + Adjacency dictionary + + atolerance : float + Some scalar tolerance value to determine closeness + + rtolerance : float + A scalar tolerance value that will be some proportion + of ``d2``'s value + + Returns + ------- + closeness : bool + If all of the nodes within ``d1`` and ``d2`` are within + a predefined tolerance, they are considered "close" and + this method will return True. Otherwise, this method will + return False. + + """ + # Pre-condition: d1 and d2 have the same keys at each level if they + # are dictionaries. + if not isinstance(d1, dict) and not isinstance(d2, dict): + return abs(d1 - d2) <= atolerance + rtolerance * abs(d2) + return all(all(_is_close(d1[u][v], d2[u][v]) for v in d1[u]) for u in d1) + + +def simrank_similarity(G, source=None, target=None, importance_factor=0.9, + max_iterations=100, tolerance=1e-4): + """Returns the SimRank similarity of nodes in the graph ``G``. + + SimRank is a similarity metric that says "two objects are considered + to be similar if they are referenced by similar objects." [1]_. + + The pseudo-code definition from the paper is:: + + def simrank(G, u, v): + in_neighbors_u = G.predecessors(u) + in_neighbors_v = G.predecessors(v) + scale = C / (len(in_neighbors_u) * len(in_neighbors_v)) + return scale * sum(simrank(G, w, x) + for w, x in product(in_neighbors_u, + in_neighbors_v)) + + where ``G`` is the graph, ``u`` is the source, ``v`` is the target, + and ``C`` is a float decay or importance factor between 0 and 1. + + The SimRank algorithm for determining node similarity is defined in + [2]_. + + Parameters + ---------- + G : NetworkX graph + A NetworkX graph + + source : node + If this is specified, the returned dictionary maps each node + ``v`` in the graph to the similarity between ``source`` and + ``v``. 
+ + target : node + If both ``source`` and ``target`` are specified, the similarity + value between ``source`` and ``target`` is returned. If + ``target`` is specified but ``source`` is not, this argument is + ignored. + + importance_factor : float + The relative importance of indirect neighbors with respect to + direct neighbors. + + max_iterations : integer + Maximum number of iterations. + + tolerance : float + Error tolerance used to check convergence. When an iteration of + the algorithm finds that no similarity value changes more than + this amount, the algorithm halts. + + Returns + ------- + similarity : dictionary or float + If ``source`` and ``target`` are both ``None``, this returns a + dictionary of dictionaries, where keys are node pairs and value + are similarity of the pair of nodes. + + If ``source`` is not ``None`` but ``target`` is, this returns a + dictionary mapping node to the similarity of ``source`` and that + node. + + If neither ``source`` nor ``target`` is ``None``, this returns + the similarity value for the given pair of nodes. + + Examples + -------- + If the nodes of the graph are numbered from zero to *n - 1*, where *n* + is the number of nodes in the graph, you can create a SimRank matrix + from the return value of this function where the node numbers are + the row and column indices of the matrix:: + + >>> import networkx as nx + >>> from numpy import array + >>> G = nx.cycle_graph(4) + >>> sim = nx.simrank_similarity(G) + >>> lol = [[sim[u][v] for v in sorted(sim[u])] for u in sorted(sim)] + >>> sim_array = array(lol) + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/SimRank + .. [2] G. Jeh and J. Widom. + "SimRank: a measure of structural-context similarity", + In KDD'02: Proceedings of the Eighth ACM SIGKDD + International Conference on Knowledge Discovery and Data Mining, + pp. 538--543. ACM Press, 2002. + """ + prevsim = None + + # build up our similarity adjacency dictionary output + newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G} + + # These functions compute the update to the similarity value of the nodes + # `u` and `v` with respect to the previous similarity values. + avg_sim = lambda s: sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0 + sim = lambda u, v: importance_factor * avg_sim(list(product(G[u], G[v]))) + + for _ in range(max_iterations): + if prevsim and _is_close(prevsim, newsim, tolerance): + break + prevsim = newsim + newsim = {u: {v: sim(u, v) if u is not v else 1 + for v in newsim[u]} for u in newsim} + + if source is not None and target is not None: + return newsim[source][target] + if source is not None: + return newsim[source] + return newsim + + +def simrank_similarity_numpy(G, source=None, target=None, importance_factor=0.9, + max_iterations=100, tolerance=1e-4): + """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. + + The SimRank algorithm for determining node similarity is defined in + [1]_. + + Parameters + ---------- + G : NetworkX graph + A NetworkX graph + + source : node + If this is specified, the returned dictionary maps each node + ``v`` in the graph to the similarity between ``source`` and + ``v``. + + target : node + If both ``source`` and ``target`` are specified, the similarity + value between ``source`` and ``target`` is returned. If + ``target`` is specified but ``source`` is not, this argument is + ignored. + + importance_factor : float + The relative importance of indirect neighbors with respect to + direct neighbors. 
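A sketch of querying a single node or a single pair instead of the full dictionary of dictionaries::

    import networkx as nx

    G = nx.cycle_graph(4)

    nx.simrank_similarity(G, source=0)            # dict: similarity of node 0 to every node
    nx.simrank_similarity(G, source=0, target=2)  # float: similarity of the pair (0, 2)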
+ + max_iterations : integer + Maximum number of iterations. + + tolerance : float + Error tolerance used to check convergence. When an iteration of + the algorithm finds that no similarity value changes more than + this amount, the algorithm halts. + + Returns + ------- + similarity : dictionary or float + If ``source`` and ``target`` are both ``None``, this returns a + dictionary of dictionaries, where keys are node pairs and value + are similarity of the pair of nodes. + + If ``source`` is not ``None`` but ``target`` is, this returns a + dictionary mapping node to the similarity of ``source`` and that + node. + + If neither ``source`` nor ``target`` is ``None``, this returns + the similarity value for the given pair of nodes. + + Examples + -------- + >>> import networkx as nx + >>> from numpy import array + >>> G = nx.cycle_graph(4) + >>> sim = nx.simrank_similarity_numpy(G) + + References + ---------- + .. [1] G. Jeh and J. Widom. + "SimRank: a measure of structural-context similarity", + In KDD'02: Proceedings of the Eighth ACM SIGKDD + International Conference on Knowledge Discovery and Data Mining, + pp. 538--543. ACM Press, 2002. + """ + # This algorithm follows roughly + # + # S = max{C * (A.T * S * A), I} + # + # where C is the importance factor, A is the column normalized + # adjacency matrix, and I is the identity matrix. + import numpy as np + adjacency_matrix = nx.to_numpy_array(G) + + # column-normalize the ``adjacency_matrix`` + adjacency_matrix /= adjacency_matrix.sum(axis=0) + + newsim = np.eye(adjacency_matrix.shape[0], dtype=np.float64) + for _ in range(max_iterations): + prevsim = np.copy(newsim) + newsim = importance_factor * np.matmul( + np.matmul(adjacency_matrix.T, prevsim), adjacency_matrix) + np.fill_diagonal(newsim, 1.0) + + if np.allclose(prevsim, newsim, atol=tolerance): + break + + if source is not None and target is not None: + return newsim[source, target] + if source is not None: + return newsim[source] + return newsim + + +# fixture for pytest +def setup_module(module): + import pytest + numpy = pytest.importorskip('numpy') + scipy = pytest.importorskip('scipy')
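Since the ``numpy`` variant works on an adjacency-matrix representation, the result with no ``source``/``target`` is an array whose row and column indices are node positions (in this sketch the nodes are already numbered 0..n-1)::

    import networkx as nx

    G = nx.cycle_graph(4)
    sim = nx.simrank_similarity_numpy(G)   # (n x n) array of pairwise similarities
    sim[0, 2]                              # similarity of nodes 0 and 2
    sim[0]                                 # similarities of node 0 to every node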