1""" Functions measuring similarity using graph edit distance. 2 3The graph edit distance is the number of edge/node changes needed 4to make two graphs isomorphic. 5 6The default algorithm/implementation is sub-optimal for some graphs. 7The problem of finding the exact Graph Edit Distance (GED) is NP-hard 8so it is often slow. If the simple interface `graph_edit_distance` 9takes too long for your graph, try `optimize_graph_edit_distance` 10and/or `optimize_edit_paths`. 11 12At the same time, I encourage capable people to investigate 13alternative GED algorithms, in order to improve the choices available. 14""" 15 16from functools import reduce 17from itertools import product 18import math 19from operator import mul 20import time 21import warnings 22import networkx as nx 23 24__all__ = [ 25 "graph_edit_distance", 26 "optimal_edit_paths", 27 "optimize_graph_edit_distance", 28 "optimize_edit_paths", 29 "simrank_similarity", 30 "simrank_similarity_numpy", 31 "panther_similarity", 32 "generate_random_paths", 33] 34 35 36def debug_print(*args, **kwargs): 37 print(*args, **kwargs) 38 39 40def graph_edit_distance( 41 G1, 42 G2, 43 node_match=None, 44 edge_match=None, 45 node_subst_cost=None, 46 node_del_cost=None, 47 node_ins_cost=None, 48 edge_subst_cost=None, 49 edge_del_cost=None, 50 edge_ins_cost=None, 51 roots=None, 52 upper_bound=None, 53 timeout=None, 54): 55 """Returns GED (graph edit distance) between graphs G1 and G2. 56 57 Graph edit distance is a graph similarity measure analogous to 58 Levenshtein distance for strings. It is defined as minimum cost 59 of edit path (sequence of node and edge edit operations) 60 transforming graph G1 to graph isomorphic to G2. 61 62 Parameters 63 ---------- 64 G1, G2: graphs 65 The two graphs G1 and G2 must be of the same type. 66 67 node_match : callable 68 A function that returns True if node n1 in G1 and n2 in G2 69 should be considered equal during matching. 
70 71 The function will be called like 72 73 node_match(G1.nodes[n1], G2.nodes[n2]). 74 75 That is, the function will receive the node attribute 76 dictionaries for n1 and n2 as inputs. 77 78 Ignored if node_subst_cost is specified. If neither 79 node_match nor node_subst_cost are specified then node 80 attributes are not considered. 81 82 edge_match : callable 83 A function that returns True if the edge attribute dictionaries 84 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 85 be considered equal during matching. 86 87 The function will be called like 88 89 edge_match(G1[u1][v1], G2[u2][v2]). 90 91 That is, the function will receive the edge attribute 92 dictionaries of the edges under consideration. 93 94 Ignored if edge_subst_cost is specified. If neither 95 edge_match nor edge_subst_cost are specified then edge 96 attributes are not considered. 97 98 node_subst_cost, node_del_cost, node_ins_cost : callable 99 Functions that return the costs of node substitution, node 100 deletion, and node insertion, respectively. 101 102 The functions will be called like 103 104 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 105 node_del_cost(G1.nodes[n1]), 106 node_ins_cost(G2.nodes[n2]). 107 108 That is, the functions will receive the node attribute 109 dictionaries as inputs. The functions are expected to return 110 positive numeric values. 111 112 Function node_subst_cost overrides node_match if specified. 113 If neither node_match nor node_subst_cost are specified then 114 default node substitution cost of 0 is used (node attributes 115 are not considered during matching). 116 117 If node_del_cost is not specified then default node deletion 118 cost of 1 is used. If node_ins_cost is not specified then 119 default node insertion cost of 1 is used. 120 121 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 122 Functions that return the costs of edge substitution, edge 123 deletion, and edge insertion, respectively. 
124 125 The functions will be called like 126 127 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 128 edge_del_cost(G1[u1][v1]), 129 edge_ins_cost(G2[u2][v2]). 130 131 That is, the functions will receive the edge attribute 132 dictionaries as inputs. The functions are expected to return 133 positive numeric values. 134 135 Function edge_subst_cost overrides edge_match if specified. 136 If neither edge_match nor edge_subst_cost are specified then 137 default edge substitution cost of 0 is used (edge attributes 138 are not considered during matching). 139 140 If edge_del_cost is not specified then default edge deletion 141 cost of 1 is used. If edge_ins_cost is not specified then 142 default edge insertion cost of 1 is used. 143 144 roots : 2-tuple 145 Tuple where first element is a node in G1 and the second 146 is a node in G2. 147 These nodes are forced to be matched in the comparison to 148 allow comparison between rooted graphs. 149 150 upper_bound : numeric 151 Maximum edit distance to consider. Return None if no edit 152 distance under or equal to upper_bound exists. 153 154 timeout : numeric 155 Maximum number of seconds to execute. 156 After timeout is met, the current best GED is returned. 157 158 Examples 159 -------- 160 >>> G1 = nx.cycle_graph(6) 161 >>> G2 = nx.wheel_graph(7) 162 >>> nx.graph_edit_distance(G1, G2) 163 7.0 164 165 >>> G1 = nx.star_graph(5) 166 >>> G2 = nx.star_graph(5) 167 >>> nx.graph_edit_distance(G1, G2, roots=(0, 0)) 168 0.0 169 >>> nx.graph_edit_distance(G1, G2, roots=(1, 0)) 170 8.0 171 172 See Also 173 -------- 174 optimal_edit_paths, optimize_graph_edit_distance, 175 176 is_isomorphic: test for graph edit distance of 0 177 178 References 179 ---------- 180 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 181 Martineau. An Exact Graph Edit Distance Algorithm for Solving 182 Pattern Recognition Problems. 4th International Conference on 183 Pattern Recognition Applications and Methods 2015, Jan 2015, 184 Lisbon, Portugal. 
2015, 185 <10.5220/0005209202710278>. <hal-01168816> 186 https://hal.archives-ouvertes.fr/hal-01168816 187 188 """ 189 bestcost = None 190 for vertex_path, edge_path, cost in optimize_edit_paths( 191 G1, 192 G2, 193 node_match, 194 edge_match, 195 node_subst_cost, 196 node_del_cost, 197 node_ins_cost, 198 edge_subst_cost, 199 edge_del_cost, 200 edge_ins_cost, 201 upper_bound, 202 True, 203 roots, 204 timeout, 205 ): 206 # assert bestcost is None or cost < bestcost 207 bestcost = cost 208 return bestcost 209 210 211def optimal_edit_paths( 212 G1, 213 G2, 214 node_match=None, 215 edge_match=None, 216 node_subst_cost=None, 217 node_del_cost=None, 218 node_ins_cost=None, 219 edge_subst_cost=None, 220 edge_del_cost=None, 221 edge_ins_cost=None, 222 upper_bound=None, 223): 224 """Returns all minimum-cost edit paths transforming G1 to G2. 225 226 Graph edit path is a sequence of node and edge edit operations 227 transforming graph G1 to graph isomorphic to G2. Edit operations 228 include substitutions, deletions, and insertions. 229 230 Parameters 231 ---------- 232 G1, G2: graphs 233 The two graphs G1 and G2 must be of the same type. 234 235 node_match : callable 236 A function that returns True if node n1 in G1 and n2 in G2 237 should be considered equal during matching. 238 239 The function will be called like 240 241 node_match(G1.nodes[n1], G2.nodes[n2]). 242 243 That is, the function will receive the node attribute 244 dictionaries for n1 and n2 as inputs. 245 246 Ignored if node_subst_cost is specified. If neither 247 node_match nor node_subst_cost are specified then node 248 attributes are not considered. 249 250 edge_match : callable 251 A function that returns True if the edge attribute dictionaries 252 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 253 be considered equal during matching. 254 255 The function will be called like 256 257 edge_match(G1[u1][v1], G2[u2][v2]). 
258 259 That is, the function will receive the edge attribute 260 dictionaries of the edges under consideration. 261 262 Ignored if edge_subst_cost is specified. If neither 263 edge_match nor edge_subst_cost are specified then edge 264 attributes are not considered. 265 266 node_subst_cost, node_del_cost, node_ins_cost : callable 267 Functions that return the costs of node substitution, node 268 deletion, and node insertion, respectively. 269 270 The functions will be called like 271 272 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 273 node_del_cost(G1.nodes[n1]), 274 node_ins_cost(G2.nodes[n2]). 275 276 That is, the functions will receive the node attribute 277 dictionaries as inputs. The functions are expected to return 278 positive numeric values. 279 280 Function node_subst_cost overrides node_match if specified. 281 If neither node_match nor node_subst_cost are specified then 282 default node substitution cost of 0 is used (node attributes 283 are not considered during matching). 284 285 If node_del_cost is not specified then default node deletion 286 cost of 1 is used. If node_ins_cost is not specified then 287 default node insertion cost of 1 is used. 288 289 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 290 Functions that return the costs of edge substitution, edge 291 deletion, and edge insertion, respectively. 292 293 The functions will be called like 294 295 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 296 edge_del_cost(G1[u1][v1]), 297 edge_ins_cost(G2[u2][v2]). 298 299 That is, the functions will receive the edge attribute 300 dictionaries as inputs. The functions are expected to return 301 positive numeric values. 302 303 Function edge_subst_cost overrides edge_match if specified. 304 If neither edge_match nor edge_subst_cost are specified then 305 default edge substitution cost of 0 is used (edge attributes 306 are not considered during matching). 307 308 If edge_del_cost is not specified then default edge deletion 309 cost of 1 is used. 
If edge_ins_cost is not specified then 310 default edge insertion cost of 1 is used. 311 312 upper_bound : numeric 313 Maximum edit distance to consider. 314 315 Returns 316 ------- 317 edit_paths : list of tuples (node_edit_path, edge_edit_path) 318 node_edit_path : list of tuples (u, v) 319 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) 320 321 cost : numeric 322 Optimal edit path cost (graph edit distance). 323 324 Examples 325 -------- 326 >>> G1 = nx.cycle_graph(4) 327 >>> G2 = nx.wheel_graph(5) 328 >>> paths, cost = nx.optimal_edit_paths(G1, G2) 329 >>> len(paths) 330 40 331 >>> cost 332 5.0 333 334 See Also 335 -------- 336 graph_edit_distance, optimize_edit_paths 337 338 References 339 ---------- 340 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 341 Martineau. An Exact Graph Edit Distance Algorithm for Solving 342 Pattern Recognition Problems. 4th International Conference on 343 Pattern Recognition Applications and Methods 2015, Jan 2015, 344 Lisbon, Portugal. 2015, 345 <10.5220/0005209202710278>. 
<hal-01168816> 346 https://hal.archives-ouvertes.fr/hal-01168816 347 348 """ 349 paths = list() 350 bestcost = None 351 for vertex_path, edge_path, cost in optimize_edit_paths( 352 G1, 353 G2, 354 node_match, 355 edge_match, 356 node_subst_cost, 357 node_del_cost, 358 node_ins_cost, 359 edge_subst_cost, 360 edge_del_cost, 361 edge_ins_cost, 362 upper_bound, 363 False, 364 ): 365 # assert bestcost is None or cost <= bestcost 366 if bestcost is not None and cost < bestcost: 367 paths = list() 368 paths.append((vertex_path, edge_path)) 369 bestcost = cost 370 return paths, bestcost 371 372 373def optimize_graph_edit_distance( 374 G1, 375 G2, 376 node_match=None, 377 edge_match=None, 378 node_subst_cost=None, 379 node_del_cost=None, 380 node_ins_cost=None, 381 edge_subst_cost=None, 382 edge_del_cost=None, 383 edge_ins_cost=None, 384 upper_bound=None, 385): 386 """Returns consecutive approximations of GED (graph edit distance) 387 between graphs G1 and G2. 388 389 Graph edit distance is a graph similarity measure analogous to 390 Levenshtein distance for strings. It is defined as minimum cost 391 of edit path (sequence of node and edge edit operations) 392 transforming graph G1 to graph isomorphic to G2. 393 394 Parameters 395 ---------- 396 G1, G2: graphs 397 The two graphs G1 and G2 must be of the same type. 398 399 node_match : callable 400 A function that returns True if node n1 in G1 and n2 in G2 401 should be considered equal during matching. 402 403 The function will be called like 404 405 node_match(G1.nodes[n1], G2.nodes[n2]). 406 407 That is, the function will receive the node attribute 408 dictionaries for n1 and n2 as inputs. 409 410 Ignored if node_subst_cost is specified. If neither 411 node_match nor node_subst_cost are specified then node 412 attributes are not considered. 
413 414 edge_match : callable 415 A function that returns True if the edge attribute dictionaries 416 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 417 be considered equal during matching. 418 419 The function will be called like 420 421 edge_match(G1[u1][v1], G2[u2][v2]). 422 423 That is, the function will receive the edge attribute 424 dictionaries of the edges under consideration. 425 426 Ignored if edge_subst_cost is specified. If neither 427 edge_match nor edge_subst_cost are specified then edge 428 attributes are not considered. 429 430 node_subst_cost, node_del_cost, node_ins_cost : callable 431 Functions that return the costs of node substitution, node 432 deletion, and node insertion, respectively. 433 434 The functions will be called like 435 436 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 437 node_del_cost(G1.nodes[n1]), 438 node_ins_cost(G2.nodes[n2]). 439 440 That is, the functions will receive the node attribute 441 dictionaries as inputs. The functions are expected to return 442 positive numeric values. 443 444 Function node_subst_cost overrides node_match if specified. 445 If neither node_match nor node_subst_cost are specified then 446 default node substitution cost of 0 is used (node attributes 447 are not considered during matching). 448 449 If node_del_cost is not specified then default node deletion 450 cost of 1 is used. If node_ins_cost is not specified then 451 default node insertion cost of 1 is used. 452 453 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 454 Functions that return the costs of edge substitution, edge 455 deletion, and edge insertion, respectively. 456 457 The functions will be called like 458 459 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 460 edge_del_cost(G1[u1][v1]), 461 edge_ins_cost(G2[u2][v2]). 462 463 That is, the functions will receive the edge attribute 464 dictionaries as inputs. The functions are expected to return 465 positive numeric values. 
466 467 Function edge_subst_cost overrides edge_match if specified. 468 If neither edge_match nor edge_subst_cost are specified then 469 default edge substitution cost of 0 is used (edge attributes 470 are not considered during matching). 471 472 If edge_del_cost is not specified then default edge deletion 473 cost of 1 is used. If edge_ins_cost is not specified then 474 default edge insertion cost of 1 is used. 475 476 upper_bound : numeric 477 Maximum edit distance to consider. 478 479 Returns 480 ------- 481 Generator of consecutive approximations of graph edit distance. 482 483 Examples 484 -------- 485 >>> G1 = nx.cycle_graph(6) 486 >>> G2 = nx.wheel_graph(7) 487 >>> for v in nx.optimize_graph_edit_distance(G1, G2): 488 ... minv = v 489 >>> minv 490 7.0 491 492 See Also 493 -------- 494 graph_edit_distance, optimize_edit_paths 495 496 References 497 ---------- 498 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 499 Martineau. An Exact Graph Edit Distance Algorithm for Solving 500 Pattern Recognition Problems. 4th International Conference on 501 Pattern Recognition Applications and Methods 2015, Jan 2015, 502 Lisbon, Portugal. 2015, 503 <10.5220/0005209202710278>. <hal-01168816> 504 https://hal.archives-ouvertes.fr/hal-01168816 505 """ 506 for vertex_path, edge_path, cost in optimize_edit_paths( 507 G1, 508 G2, 509 node_match, 510 edge_match, 511 node_subst_cost, 512 node_del_cost, 513 node_ins_cost, 514 edge_subst_cost, 515 edge_del_cost, 516 edge_ins_cost, 517 upper_bound, 518 True, 519 ): 520 yield cost 521 522 523def optimize_edit_paths( 524 G1, 525 G2, 526 node_match=None, 527 edge_match=None, 528 node_subst_cost=None, 529 node_del_cost=None, 530 node_ins_cost=None, 531 edge_subst_cost=None, 532 edge_del_cost=None, 533 edge_ins_cost=None, 534 upper_bound=None, 535 strictly_decreasing=True, 536 roots=None, 537 timeout=None, 538): 539 """GED (graph edit distance) calculation: advanced interface. 
540 541 Graph edit path is a sequence of node and edge edit operations 542 transforming graph G1 to graph isomorphic to G2. Edit operations 543 include substitutions, deletions, and insertions. 544 545 Graph edit distance is defined as minimum cost of edit path. 546 547 Parameters 548 ---------- 549 G1, G2: graphs 550 The two graphs G1 and G2 must be of the same type. 551 552 node_match : callable 553 A function that returns True if node n1 in G1 and n2 in G2 554 should be considered equal during matching. 555 556 The function will be called like 557 558 node_match(G1.nodes[n1], G2.nodes[n2]). 559 560 That is, the function will receive the node attribute 561 dictionaries for n1 and n2 as inputs. 562 563 Ignored if node_subst_cost is specified. If neither 564 node_match nor node_subst_cost are specified then node 565 attributes are not considered. 566 567 edge_match : callable 568 A function that returns True if the edge attribute dictionaries 569 for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should 570 be considered equal during matching. 571 572 The function will be called like 573 574 edge_match(G1[u1][v1], G2[u2][v2]). 575 576 That is, the function will receive the edge attribute 577 dictionaries of the edges under consideration. 578 579 Ignored if edge_subst_cost is specified. If neither 580 edge_match nor edge_subst_cost are specified then edge 581 attributes are not considered. 582 583 node_subst_cost, node_del_cost, node_ins_cost : callable 584 Functions that return the costs of node substitution, node 585 deletion, and node insertion, respectively. 586 587 The functions will be called like 588 589 node_subst_cost(G1.nodes[n1], G2.nodes[n2]), 590 node_del_cost(G1.nodes[n1]), 591 node_ins_cost(G2.nodes[n2]). 592 593 That is, the functions will receive the node attribute 594 dictionaries as inputs. The functions are expected to return 595 positive numeric values. 596 597 Function node_subst_cost overrides node_match if specified. 
598 If neither node_match nor node_subst_cost are specified then 599 default node substitution cost of 0 is used (node attributes 600 are not considered during matching). 601 602 If node_del_cost is not specified then default node deletion 603 cost of 1 is used. If node_ins_cost is not specified then 604 default node insertion cost of 1 is used. 605 606 edge_subst_cost, edge_del_cost, edge_ins_cost : callable 607 Functions that return the costs of edge substitution, edge 608 deletion, and edge insertion, respectively. 609 610 The functions will be called like 611 612 edge_subst_cost(G1[u1][v1], G2[u2][v2]), 613 edge_del_cost(G1[u1][v1]), 614 edge_ins_cost(G2[u2][v2]). 615 616 That is, the functions will receive the edge attribute 617 dictionaries as inputs. The functions are expected to return 618 positive numeric values. 619 620 Function edge_subst_cost overrides edge_match if specified. 621 If neither edge_match nor edge_subst_cost are specified then 622 default edge substitution cost of 0 is used (edge attributes 623 are not considered during matching). 624 625 If edge_del_cost is not specified then default edge deletion 626 cost of 1 is used. If edge_ins_cost is not specified then 627 default edge insertion cost of 1 is used. 628 629 upper_bound : numeric 630 Maximum edit distance to consider. 631 632 strictly_decreasing : bool 633 If True, return consecutive approximations of strictly 634 decreasing cost. Otherwise, return all edit paths of cost 635 less than or equal to the previous minimum cost. 636 637 roots : 2-tuple 638 Tuple where first element is a node in G1 and the second 639 is a node in G2. 640 These nodes are forced to be matched in the comparison to 641 allow comparison between rooted graphs. 642 643 timeout : numeric 644 Maximum number of seconds to execute. 645 After timeout is met, the current best GED is returned. 
646 647 Returns 648 ------- 649 Generator of tuples (node_edit_path, edge_edit_path, cost) 650 node_edit_path : list of tuples (u, v) 651 edge_edit_path : list of tuples ((u1, v1), (u2, v2)) 652 cost : numeric 653 654 See Also 655 -------- 656 graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths 657 658 References 659 ---------- 660 .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick 661 Martineau. An Exact Graph Edit Distance Algorithm for Solving 662 Pattern Recognition Problems. 4th International Conference on 663 Pattern Recognition Applications and Methods 2015, Jan 2015, 664 Lisbon, Portugal. 2015, 665 <10.5220/0005209202710278>. <hal-01168816> 666 https://hal.archives-ouvertes.fr/hal-01168816 667 668 """ 669 # TODO: support DiGraph 670 671 import numpy as np 672 import scipy as sp 673 import scipy.optimize # call as sp.optimize 674 675 class CostMatrix: 676 def __init__(self, C, lsa_row_ind, lsa_col_ind, ls): 677 # assert C.shape[0] == len(lsa_row_ind) 678 # assert C.shape[1] == len(lsa_col_ind) 679 # assert len(lsa_row_ind) == len(lsa_col_ind) 680 # assert set(lsa_row_ind) == set(range(len(lsa_row_ind))) 681 # assert set(lsa_col_ind) == set(range(len(lsa_col_ind))) 682 # assert ls == C[lsa_row_ind, lsa_col_ind].sum() 683 self.C = C 684 self.lsa_row_ind = lsa_row_ind 685 self.lsa_col_ind = lsa_col_ind 686 self.ls = ls 687 688 def make_CostMatrix(C, m, n): 689 # assert(C.shape == (m + n, m + n)) 690 lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C) 691 692 # Fixup dummy assignments: 693 # each substitution i<->j should have dummy assignment m+j<->n+i 694 # NOTE: fast reduce of Cv relies on it 695 # assert len(lsa_row_ind) == len(lsa_col_ind) 696 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) 697 subst_ind = list(k for k, i, j in indexes if i < m and j < n) 698 indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind) 699 dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n) 700 # 
assert len(subst_ind) == len(dummy_ind) 701 lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m 702 lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n 703 704 return CostMatrix( 705 C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum() 706 ) 707 708 def extract_C(C, i, j, m, n): 709 # assert(C.shape == (m + n, m + n)) 710 row_ind = [k in i or k - m in j for k in range(m + n)] 711 col_ind = [k in j or k - n in i for k in range(m + n)] 712 return C[row_ind, :][:, col_ind] 713 714 def reduce_C(C, i, j, m, n): 715 # assert(C.shape == (m + n, m + n)) 716 row_ind = [k not in i and k - m not in j for k in range(m + n)] 717 col_ind = [k not in j and k - n not in i for k in range(m + n)] 718 return C[row_ind, :][:, col_ind] 719 720 def reduce_ind(ind, i): 721 # assert set(ind) == set(range(len(ind))) 722 rind = ind[[k not in i for k in ind]] 723 for k in set(i): 724 rind[rind >= k] -= 1 725 return rind 726 727 def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]): 728 """ 729 Parameters: 730 u, v: matched vertices, u=None or v=None for 731 deletion/insertion 732 pending_g, pending_h: lists of edges not yet mapped 733 Ce: CostMatrix of pending edge mappings 734 matched_uv: partial vertex edit path 735 list of tuples (u, v) of previously matched vertex 736 mappings u<->v, u=None or v=None for 737 deletion/insertion 738 739 Returns: 740 list of (i, j): indices of edge mappings g<->h 741 localCe: local CostMatrix of edge mappings 742 (basically submatrix of Ce at cross of rows i, cols j) 743 """ 744 M = len(pending_g) 745 N = len(pending_h) 746 # assert Ce.C.shape == (M + N, M + N) 747 748 g_ind = [ 749 i 750 for i in range(M) 751 if pending_g[i][:2] == (u, u) 752 or any(pending_g[i][:2] in ((p, u), (u, p)) for p, q in matched_uv) 753 ] 754 h_ind = [ 755 j 756 for j in range(N) 757 if pending_h[j][:2] == (v, v) 758 or any(pending_h[j][:2] in ((q, v), (v, q)) for p, q in matched_uv) 759 ] 760 m = len(g_ind) 761 n = len(h_ind) 762 763 if m or n: 764 C = 
extract_C(Ce.C, g_ind, h_ind, M, N) 765 # assert C.shape == (m + n, m + n) 766 767 # Forbid structurally invalid matches 768 # NOTE: inf remembered from Ce construction 769 for k, i in zip(range(m), g_ind): 770 g = pending_g[i][:2] 771 for l, j in zip(range(n), h_ind): 772 h = pending_h[j][:2] 773 if nx.is_directed(G1) or nx.is_directed(G2): 774 if any( 775 g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q) 776 for p, q in matched_uv 777 ): 778 continue 779 else: 780 if any( 781 g in ((p, u), (u, p)) and h in ((q, v), (v, q)) 782 for p, q in matched_uv 783 ): 784 continue 785 if g == (u, u): 786 continue 787 if h == (v, v): 788 continue 789 C[k, l] = inf 790 791 localCe = make_CostMatrix(C, m, n) 792 ij = list( 793 ( 794 g_ind[k] if k < m else M + h_ind[l], 795 h_ind[l] if l < n else N + g_ind[k], 796 ) 797 for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind) 798 if k < m or l < n 799 ) 800 801 else: 802 ij = [] 803 localCe = CostMatrix(np.empty((0, 0)), [], [], 0) 804 805 return ij, localCe 806 807 def reduce_Ce(Ce, ij, m, n): 808 if len(ij): 809 i, j = zip(*ij) 810 m_i = m - sum(1 for t in i if t < m) 811 n_j = n - sum(1 for t in j if t < n) 812 return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j) 813 else: 814 return Ce 815 816 def get_edit_ops( 817 matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost 818 ): 819 """ 820 Parameters: 821 matched_uv: partial vertex edit path 822 list of tuples (u, v) of vertex mappings u<->v, 823 u=None or v=None for deletion/insertion 824 pending_u, pending_v: lists of vertices not yet mapped 825 Cv: CostMatrix of pending vertex mappings 826 pending_g, pending_h: lists of edges not yet mapped 827 Ce: CostMatrix of pending edge mappings 828 matched_cost: cost of partial edit path 829 830 Returns: 831 sequence of 832 (i, j): indices of vertex mapping u<->v 833 Cv_ij: reduced CostMatrix of pending vertex mappings 834 (basically Cv with row i, col j removed) 835 list of (x, y): indices of 
edge mappings g<->h 836 Ce_xy: reduced CostMatrix of pending edge mappings 837 (basically Ce with rows x, cols y removed) 838 cost: total cost of edit operation 839 NOTE: most promising ops first 840 """ 841 m = len(pending_u) 842 n = len(pending_v) 843 # assert Cv.C.shape == (m + n, m + n) 844 845 # 1) a vertex mapping from optimal linear sum assignment 846 i, j = min( 847 (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n 848 ) 849 xy, localCe = match_edges( 850 pending_u[i] if i < m else None, 851 pending_v[j] if j < n else None, 852 pending_g, 853 pending_h, 854 Ce, 855 matched_uv, 856 ) 857 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 858 # assert Ce.ls <= localCe.ls + Ce_xy.ls 859 if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls): 860 pass 861 else: 862 # get reduced Cv efficiently 863 Cv_ij = CostMatrix( 864 reduce_C(Cv.C, (i,), (j,), m, n), 865 reduce_ind(Cv.lsa_row_ind, (i, m + j)), 866 reduce_ind(Cv.lsa_col_ind, (j, n + i)), 867 Cv.ls - Cv.C[i, j], 868 ) 869 yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls 870 871 # 2) other candidates, sorted by lower-bound cost estimate 872 other = list() 873 fixed_i, fixed_j = i, j 874 if m <= n: 875 candidates = ( 876 (t, fixed_j) 877 for t in range(m + n) 878 if t != fixed_i and (t < m or t == m + fixed_j) 879 ) 880 else: 881 candidates = ( 882 (fixed_i, t) 883 for t in range(m + n) 884 if t != fixed_j and (t < n or t == n + fixed_i) 885 ) 886 for i, j in candidates: 887 if prune(matched_cost + Cv.C[i, j] + Ce.ls): 888 continue 889 Cv_ij = make_CostMatrix( 890 reduce_C(Cv.C, (i,), (j,), m, n), 891 m - 1 if i < m else m, 892 n - 1 if j < n else n, 893 ) 894 # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls 895 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls): 896 continue 897 xy, localCe = match_edges( 898 pending_u[i] if i < m else None, 899 pending_v[j] if j < n else None, 900 pending_g, 901 pending_h, 902 Ce, 903 matched_uv, 904 ) 905 if prune(matched_cost + Cv.C[i, j] + 
Cv_ij.ls + localCe.ls): 906 continue 907 Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h)) 908 # assert Ce.ls <= localCe.ls + Ce_xy.ls 909 if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls): 910 continue 911 other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls)) 912 913 yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls) 914 915 def get_edit_paths( 916 matched_uv, 917 pending_u, 918 pending_v, 919 Cv, 920 matched_gh, 921 pending_g, 922 pending_h, 923 Ce, 924 matched_cost, 925 ): 926 """ 927 Parameters: 928 matched_uv: partial vertex edit path 929 list of tuples (u, v) of vertex mappings u<->v, 930 u=None or v=None for deletion/insertion 931 pending_u, pending_v: lists of vertices not yet mapped 932 Cv: CostMatrix of pending vertex mappings 933 matched_gh: partial edge edit path 934 list of tuples (g, h) of edge mappings g<->h, 935 g=None or h=None for deletion/insertion 936 pending_g, pending_h: lists of edges not yet mapped 937 Ce: CostMatrix of pending edge mappings 938 matched_cost: cost of partial edit path 939 940 Returns: 941 sequence of (vertex_path, edge_path, cost) 942 vertex_path: complete vertex edit path 943 list of tuples (u, v) of vertex mappings u<->v, 944 u=None or v=None for deletion/insertion 945 edge_path: complete edge edit path 946 list of tuples (g, h) of edge mappings g<->h, 947 g=None or h=None for deletion/insertion 948 cost: total cost of edit path 949 NOTE: path costs are non-increasing 950 """ 951 # debug_print('matched-uv:', matched_uv) 952 # debug_print('matched-gh:', matched_gh) 953 # debug_print('matched-cost:', matched_cost) 954 # debug_print('pending-u:', pending_u) 955 # debug_print('pending-v:', pending_v) 956 # debug_print(Cv.C) 957 # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u)) 958 # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v)) 959 # 
debug_print('pending-g:', pending_g) 960 # debug_print('pending-h:', pending_h) 961 # debug_print(Ce.C) 962 # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g)) 963 # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h)) 964 # debug_print() 965 966 if prune(matched_cost + Cv.ls + Ce.ls): 967 return 968 969 if not max(len(pending_u), len(pending_v)): 970 # assert not len(pending_g) 971 # assert not len(pending_h) 972 # path completed! 973 # assert matched_cost <= maxcost.value 974 maxcost.value = min(maxcost.value, matched_cost) 975 yield matched_uv, matched_gh, matched_cost 976 977 else: 978 edit_ops = get_edit_ops( 979 matched_uv, 980 pending_u, 981 pending_v, 982 Cv, 983 pending_g, 984 pending_h, 985 Ce, 986 matched_cost, 987 ) 988 for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops: 989 i, j = ij 990 # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost 991 if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls): 992 continue 993 994 # dive deeper 995 u = pending_u.pop(i) if i < len(pending_u) else None 996 v = pending_v.pop(j) if j < len(pending_v) else None 997 matched_uv.append((u, v)) 998 for x, y in xy: 999 len_g = len(pending_g) 1000 len_h = len(pending_h) 1001 matched_gh.append( 1002 ( 1003 pending_g[x] if x < len_g else None, 1004 pending_h[y] if y < len_h else None, 1005 ) 1006 ) 1007 sortedx = list(sorted(x for x, y in xy)) 1008 sortedy = list(sorted(y for x, y in xy)) 1009 G = list( 1010 (pending_g.pop(x) if x < len(pending_g) else None) 1011 for x in reversed(sortedx) 1012 ) 1013 H = list( 1014 (pending_h.pop(y) if y < len(pending_h) else None) 1015 for y in reversed(sortedy) 1016 ) 1017 1018 yield from get_edit_paths( 1019 matched_uv, 1020 pending_u, 1021 pending_v, 1022 Cv_ij, 1023 matched_gh, 1024 pending_g, 1025 pending_h, 1026 Ce_xy, 1027 matched_cost + edit_cost, 1028 ) 1029 1030 # backtrack 1031 if u is not None: 1032 
pending_u.insert(i, u) 1033 if v is not None: 1034 pending_v.insert(j, v) 1035 matched_uv.pop() 1036 for x, g in zip(sortedx, reversed(G)): 1037 if g is not None: 1038 pending_g.insert(x, g) 1039 for y, h in zip(sortedy, reversed(H)): 1040 if h is not None: 1041 pending_h.insert(y, h) 1042 for t in xy: 1043 matched_gh.pop() 1044 1045 # Initialization 1046 1047 pending_u = list(G1.nodes) 1048 pending_v = list(G2.nodes) 1049 1050 initial_cost = 0 1051 if roots: 1052 root_u, root_v = roots 1053 if root_u not in pending_u or root_v not in pending_v: 1054 raise nx.NodeNotFound("Root node not in graph.") 1055 1056 # remove roots from pending 1057 pending_u.remove(root_u) 1058 pending_v.remove(root_v) 1059 1060 # cost matrix of vertex mappings 1061 m = len(pending_u) 1062 n = len(pending_v) 1063 C = np.zeros((m + n, m + n)) 1064 if node_subst_cost: 1065 C[0:m, 0:n] = np.array( 1066 [ 1067 node_subst_cost(G1.nodes[u], G2.nodes[v]) 1068 for u in pending_u 1069 for v in pending_v 1070 ] 1071 ).reshape(m, n) 1072 if roots: 1073 initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v]) 1074 elif node_match: 1075 C[0:m, 0:n] = np.array( 1076 [ 1077 1 - int(node_match(G1.nodes[u], G2.nodes[v])) 1078 for u in pending_u 1079 for v in pending_v 1080 ] 1081 ).reshape(m, n) 1082 if roots: 1083 initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v]) 1084 else: 1085 # all zeroes 1086 pass 1087 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 1088 if node_del_cost: 1089 del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u] 1090 else: 1091 del_costs = [1] * len(pending_u) 1092 # assert not m or min(del_costs) >= 0 1093 if node_ins_cost: 1094 ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v] 1095 else: 1096 ins_costs = [1] * len(pending_v) 1097 # assert not n or min(ins_costs) >= 0 1098 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 1099 C[0:m, n : n + m] = np.array( 1100 [del_costs[i] if i == j else inf for i in range(m) for j in 
range(m)] 1101 ).reshape(m, m) 1102 C[m : m + n, 0:n] = np.array( 1103 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 1104 ).reshape(n, n) 1105 Cv = make_CostMatrix(C, m, n) 1106 # debug_print(f"Cv: {m} x {n}") 1107 # debug_print(Cv.C) 1108 1109 pending_g = list(G1.edges) 1110 pending_h = list(G2.edges) 1111 1112 # cost matrix of edge mappings 1113 m = len(pending_g) 1114 n = len(pending_h) 1115 C = np.zeros((m + n, m + n)) 1116 if edge_subst_cost: 1117 C[0:m, 0:n] = np.array( 1118 [ 1119 edge_subst_cost(G1.edges[g], G2.edges[h]) 1120 for g in pending_g 1121 for h in pending_h 1122 ] 1123 ).reshape(m, n) 1124 elif edge_match: 1125 C[0:m, 0:n] = np.array( 1126 [ 1127 1 - int(edge_match(G1.edges[g], G2.edges[h])) 1128 for g in pending_g 1129 for h in pending_h 1130 ] 1131 ).reshape(m, n) 1132 else: 1133 # all zeroes 1134 pass 1135 # assert not min(m, n) or C[0:m, 0:n].min() >= 0 1136 if edge_del_cost: 1137 del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g] 1138 else: 1139 del_costs = [1] * len(pending_g) 1140 # assert not m or min(del_costs) >= 0 1141 if edge_ins_cost: 1142 ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h] 1143 else: 1144 ins_costs = [1] * len(pending_h) 1145 # assert not n or min(ins_costs) >= 0 1146 inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1 1147 C[0:m, n : n + m] = np.array( 1148 [del_costs[i] if i == j else inf for i in range(m) for j in range(m)] 1149 ).reshape(m, m) 1150 C[m : m + n, 0:n] = np.array( 1151 [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)] 1152 ).reshape(n, n) 1153 Ce = make_CostMatrix(C, m, n) 1154 # debug_print(f'Ce: {m} x {n}') 1155 # debug_print(Ce.C) 1156 # debug_print() 1157 1158 class MaxCost: 1159 def __init__(self): 1160 # initial upper-bound estimate 1161 # NOTE: should work for empty graph 1162 self.value = Cv.C.sum() + Ce.C.sum() + 1 1163 1164 maxcost = MaxCost() 1165 1166 if timeout is not None: 1167 if timeout <= 0: 1168 raise 
nx.NetworkXError("Timeout value must be greater than 0")
        start = time.perf_counter()

    def prune(cost):
        # True when the branch rooted at a partial path of cost `cost`
        # cannot improve on the best complete path found so far, or when
        # the timeout / caller-supplied upper bound is exceeded.
        # NOTE: falls through (implicitly returns None, i.e. falsy)
        # when the branch should NOT be pruned.
        if timeout is not None:
            if time.perf_counter() - start > timeout:
                return True
        if upper_bound is not None:
            if cost > upper_bound:
                return True
        if cost > maxcost.value:
            return True
        elif strictly_decreasing and cost >= maxcost.value:
            return True

    # Now go!

    # If roots were given, that single vertex mapping is fixed up front.
    done_uv = [] if roots is None else [roots]

    for vertex_path, edge_path, cost in get_edit_paths(
        done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
    ):
        # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
        # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
        # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
        # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
        # print(vertex_path, edge_path, cost, file = sys.stderr)
        # assert cost == maxcost.value
        yield list(vertex_path), list(edge_path), cost


def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    SimRank is a similarity metric that says "two objects are considered
    to be similar if they are referenced by similar objects." [1]_.

    The pseudo-code definition from the paper is::

        def simrank(G, u, v):
            in_neighbors_u = G.predecessors(u)
            in_neighbors_v = G.predecessors(v)
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
            return scale * sum(simrank(G, w, x)
                               for w, x in product(in_neighbors_u,
                                                   in_neighbors_v))

    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
    and ``C`` is a float decay or importance factor between 0 and 1.

    The SimRank algorithm for determining node similarity is defined in
    [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where keys are node pairs and value
        are similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping node to the similarity of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other ordering of nodes is also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    # Translate node labels to positional indices so the numpy backend
    # can address rows/columns of the similarity matrix directly.
    nodelist = list(G)
    s_indx = None if source is None else nodelist.index(source)
    t_indx = None if target is None else nodelist.index(target)

    x = _simrank_similarity_numpy(
        G, s_indx, t_indx, importance_factor, max_iterations, tolerance
    )

    # Convert the numpy result back to the dict-based return types this
    # public wrapper documents; a scalar (source and target both given)
    # passes through unchanged.
    if isinstance(x, np.ndarray):
        if x.ndim == 1:
            return {node: val for node, val in zip(G, x)}
        else:  # x.ndim == 2
            return {u: dict(zip(G, row)) for u, row in zip(G, x)}
    return x


def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        # Mean of previous-iteration similarities over the neighbor pairs
        # in `s`; 0.0 when either node has no (in-)neighbors.
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

    # SimRank is defined over in-neighbors for directed graphs; for
    # undirected graphs plain adjacency is used instead.
    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for its in range(max_iterations):
        oldsim = newsim
        # Diagonal is pinned to 1 (a node is maximally similar to itself).
        # NOTE(review): the `u is not v` identity test assumes each node
        # label is a single identical object wherever it appears in G --
        # holds for nodes stored once in the graph; verify for exotic keys.
        newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
        # Mixed absolute/relative tolerance check, elementwise.
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break

    # NOTE(review): if convergence happens exactly on the final allowed
    # iteration, this still raises -- confirm that is intended.
    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim


def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    The SimRank algorithm for determining node similarity is defined in
    [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : numpy matrix, numpy array or float
        If ``source`` and ``target`` are both ``None``, this returns a
        Matrix containing SimRank scores of the nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns an
        Array containing SimRank scores of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_numpy(G)
    array([[1., 0.],
           [0., 1.]])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0)
    array([1., 0.])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
    1.0

    References
    ----------
    .. [1] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
1453 """ 1454 # This algorithm follows roughly 1455 # 1456 # S = max{C * (A.T * S * A), I} 1457 # 1458 # where C is the importance factor, A is the column normalized 1459 # adjacency matrix, and I is the identity matrix. 1460 import numpy as np 1461 1462 adjacency_matrix = nx.to_numpy_array(G) 1463 1464 # column-normalize the ``adjacency_matrix`` 1465 s = np.array(adjacency_matrix.sum(axis=0)) 1466 s[s == 0] = 1 1467 adjacency_matrix /= s # adjacency_matrix.sum(axis=0) 1468 1469 newsim = np.eye(len(G), dtype=np.float64) 1470 for its in range(max_iterations): 1471 prevsim = newsim.copy() 1472 newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix) 1473 np.fill_diagonal(newsim, 1.0) 1474 1475 if np.allclose(prevsim, newsim, atol=tolerance): 1476 break 1477 1478 if its + 1 == max_iterations: 1479 raise nx.ExceededMaxIterations( 1480 f"simrank did not converge after {max_iterations} iterations." 1481 ) 1482 1483 if source is not None and target is not None: 1484 return newsim[source, target] 1485 if source is not None: 1486 return newsim[source] 1487 return newsim 1488 1489 1490def simrank_similarity_numpy( 1491 G, 1492 source=None, 1493 target=None, 1494 importance_factor=0.9, 1495 max_iterations=100, 1496 tolerance=1e-4, 1497): 1498 """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``. 1499 1500 .. deprecated:: 2.6 1501 simrank_similarity_numpy is deprecated and will be removed in networkx 3.0. 1502 Use simrank_similarity 1503 1504 """ 1505 warnings.warn( 1506 ( 1507 "networkx.simrank_similarity_numpy is deprecated and will be removed" 1508 "in NetworkX 3.0, use networkx.simrank_similarity instead." 
1509 ), 1510 DeprecationWarning, 1511 stacklevel=2, 1512 ) 1513 return _simrank_similarity_numpy( 1514 G, 1515 source, 1516 target, 1517 importance_factor, 1518 max_iterations, 1519 tolerance, 1520 ) 1521 1522 1523# TODO replace w/ math.comb(n, k) for Python 3.8+ 1524def _n_choose_k(n, k): 1525 """Pure Python implementation of the binomial coefficient 1526 1527 The general equation is n! / (k! * (n - k)!). The below 1528 implementation is a more efficient version. 1529 1530 Note: this will be removed in favor of Python 3.8's ``math.comb(n, k)``. 1531 1532 Parameters 1533 ---------- 1534 n : int 1535 Set of ``n`` elements 1536 k : int 1537 Unordered chosen subset of length ``k`` 1538 1539 Returns 1540 ------- 1541 binomial_coeff : int 1542 The number of ways (disregarding order) that k objects 1543 can be chosen from among n objects. 1544 1545 Examples 1546 -------- 1547 >>> _n_choose_k(5, 2) 1548 10 1549 >>> _n_choose_k(5, 4) 1550 5 1551 >>> _n_choose_k(100, 100) 1552 1 1553 1554 """ 1555 if k > n: 1556 return 0 1557 if n == k: 1558 return 1 1559 elif k < n - k: 1560 return reduce(mul, range(n - k + 1, n + 1)) // math.factorial(k) 1561 else: 1562 return reduce(mul, range(k + 1, n + 1)) // math.factorial(n - k) 1563 1564 1565def panther_similarity(G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None): 1566 r"""Returns the Panther similarity of nodes in the graph `G` to node ``v``. 1567 1568 Panther is a similarity metric that says "two objects are considered 1569 to be similar if they frequently appear on the same paths." [1]_. 

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    source : node
        Source node for which to find the top `k` similar other nodes
    k : int (default = 5)
        The number of most similar nodes to return
    path_length : int (default = 5)
        How long the randomly generated paths should be (``T`` in [1]_)
    c : float (default = 0.5)
        A universal positive constant used to scale the number
        of sample random paths to generate.
    delta : float (default = 0.1)
        The probability that the similarity $S$ is not an epsilon-approximation to (R, phi),
        where $R$ is the number of random paths and $\phi$ is the probability
        that an element sampled from a set $A \subseteq D$, where $D$ is the domain.
    eps : float or None (default = None)
        The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore,
        if no value is provided, the recommended computed value will be used.

    Returns
    -------
    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``source``) will not be included in
        the returned dictionary.

    Examples
    --------
    >>> G = nx.star_graph(10)
    >>> sim = nx.panther_similarity(G, 0)

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    num_nodes = G.number_of_nodes()
    # Clamp ``k`` when the caller asks for more similar nodes than exist.
    if num_nodes < k:
        warnings.warn(
            f"Number of nodes is {num_nodes}, but requested k is {k}. "
            "Setting k to number of nodes."
        )
        k = num_nodes
    # According to [1], they empirically determined
    # a good value for ``eps`` to be sqrt( 1 / |E| )
    if eps is None:
        eps = np.sqrt(1.0 / G.number_of_edges())

    # Map node labels <-> positional indices for the score array ``S``.
    inv_node_map = {name: index for index, name in enumerate(G.nodes)}
    node_map = np.array(G)

    # Calculate the sample size ``R`` for how many paths
    # to randomly generate
    t_choose_2 = _n_choose_k(path_length, 2)
    sample_size = int((c / eps ** 2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
    index_map = {}
    # Drain the generator for its side effect of populating ``index_map``
    # (node -> set of indices of the random paths containing that node).
    _ = list(
        generate_random_paths(
            G, sample_size, path_length=path_length, index_map=index_map
        )
    )
    S = np.zeros(num_nodes)

    inv_sample_size = 1 / sample_size

    # NOTE(review): raises KeyError if ``source`` never appeared on any
    # sampled path -- presumably rare for connected graphs with a large
    # enough sample; verify against callers.
    source_paths = set(index_map[source])

    # Calculate the path similarities
    # between ``source`` (v) and ``node`` (v_j)
    # using our inverted index mapping of
    # vertices to paths
    for node, paths in index_map.items():
        # Only consider paths where both
        # ``node`` and ``source`` are present
        common_paths = source_paths.intersection(paths)
        S[inv_node_map[node]] = len(common_paths) * inv_sample_size

    # Retrieve top ``k`` similar
    # Note: the below performed anywhere from 4-10x faster
    # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
    top_k_unsorted = np.argpartition(S, -k)[-k:]
    top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]

    # Add back the similarity scores
    top_k_sorted_names = map(lambda n: node_map[n], top_k_sorted)
    top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted]))

    # Remove the self-similarity
    top_k_with_val.pop(source, None)
    return top_k_with_val


def generate_random_paths(G, sample_size, path_length=5, index_map=None):
    """Randomly generate `sample_size` paths of length `path_length`.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph
    sample_size : integer
        The number of paths to generate. This is ``R`` in [1]_.
    path_length : integer (default = 5)
        The maximum size of the path to randomly generate.
        This is ``T`` in [1]_. According to the paper, ``T >= 5`` is
        recommended.
    index_map : dictionary, optional
        If provided, this will be populated with the inverted
        index of nodes mapped to the set of generated random path
        indices within ``paths``.

    Returns
    -------
    paths : generator of lists
        Generator of `sample_size` paths each with length `path_length`.

    Examples
    --------
    Note that the return value is the list of paths:

    >>> G = nx.star_graph(3)
    >>> random_path = nx.generate_random_paths(G, 2)

    By passing a dictionary into `index_map`, it will build an
    inverted index mapping of nodes to the paths in which that node is present:

    >>> G = nx.star_graph(3)
    >>> index_map = {}
    >>> random_path = nx.generate_random_paths(G, 3, index_map=index_map)
    >>> paths_containing_node_0 = [random_path[path_idx] for path_idx in index_map.get(0, [])]

    References
    ----------
    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
           Panther: Fast top-k similarity search on large networks.
           In Proceedings of the ACM SIGKDD International Conference
           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
    """
    import numpy as np

    # Calculate transition probabilities between
    # every pair of vertices according to Eq. (3)
    # NOTE(review): a zero-degree (isolated) node makes its row sum 0, so
    # ``np.reciprocal`` yields inf here and ``np.random.choice`` below
    # would fail on that row -- assumes G has no isolated nodes; confirm.
    adj_mat = nx.to_numpy_array(G)
    inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1)
    transition_probabilities = adj_mat * inv_row_sums

    node_map = np.array(G)
    num_nodes = G.number_of_nodes()

    for path_index in range(sample_size):
        # Sample current vertex v = v_i uniformly at random
        node_index = np.random.randint(0, high=num_nodes)
        node = node_map[node_index]

        # Add v into p_r and add p_r into the path set
        # of v, i.e., P_v
        path = [node]

        # Build the inverted index (P_v) of vertices to paths
        if index_map is not None:
            if node in index_map:
                index_map[node].add(path_index)
            else:
                index_map[node] = {path_index}

        starting_index = node_index
        for _ in range(path_length):
            # Randomly sample a neighbor (v_j) according
            # to transition probabilities from ``node`` (v) to its neighbors
            neighbor_index = np.random.choice(
                num_nodes, p=transition_probabilities[starting_index]
            )

            # Set current vertex (v = v_j)
            starting_index = neighbor_index

            # Add v into p_r
            neighbor_node = node_map[neighbor_index]
            path.append(neighbor_node)

            # Add p_r into P_v
            if index_map is not None:
                if neighbor_node in index_map:
                    index_map[neighbor_node].add(path_index)
                else:
                    index_map[neighbor_node] = {path_index}

        yield path