1""" Functions measuring similarity using graph edit distance.
2
3The graph edit distance is the number of edge/node changes needed
4to make two graphs isomorphic.
5
6The default algorithm/implementation is sub-optimal for some graphs.
7The problem of finding the exact Graph Edit Distance (GED) is NP-hard
8so it is often slow. If the simple interface `graph_edit_distance`
9takes too long for your graph, try `optimize_graph_edit_distance`
10and/or `optimize_edit_paths`.
11
12At the same time, I encourage capable people to investigate
13alternative GED algorithms, in order to improve the choices available.
14"""
15
16from functools import reduce
17from itertools import product
18import math
19from operator import mul
20import time
21import warnings
22import networkx as nx
23
24__all__ = [
25    "graph_edit_distance",
26    "optimal_edit_paths",
27    "optimize_graph_edit_distance",
28    "optimize_edit_paths",
29    "simrank_similarity",
30    "simrank_similarity_numpy",
31    "panther_similarity",
32    "generate_random_paths",
33]
34
35
36def debug_print(*args, **kwargs):
37    print(*args, **kwargs)
38
39
def graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    roots=None,
    upper_bound=None,
    timeout=None,
):
    """Returns GED (graph edit distance) between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum
    cost of an edit path (a sequence of node and edge edit operations)
    transforming graph G1 into a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified.  If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified.  If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used.  If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used.  If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    roots : 2-tuple
        Tuple where first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    upper_bound : numeric
        Maximum edit distance to consider.  Return None if no edit
        distance less than or equal to upper_bound exists.

    timeout : numeric
        Maximum number of seconds to execute.
        After timeout is met, the current best GED is returned.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> nx.graph_edit_distance(G1, G2)
    7.0

    >>> G1 = nx.star_graph(5)
    >>> G2 = nx.star_graph(5)
    >>> nx.graph_edit_distance(G1, G2, roots=(0, 0))
    0.0
    >>> nx.graph_edit_distance(G1, G2, roots=(1, 0))
    8.0

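    Node and edge attributes can be taken into account via the matching
    callables.  As an illustrative sketch (the ``color`` attribute below is
    purely hypothetical), one color mismatch costs one substitution:

    >>> G1 = nx.path_graph(3)
    >>> G2 = nx.path_graph(3)
    >>> nx.set_node_attributes(G1, {0: "red", 1: "blue", 2: "red"}, "color")
    >>> nx.set_node_attributes(G2, {0: "red", 1: "red", 2: "red"}, "color")
    >>> nx.graph_edit_distance(
    ...     G1, G2, node_match=lambda a, b: a["color"] == b["color"]
    ... )
    1.0
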
    See Also
    --------
    optimal_edit_paths, optimize_graph_edit_distance,

    is_isomorphic: test for graph edit distance of 0

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    bestcost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
        roots,
        timeout,
    ):
        # assert bestcost is None or cost < bestcost
        bestcost = cost
    return bestcost


def optimal_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns all minimum-cost edit paths transforming G1 to G2.

    A graph edit path is a sequence of node and edge edit operations
    transforming graph G1 into a graph isomorphic to G2.  Edit
    operations include substitutions, deletions, and insertions.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified.  If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified.  If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used.  If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used.  If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    edit_paths : list of tuples (node_edit_path, edge_edit_path)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))

    cost : numeric
        Optimal edit path cost (graph edit distance).

    Examples
    --------
    >>> G1 = nx.cycle_graph(4)
    >>> G2 = nx.wheel_graph(5)
    >>> paths, cost = nx.optimal_edit_paths(G1, G2)
    >>> len(paths)
    40
    >>> cost
    5.0

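    Each returned path is a pair ``(node_edit_path, edge_edit_path)``.  As a
    rough sketch of how to inspect one of them (continuing the example above,
    where every optimal path keeps all four nodes and inserts the hub):

    >>> node_path, edge_path = paths[0]
    >>> len(node_path), len(edge_path)
    (5, 8)
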
    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    paths = list()
    bestcost = None
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        False,
    ):
        # assert bestcost is None or cost <= bestcost
        if bestcost is not None and cost < bestcost:
            paths = list()
        paths.append((vertex_path, edge_path))
        bestcost = cost
    return paths, bestcost


def optimize_graph_edit_distance(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
):
    """Returns consecutive approximations of GED (graph edit distance)
    between graphs G1 and G2.

    Graph edit distance is a graph similarity measure analogous to
    Levenshtein distance for strings.  It is defined as the minimum
    cost of an edit path (a sequence of node and edge edit operations)
    transforming graph G1 into a graph isomorphic to G2.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified.  If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified.  If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used.  If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used.  If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    Returns
    -------
    Generator of consecutive approximations of graph edit distance.

    Examples
    --------
    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for v in nx.optimize_graph_edit_distance(G1, G2):
    ...     minv = v
    >>> minv
    7.0

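    Setting ``upper_bound`` prunes the search.  As an illustrative sketch,
    no edit path of cost at most 3 exists for these graphs, so the
    generator yields nothing:

    >>> list(nx.optimize_graph_edit_distance(G1, G2, upper_bound=3))
    []
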
    See Also
    --------
    graph_edit_distance, optimize_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816
    """
    for vertex_path, edge_path, cost in optimize_edit_paths(
        G1,
        G2,
        node_match,
        edge_match,
        node_subst_cost,
        node_del_cost,
        node_ins_cost,
        edge_subst_cost,
        edge_del_cost,
        edge_ins_cost,
        upper_bound,
        True,
    ):
        yield cost


def optimize_edit_paths(
    G1,
    G2,
    node_match=None,
    edge_match=None,
    node_subst_cost=None,
    node_del_cost=None,
    node_ins_cost=None,
    edge_subst_cost=None,
    edge_del_cost=None,
    edge_ins_cost=None,
    upper_bound=None,
    strictly_decreasing=True,
    roots=None,
    timeout=None,
):
    """GED (graph edit distance) calculation: advanced interface.

    A graph edit path is a sequence of node and edge edit operations
    transforming graph G1 into a graph isomorphic to G2.  Edit
    operations include substitutions, deletions, and insertions.

    Graph edit distance is defined as the minimum cost of an edit path.

    Parameters
    ----------
    G1, G2: graphs
        The two graphs G1 and G2 must be of the same type.

    node_match : callable
        A function that returns True if node n1 in G1 and n2 in G2
        should be considered equal during matching.

        The function will be called like

           node_match(G1.nodes[n1], G2.nodes[n2]).

        That is, the function will receive the node attribute
        dictionaries for n1 and n2 as inputs.

        Ignored if node_subst_cost is specified.  If neither
        node_match nor node_subst_cost are specified then node
        attributes are not considered.

    edge_match : callable
        A function that returns True if the edge attribute dictionaries
        for the pair of nodes (u1, v1) in G1 and (u2, v2) in G2 should
        be considered equal during matching.

        The function will be called like

           edge_match(G1[u1][v1], G2[u2][v2]).

        That is, the function will receive the edge attribute
        dictionaries of the edges under consideration.

        Ignored if edge_subst_cost is specified.  If neither
        edge_match nor edge_subst_cost are specified then edge
        attributes are not considered.

    node_subst_cost, node_del_cost, node_ins_cost : callable
        Functions that return the costs of node substitution, node
        deletion, and node insertion, respectively.

        The functions will be called like

           node_subst_cost(G1.nodes[n1], G2.nodes[n2]),
           node_del_cost(G1.nodes[n1]),
           node_ins_cost(G2.nodes[n2]).

        That is, the functions will receive the node attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function node_subst_cost overrides node_match if specified.
        If neither node_match nor node_subst_cost are specified then
        default node substitution cost of 0 is used (node attributes
        are not considered during matching).

        If node_del_cost is not specified then default node deletion
        cost of 1 is used.  If node_ins_cost is not specified then
        default node insertion cost of 1 is used.

    edge_subst_cost, edge_del_cost, edge_ins_cost : callable
        Functions that return the costs of edge substitution, edge
        deletion, and edge insertion, respectively.

        The functions will be called like

           edge_subst_cost(G1[u1][v1], G2[u2][v2]),
           edge_del_cost(G1[u1][v1]),
           edge_ins_cost(G2[u2][v2]).

        That is, the functions will receive the edge attribute
        dictionaries as inputs.  The functions are expected to return
        positive numeric values.

        Function edge_subst_cost overrides edge_match if specified.
        If neither edge_match nor edge_subst_cost are specified then
        default edge substitution cost of 0 is used (edge attributes
        are not considered during matching).

        If edge_del_cost is not specified then default edge deletion
        cost of 1 is used.  If edge_ins_cost is not specified then
        default edge insertion cost of 1 is used.

    upper_bound : numeric
        Maximum edit distance to consider.

    strictly_decreasing : bool
        If True, return consecutive approximations of strictly
        decreasing cost.  Otherwise, return all edit paths of cost
        less than or equal to the previous minimum cost.

    roots : 2-tuple
        Tuple where first element is a node in G1 and the second
        is a node in G2.
        These nodes are forced to be matched in the comparison to
        allow comparison between rooted graphs.

    timeout : numeric
        Maximum number of seconds to execute.
        After timeout is met, the current best GED is returned.

    Returns
    -------
    Generator of tuples (node_edit_path, edge_edit_path, cost)
        node_edit_path : list of tuples (u, v)
        edge_edit_path : list of tuples ((u1, v1), (u2, v2))
        cost : numeric

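    Examples
    --------
    A minimal sketch of the advanced interface; with the default
    ``strictly_decreasing=True``, successive yields have strictly
    decreasing cost, so the last ``cost`` equals the GED:

    >>> G1 = nx.cycle_graph(6)
    >>> G2 = nx.wheel_graph(7)
    >>> for node_path, edge_path, cost in nx.optimize_edit_paths(G1, G2):
    ...     best_cost = cost
    >>> best_cost
    7.0
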
    See Also
    --------
    graph_edit_distance, optimize_graph_edit_distance, optimal_edit_paths

    References
    ----------
    .. [1] Zeina Abu-Aisheh, Romain Raveaux, Jean-Yves Ramel, Patrick
       Martineau. An Exact Graph Edit Distance Algorithm for Solving
       Pattern Recognition Problems. 4th International Conference on
       Pattern Recognition Applications and Methods 2015, Jan 2015,
       Lisbon, Portugal. 2015,
       <10.5220/0005209202710278>. <hal-01168816>
       https://hal.archives-ouvertes.fr/hal-01168816

    """
    # TODO: support DiGraph

    import numpy as np
    import scipy as sp
    import scipy.optimize  # call as sp.optimize

    class CostMatrix:
        def __init__(self, C, lsa_row_ind, lsa_col_ind, ls):
            # assert C.shape[0] == len(lsa_row_ind)
            # assert C.shape[1] == len(lsa_col_ind)
            # assert len(lsa_row_ind) == len(lsa_col_ind)
            # assert set(lsa_row_ind) == set(range(len(lsa_row_ind)))
            # assert set(lsa_col_ind) == set(range(len(lsa_col_ind)))
            # assert ls == C[lsa_row_ind, lsa_col_ind].sum()
            self.C = C
            self.lsa_row_ind = lsa_row_ind
            self.lsa_col_ind = lsa_col_ind
            self.ls = ls

    def make_CostMatrix(C, m, n):
        # assert(C.shape == (m + n, m + n))
        lsa_row_ind, lsa_col_ind = sp.optimize.linear_sum_assignment(C)

        # Fixup dummy assignments:
        # each substitution i<->j should have dummy assignment m+j<->n+i
        # NOTE: fast reduce of Cv relies on it
        # assert len(lsa_row_ind) == len(lsa_col_ind)
        indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
        subst_ind = list(k for k, i, j in indexes if i < m and j < n)
        indexes = zip(range(len(lsa_row_ind)), lsa_row_ind, lsa_col_ind)
        dummy_ind = list(k for k, i, j in indexes if i >= m and j >= n)
        # assert len(subst_ind) == len(dummy_ind)
        lsa_row_ind[dummy_ind] = lsa_col_ind[subst_ind] + m
        lsa_col_ind[dummy_ind] = lsa_row_ind[subst_ind] + n

        return CostMatrix(
            C, lsa_row_ind, lsa_col_ind, C[lsa_row_ind, lsa_col_ind].sum()
        )

    def extract_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k in i or k - m in j for k in range(m + n)]
        col_ind = [k in j or k - n in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_C(C, i, j, m, n):
        # assert(C.shape == (m + n, m + n))
        row_ind = [k not in i and k - m not in j for k in range(m + n)]
        col_ind = [k not in j and k - n not in i for k in range(m + n)]
        return C[row_ind, :][:, col_ind]

    def reduce_ind(ind, i):
        # assert set(ind) == set(range(len(ind)))
        rind = ind[[k not in i for k in ind]]
        for k in set(i):
            rind[rind >= k] -= 1
        return rind

    def match_edges(u, v, pending_g, pending_h, Ce, matched_uv=[]):
        """
        Parameters:
            u, v: matched vertices, u=None or v=None for
               deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_uv: partial vertex edit path
                list of tuples (u, v) of previously matched vertex
                    mappings u<->v, u=None or v=None for
                    deletion/insertion

        Returns:
            list of (i, j): indices of edge mappings g<->h
            localCe: local CostMatrix of edge mappings
                (basically submatrix of Ce at cross of rows i, cols j)
        """
        M = len(pending_g)
        N = len(pending_h)
        # assert Ce.C.shape == (M + N, M + N)

        g_ind = [
            i
            for i in range(M)
            if pending_g[i][:2] == (u, u)
            or any(pending_g[i][:2] in ((p, u), (u, p)) for p, q in matched_uv)
        ]
        h_ind = [
            j
            for j in range(N)
            if pending_h[j][:2] == (v, v)
            or any(pending_h[j][:2] in ((q, v), (v, q)) for p, q in matched_uv)
        ]
        m = len(g_ind)
        n = len(h_ind)

        if m or n:
            C = extract_C(Ce.C, g_ind, h_ind, M, N)
            # assert C.shape == (m + n, m + n)

            # Forbid structurally invalid matches
            # NOTE: inf remembered from Ce construction
            for k, i in zip(range(m), g_ind):
                g = pending_g[i][:2]
                for l, j in zip(range(n), h_ind):
                    h = pending_h[j][:2]
                    if nx.is_directed(G1) or nx.is_directed(G2):
                        if any(
                            g == (p, u) and h == (q, v) or g == (u, p) and h == (v, q)
                            for p, q in matched_uv
                        ):
                            continue
                    else:
                        if any(
                            g in ((p, u), (u, p)) and h in ((q, v), (v, q))
                            for p, q in matched_uv
                        ):
                            continue
                    if g == (u, u):
                        continue
                    if h == (v, v):
                        continue
                    C[k, l] = inf

            localCe = make_CostMatrix(C, m, n)
            ij = list(
                (
                    g_ind[k] if k < m else M + h_ind[l],
                    h_ind[l] if l < n else N + g_ind[k],
                )
                for k, l in zip(localCe.lsa_row_ind, localCe.lsa_col_ind)
                if k < m or l < n
            )

        else:
            ij = []
            localCe = CostMatrix(np.empty((0, 0)), [], [], 0)

        return ij, localCe

    def reduce_Ce(Ce, ij, m, n):
        if len(ij):
            i, j = zip(*ij)
            m_i = m - sum(1 for t in i if t < m)
            n_j = n - sum(1 for t in j if t < n)
            return make_CostMatrix(reduce_C(Ce.C, i, j, m, n), m_i, n_j)
        else:
            return Ce

    def get_edit_ops(
        matched_uv, pending_u, pending_v, Cv, pending_g, pending_h, Ce, matched_cost
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of
                (i, j): indices of vertex mapping u<->v
                Cv_ij: reduced CostMatrix of pending vertex mappings
                    (basically Cv with row i, col j removed)
                list of (x, y): indices of edge mappings g<->h
                Ce_xy: reduced CostMatrix of pending edge mappings
                    (basically Ce with rows x, cols y removed)
                cost: total cost of edit operation
            NOTE: most promising ops first
        """
        m = len(pending_u)
        n = len(pending_v)
        # assert Cv.C.shape == (m + n, m + n)

        # 1) a vertex mapping from optimal linear sum assignment
        i, j = min(
            (k, l) for k, l in zip(Cv.lsa_row_ind, Cv.lsa_col_ind) if k < m or l < n
        )
        xy, localCe = match_edges(
            pending_u[i] if i < m else None,
            pending_v[j] if j < n else None,
            pending_g,
            pending_h,
            Ce,
            matched_uv,
        )
        Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
        # assert Ce.ls <= localCe.ls + Ce_xy.ls
        if prune(matched_cost + Cv.ls + localCe.ls + Ce_xy.ls):
            pass
        else:
            # get reduced Cv efficiently
            Cv_ij = CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                reduce_ind(Cv.lsa_row_ind, (i, m + j)),
                reduce_ind(Cv.lsa_col_ind, (j, n + i)),
                Cv.ls - Cv.C[i, j],
            )
            yield (i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls

        # 2) other candidates, sorted by lower-bound cost estimate
        other = list()
        fixed_i, fixed_j = i, j
        if m <= n:
            candidates = (
                (t, fixed_j)
                for t in range(m + n)
                if t != fixed_i and (t < m or t == m + fixed_j)
            )
        else:
            candidates = (
                (fixed_i, t)
                for t in range(m + n)
                if t != fixed_j and (t < n or t == n + fixed_i)
            )
        for i, j in candidates:
            if prune(matched_cost + Cv.C[i, j] + Ce.ls):
                continue
            Cv_ij = make_CostMatrix(
                reduce_C(Cv.C, (i,), (j,), m, n),
                m - 1 if i < m else m,
                n - 1 if j < n else n,
            )
            # assert Cv.ls <= Cv.C[i, j] + Cv_ij.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + Ce.ls):
                continue
            xy, localCe = match_edges(
                pending_u[i] if i < m else None,
                pending_v[j] if j < n else None,
                pending_g,
                pending_h,
                Ce,
                matched_uv,
            )
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls):
                continue
            Ce_xy = reduce_Ce(Ce, xy, len(pending_g), len(pending_h))
            # assert Ce.ls <= localCe.ls + Ce_xy.ls
            if prune(matched_cost + Cv.C[i, j] + Cv_ij.ls + localCe.ls + Ce_xy.ls):
                continue
            other.append(((i, j), Cv_ij, xy, Ce_xy, Cv.C[i, j] + localCe.ls))

        yield from sorted(other, key=lambda t: t[4] + t[1].ls + t[3].ls)

    def get_edit_paths(
        matched_uv,
        pending_u,
        pending_v,
        Cv,
        matched_gh,
        pending_g,
        pending_h,
        Ce,
        matched_cost,
    ):
        """
        Parameters:
            matched_uv: partial vertex edit path
                list of tuples (u, v) of vertex mappings u<->v,
                u=None or v=None for deletion/insertion
            pending_u, pending_v: lists of vertices not yet mapped
            Cv: CostMatrix of pending vertex mappings
            matched_gh: partial edge edit path
                list of tuples (g, h) of edge mappings g<->h,
                g=None or h=None for deletion/insertion
            pending_g, pending_h: lists of edges not yet mapped
            Ce: CostMatrix of pending edge mappings
            matched_cost: cost of partial edit path

        Returns:
            sequence of (vertex_path, edge_path, cost)
                vertex_path: complete vertex edit path
                    list of tuples (u, v) of vertex mappings u<->v,
                    u=None or v=None for deletion/insertion
                edge_path: complete edge edit path
                    list of tuples (g, h) of edge mappings g<->h,
                    g=None or h=None for deletion/insertion
                cost: total cost of edit path
            NOTE: path costs are non-increasing
        """
        # debug_print('matched-uv:', matched_uv)
        # debug_print('matched-gh:', matched_gh)
        # debug_print('matched-cost:', matched_cost)
        # debug_print('pending-u:', pending_u)
        # debug_print('pending-v:', pending_v)
        # debug_print(Cv.C)
        # assert list(sorted(G1.nodes)) == list(sorted(list(u for u, v in matched_uv if u is not None) + pending_u))
        # assert list(sorted(G2.nodes)) == list(sorted(list(v for u, v in matched_uv if v is not None) + pending_v))
        # debug_print('pending-g:', pending_g)
        # debug_print('pending-h:', pending_h)
        # debug_print(Ce.C)
        # assert list(sorted(G1.edges)) == list(sorted(list(g for g, h in matched_gh if g is not None) + pending_g))
        # assert list(sorted(G2.edges)) == list(sorted(list(h for g, h in matched_gh if h is not None) + pending_h))
        # debug_print()

        if prune(matched_cost + Cv.ls + Ce.ls):
            return

        if not max(len(pending_u), len(pending_v)):
            # assert not len(pending_g)
            # assert not len(pending_h)
            # path completed!
            # assert matched_cost <= maxcost.value
            maxcost.value = min(maxcost.value, matched_cost)
            yield matched_uv, matched_gh, matched_cost

        else:
            edit_ops = get_edit_ops(
                matched_uv,
                pending_u,
                pending_v,
                Cv,
                pending_g,
                pending_h,
                Ce,
                matched_cost,
            )
            for ij, Cv_ij, xy, Ce_xy, edit_cost in edit_ops:
                i, j = ij
                # assert Cv.C[i, j] + sum(Ce.C[t] for t in xy) == edit_cost
                if prune(matched_cost + edit_cost + Cv_ij.ls + Ce_xy.ls):
                    continue

                # dive deeper
                u = pending_u.pop(i) if i < len(pending_u) else None
                v = pending_v.pop(j) if j < len(pending_v) else None
                matched_uv.append((u, v))
                for x, y in xy:
                    len_g = len(pending_g)
                    len_h = len(pending_h)
                    matched_gh.append(
                        (
                            pending_g[x] if x < len_g else None,
                            pending_h[y] if y < len_h else None,
                        )
                    )
                sortedx = list(sorted(x for x, y in xy))
                sortedy = list(sorted(y for x, y in xy))
                G = list(
                    (pending_g.pop(x) if x < len(pending_g) else None)
                    for x in reversed(sortedx)
                )
                H = list(
                    (pending_h.pop(y) if y < len(pending_h) else None)
                    for y in reversed(sortedy)
                )

                yield from get_edit_paths(
                    matched_uv,
                    pending_u,
                    pending_v,
                    Cv_ij,
                    matched_gh,
                    pending_g,
                    pending_h,
                    Ce_xy,
                    matched_cost + edit_cost,
                )

                # backtrack
                if u is not None:
                    pending_u.insert(i, u)
                if v is not None:
                    pending_v.insert(j, v)
                matched_uv.pop()
                for x, g in zip(sortedx, reversed(G)):
                    if g is not None:
                        pending_g.insert(x, g)
                for y, h in zip(sortedy, reversed(H)):
                    if h is not None:
                        pending_h.insert(y, h)
                for t in xy:
                    matched_gh.pop()

    # Initialization

    pending_u = list(G1.nodes)
    pending_v = list(G2.nodes)

    initial_cost = 0
    if roots:
        root_u, root_v = roots
        if root_u not in pending_u or root_v not in pending_v:
            raise nx.NodeNotFound("Root node not in graph.")

        # remove roots from pending
        pending_u.remove(root_u)
        pending_v.remove(root_v)

    # cost matrix of vertex mappings
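    # The (m + n) x (m + n) matrix built below uses the standard bipartite
    # cost-matrix layout: the upper-left block holds substitution costs, the
    # upper-right block holds deletion costs on its diagonal, the lower-left
    # block holds insertion costs on its diagonal, and the lower-right block
    # is all zeros (dummy assignments).  Off-diagonal entries of the
    # deletion/insertion blocks get a large sentinel so they are never chosen.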
    m = len(pending_u)
    n = len(pending_v)
    C = np.zeros((m + n, m + n))
    if node_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                node_subst_cost(G1.nodes[u], G2.nodes[v])
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = node_subst_cost(G1.nodes[root_u], G2.nodes[root_v])
    elif node_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(node_match(G1.nodes[u], G2.nodes[v]))
                for u in pending_u
                for v in pending_v
            ]
        ).reshape(m, n)
        if roots:
            initial_cost = 1 - node_match(G1.nodes[root_u], G2.nodes[root_v])
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if node_del_cost:
        del_costs = [node_del_cost(G1.nodes[u]) for u in pending_u]
    else:
        del_costs = [1] * len(pending_u)
    # assert not m or min(del_costs) >= 0
    if node_ins_cost:
        ins_costs = [node_ins_cost(G2.nodes[v]) for v in pending_v]
    else:
        ins_costs = [1] * len(pending_v)
    # assert not n or min(ins_costs) >= 0
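    # "inf" is a finite sentinel strictly larger than the total cost of any
    # feasible assignment, so entries set to it are never selected by the
    # linear sum assignment.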
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Cv = make_CostMatrix(C, m, n)
    # debug_print(f"Cv: {m} x {n}")
    # debug_print(Cv.C)

    pending_g = list(G1.edges)
    pending_h = list(G2.edges)

    # cost matrix of edge mappings
    m = len(pending_g)
    n = len(pending_h)
    C = np.zeros((m + n, m + n))
    if edge_subst_cost:
        C[0:m, 0:n] = np.array(
            [
                edge_subst_cost(G1.edges[g], G2.edges[h])
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    elif edge_match:
        C[0:m, 0:n] = np.array(
            [
                1 - int(edge_match(G1.edges[g], G2.edges[h]))
                for g in pending_g
                for h in pending_h
            ]
        ).reshape(m, n)
    else:
        # all zeroes
        pass
    # assert not min(m, n) or C[0:m, 0:n].min() >= 0
    if edge_del_cost:
        del_costs = [edge_del_cost(G1.edges[g]) for g in pending_g]
    else:
        del_costs = [1] * len(pending_g)
    # assert not m or min(del_costs) >= 0
    if edge_ins_cost:
        ins_costs = [edge_ins_cost(G2.edges[h]) for h in pending_h]
    else:
        ins_costs = [1] * len(pending_h)
    # assert not n or min(ins_costs) >= 0
    inf = C[0:m, 0:n].sum() + sum(del_costs) + sum(ins_costs) + 1
    C[0:m, n : n + m] = np.array(
        [del_costs[i] if i == j else inf for i in range(m) for j in range(m)]
    ).reshape(m, m)
    C[m : m + n, 0:n] = np.array(
        [ins_costs[i] if i == j else inf for i in range(n) for j in range(n)]
    ).reshape(n, n)
    Ce = make_CostMatrix(C, m, n)
    # debug_print(f'Ce: {m} x {n}')
    # debug_print(Ce.C)
    # debug_print()

    class MaxCost:
        def __init__(self):
            # initial upper-bound estimate
            # NOTE: should work for empty graph
            self.value = Cv.C.sum() + Ce.C.sum() + 1

    maxcost = MaxCost()

    if timeout is not None:
        if timeout <= 0:
            raise nx.NetworkXError("Timeout value must be greater than 0")
        start = time.perf_counter()

    def prune(cost):
        if timeout is not None:
            if time.perf_counter() - start > timeout:
                return True
        if upper_bound is not None:
            if cost > upper_bound:
                return True
        if cost > maxcost.value:
            return True
        elif strictly_decreasing and cost >= maxcost.value:
            return True

    # Now go!

    done_uv = [] if roots is None else [roots]

    for vertex_path, edge_path, cost in get_edit_paths(
        done_uv, pending_u, pending_v, Cv, [], pending_g, pending_h, Ce, initial_cost
    ):
        # assert sorted(G1.nodes) == sorted(u for u, v in vertex_path if u is not None)
        # assert sorted(G2.nodes) == sorted(v for u, v in vertex_path if v is not None)
        # assert sorted(G1.edges) == sorted(g for g, h in edge_path if g is not None)
        # assert sorted(G2.edges) == sorted(h for g, h in edge_path if h is not None)
        # print(vertex_path, edge_path, cost, file = sys.stderr)
        # assert cost == maxcost.value
        yield list(vertex_path), list(edge_path), cost


def simrank_similarity(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    SimRank is a similarity metric that says "two objects are considered
    to be similar if they are referenced by similar objects." [1]_.

    The pseudo-code definition from the paper is::

        def simrank(G, u, v):
            in_neighbors_u = G.predecessors(u)
            in_neighbors_v = G.predecessors(v)
            scale = C / (len(in_neighbors_u) * len(in_neighbors_v))
            return scale * sum(simrank(G, w, x)
                               for w, x in product(in_neighbors_u,
                                                   in_neighbors_v))

    where ``G`` is the graph, ``u`` is the source, ``v`` is the target,
    and ``C`` is a float decay or importance factor between 0 and 1.

    The SimRank algorithm for determining node similarity is defined in
    [2]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : dictionary or float
        If ``source`` and ``target`` are both ``None``, this returns a
        dictionary of dictionaries, where the keys are node pairs and
        the values are the similarity of the pair of nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns a
        dictionary mapping node to the similarity of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.simrank_similarity(G)
    {0: {0: 1.0, 1: 0.0}, 1: {0: 0.0, 1: 1.0}}
    >>> nx.simrank_similarity(G, source=0)
    {0: 1.0, 1: 0.0}
    >>> nx.simrank_similarity(G, source=0, target=0)
    1.0

    The result of this function can be converted to a numpy array
    representing the SimRank matrix by using the node order of the
    graph to determine which row and column represent each node.
    Other ordering of nodes is also possible.

    >>> import numpy as np
    >>> sim = nx.simrank_similarity(G)
    >>> np.array([[sim[u][v] for v in G] for u in G])
    array([[1., 0.],
           [0., 1.]])
    >>> sim_1d = nx.simrank_similarity(G, source=0)
    >>> np.array([sim_1d[v] for v in G])
    array([1., 0.])

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/SimRank
    .. [2] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    import numpy as np

    nodelist = list(G)
    s_indx = None if source is None else nodelist.index(source)
    t_indx = None if target is None else nodelist.index(target)

    x = _simrank_similarity_numpy(
        G, s_indx, t_indx, importance_factor, max_iterations, tolerance
    )

    if isinstance(x, np.ndarray):
        if x.ndim == 1:
            return {node: val for node, val in zip(G, x)}
        else:  # x.ndim == 2:
            return {u: dict(zip(G, row)) for u, row in zip(G, x)}
    return x


def _simrank_similarity_python(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Returns the SimRank similarity of nodes in the graph ``G``.

    This pure Python version is provided for pedagogical purposes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_python(G)
    {0: {0: 1, 1: 0.0}, 1: {0: 0.0, 1: 1}}
    >>> nx.similarity._simrank_similarity_python(G, source=0)
    {0: 1, 1: 0.0}
    >>> nx.similarity._simrank_similarity_python(G, source=0, target=0)
    1
    """
    # build up our similarity adjacency dictionary output
    newsim = {u: {v: 1 if u == v else 0 for v in G} for u in G}

    # These functions compute the update to the similarity value of the nodes
    # `u` and `v` with respect to the previous similarity values.
    def avg_sim(s):
        return sum(newsim[w][x] for (w, x) in s) / len(s) if s else 0.0

    Gadj = G.pred if G.is_directed() else G.adj

    def sim(u, v):
        return importance_factor * avg_sim(list(product(Gadj[u], Gadj[v])))

    for its in range(max_iterations):
        oldsim = newsim
        newsim = {u: {v: sim(u, v) if u is not v else 1 for v in G} for u in G}
        is_close = all(
            all(
                abs(newsim[u][v] - old) <= tolerance * (1 + abs(old))
                for v, old in nbrs.items()
            )
            for u, nbrs in oldsim.items()
        )
        if is_close:
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source][target]
    if source is not None:
        return newsim[source]
    return newsim


def _simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=1000,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    The SimRank algorithm for determining node similarity is defined in
    [1]_.

    Parameters
    ----------
    G : NetworkX graph
        A NetworkX graph

    source : node
        If this is specified, the returned dictionary maps each node
        ``v`` in the graph to the similarity between ``source`` and
        ``v``.

    target : node
        If both ``source`` and ``target`` are specified, the similarity
        value between ``source`` and ``target`` is returned. If
        ``target`` is specified but ``source`` is not, this argument is
        ignored.

    importance_factor : float
        The relative importance of indirect neighbors with respect to
        direct neighbors.

    max_iterations : integer
        Maximum number of iterations.

    tolerance : float
        Error tolerance used to check convergence. When an iteration of
        the algorithm finds that no similarity value changes more than
        this amount, the algorithm halts.

    Returns
    -------
    similarity : numpy matrix, numpy array or float
        If ``source`` and ``target`` are both ``None``, this returns a
        Matrix containing SimRank scores of the nodes.

        If ``source`` is not ``None`` but ``target`` is, this returns an
        Array containing SimRank scores of ``source`` and that
        node.

        If neither ``source`` nor ``target`` is ``None``, this returns
        the similarity value for the given pair of nodes.

    Examples
    --------
    >>> G = nx.cycle_graph(2)
    >>> nx.similarity._simrank_similarity_numpy(G)
    array([[1., 0.],
           [0., 1.]])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0)
    array([1., 0.])
    >>> nx.similarity._simrank_similarity_numpy(G, source=0, target=0)
    1.0

    References
    ----------
    .. [1] G. Jeh and J. Widom.
           "SimRank: a measure of structural-context similarity",
           In KDD'02: Proceedings of the Eighth ACM SIGKDD
           International Conference on Knowledge Discovery and Data Mining,
           pp. 538--543. ACM Press, 2002.
    """
    # This algorithm follows roughly
    #
    #     S = max{C * (A.T * S * A), I}
    #
    # where C is the importance factor, A is the column normalized
    # adjacency matrix, and I is the identity matrix.
    import numpy as np

    adjacency_matrix = nx.to_numpy_array(G)

    # column-normalize the ``adjacency_matrix``
    s = np.array(adjacency_matrix.sum(axis=0))
    s[s == 0] = 1
    adjacency_matrix /= s  # adjacency_matrix.sum(axis=0)

    newsim = np.eye(len(G), dtype=np.float64)
    for its in range(max_iterations):
        prevsim = newsim.copy()
        newsim = importance_factor * ((adjacency_matrix.T @ prevsim) @ adjacency_matrix)
        np.fill_diagonal(newsim, 1.0)

        if np.allclose(prevsim, newsim, atol=tolerance):
            break

    if its + 1 == max_iterations:
        raise nx.ExceededMaxIterations(
            f"simrank did not converge after {max_iterations} iterations."
        )

    if source is not None and target is not None:
        return newsim[source, target]
    if source is not None:
        return newsim[source]
    return newsim


def simrank_similarity_numpy(
    G,
    source=None,
    target=None,
    importance_factor=0.9,
    max_iterations=100,
    tolerance=1e-4,
):
    """Calculate SimRank of nodes in ``G`` using matrices with ``numpy``.

    .. deprecated:: 2.6
        simrank_similarity_numpy is deprecated and will be removed in networkx 3.0.
        Use simrank_similarity

    """
    warnings.warn(
        (
            "networkx.simrank_similarity_numpy is deprecated and will be removed "
            "in NetworkX 3.0, use networkx.simrank_similarity instead."
        ),
        DeprecationWarning,
        stacklevel=2,
    )
1513    return _simrank_similarity_numpy(
1514        G,
1515        source,
1516        target,
1517        importance_factor,
1518        max_iterations,
1519        tolerance,
1520    )
1521
1522
1523# TODO replace w/ math.comb(n, k) for Python 3.8+
1524def _n_choose_k(n, k):
1525    """Pure Python implementation of the binomial coefficient
1526
    The general equation is n! / (k! * (n - k)!). The implementation below
    avoids computing the full factorials by multiplying only the necessary
    range of integers and dividing by the smaller factorial.
1529
1530    Note: this will be removed in favor of Python 3.8's ``math.comb(n, k)``.
1531
1532    Parameters
1533    ----------
    n : int
        Size of the set to choose from.
    k : int
        Size of the unordered subset to choose.
1538
1539    Returns
1540    -------
1541    binomial_coeff : int
1542        The number of ways (disregarding order) that k objects
1543        can be chosen from among n objects.
1544
1545    Examples
1546    --------
1547    >>> _n_choose_k(5, 2)
1548    10
1549    >>> _n_choose_k(5, 4)
1550    5
1551    >>> _n_choose_k(100, 100)
1552    1
1553
1554    """
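    # For example, _n_choose_k(5, 2) multiplies only the top two factors,
    # 4 * 5 = 20, and divides by 2! = 2 to give 10, rather than evaluating
    # 5! / (2! * 3!) in full.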
1555    if k > n:
1556        return 0
1557    if n == k:
1558        return 1
    elif k < n - k:
        # the initial value 1 makes the empty product for k == 0 return 1
        return reduce(mul, range(n - k + 1, n + 1), 1) // math.factorial(k)
    else:
        return reduce(mul, range(k + 1, n + 1), 1) // math.factorial(n - k)
1563
1564
1565def panther_similarity(G, source, k=5, path_length=5, c=0.5, delta=0.1, eps=None):
    r"""Returns the Panther similarity of nodes in the graph `G` to node ``source``.
1567
1568    Panther is a similarity metric that says "two objects are considered
1569    to be similar if they frequently appear on the same paths." [1]_.
1570
1571    Parameters
1572    ----------
1573    G : NetworkX graph
1574        A NetworkX graph
1575    source : node
1576        Source node for which to find the top `k` similar other nodes
1577    k : int (default = 5)
1578        The number of most similar nodes to return
1579    path_length : int (default = 5)
1580        How long the randomly generated paths should be (``T`` in [1]_)
1581    c : float (default = 0.5)
1582        A universal positive constant used to scale the number
1583        of sample random paths to generate.
    delta : float (default = 0.1)
        The probability that the estimated similarity $S$ is not an
        epsilon-approximation to $(R, \phi)$, where $R$ is the number of
        random paths and $\phi(A)$ is the probability that an element
        sampled from the domain $D$ lies in a given subset $A \subseteq D$.
1588    eps : float or None (default = None)
1589        The error bound. Per [1]_, a good value is ``sqrt(1/|E|)``. Therefore,
1590        if no value is provided, the recommended computed value will be used.
1591
1592    Returns
1593    -------
1594    similarity : dictionary
        Dictionary of nodes to similarity scores (as floats). Note:
        the self-similarity (i.e., ``source``) will not be included in
        the returned dictionary.
1598
1599    Examples
1600    --------
1601    >>> G = nx.star_graph(10)
1602    >>> sim = nx.panther_similarity(G, 0)
1603
1604    References
1605    ----------
1606    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
1607           Panther: Fast top-k similarity search on large networks.
1608           In Proceedings of the ACM SIGKDD International Conference
1609           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
1610           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1611    """
1612    import numpy as np
1613
1614    num_nodes = G.number_of_nodes()
1615    if num_nodes < k:
1616        warnings.warn(
1617            f"Number of nodes is {num_nodes}, but requested k is {k}. "
1618            "Setting k to number of nodes."
1619        )
1620        k = num_nodes
1621    # According to [1], they empirically determined
1622    # a good value for ``eps`` to be sqrt( 1 / |E| )
1623    if eps is None:
1624        eps = np.sqrt(1.0 / G.number_of_edges())
1625
1626    inv_node_map = {name: index for index, name in enumerate(G.nodes)}
1627    node_map = np.array(G)
1628
1629    # Calculate the sample size ``R`` for how many paths
1630    # to randomly generate
1631    t_choose_2 = _n_choose_k(path_length, 2)
1632    sample_size = int((c / eps ** 2) * (np.log2(t_choose_2) + 1 + np.log(1 / delta)))
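    # Illustrative numbers only: with the defaults path_length=5, c=0.5,
    # delta=0.1 and a graph with 10 edges (so eps = sqrt(1/10)), this is
    # int(5 * (log2(10) + 1 + ln(10))) = int(5 * ~6.62) = 33 sample paths.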
1633    index_map = {}
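    # Generate the random paths; the generator must be fully consumed so
    # that ``index_map`` is populated with every node's path memberships.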
1634    _ = list(
1635        generate_random_paths(
1636            G, sample_size, path_length=path_length, index_map=index_map
1637        )
1638    )
1639    S = np.zeros(num_nodes)
1640
1641    inv_sample_size = 1 / sample_size
1642
1643    source_paths = set(index_map[source])
1644
1645    # Calculate the path similarities
1646    # between ``source`` (v) and ``node`` (v_j)
1647    # using our inverted index mapping of
1648    # vertices to paths
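    # i.e. S[v_j] = |P_v ∩ P_{v_j}| / R, the fraction of sampled paths on
    # which both ``source`` and ``node`` appear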
1649    for node, paths in index_map.items():
1650        # Only consider paths where both
1651        # ``node`` and ``source`` are present
1652        common_paths = source_paths.intersection(paths)
1653        S[inv_node_map[node]] = len(common_paths) * inv_sample_size
1654
1655    # Retrieve top ``k`` similar
1656    # Note: the below performed anywhere from 4-10x faster
1657    # (depending on input sizes) vs the equivalent ``np.argsort(S)[::-1]``
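    # ``np.argpartition(S, -k)`` puts the indices of the k largest scores in
    # the last k positions (in arbitrary order); the follow-up ``argsort``
    # then orders only those k entries rather than the whole array.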
1658    top_k_unsorted = np.argpartition(S, -k)[-k:]
1659    top_k_sorted = top_k_unsorted[np.argsort(S[top_k_unsorted])][::-1]
1660
1661    # Add back the similarity scores
    top_k_sorted_names = (node_map[n] for n in top_k_sorted)
1663    top_k_with_val = dict(zip(top_k_sorted_names, S[top_k_sorted]))
1664
1665    # Remove the self-similarity
1666    top_k_with_val.pop(source, None)
1667    return top_k_with_val
1668
1669
1670def generate_random_paths(G, sample_size, path_length=5, index_map=None):
1671    """Randomly generate `sample_size` paths of length `path_length`.
1672
1673    Parameters
1674    ----------
1675    G : NetworkX graph
1676        A NetworkX graph
1677    sample_size : integer
1678        The number of paths to generate. This is ``R`` in [1]_.
1679    path_length : integer (default = 5)
        The number of steps in each generated path (``T`` in [1]_), so each
        path contains ``path_length + 1`` nodes. According to the paper,
        ``T >= 5`` is recommended.
1683    index_map : dictionary, optional
1684        If provided, this will be populated with the inverted
1685        index of nodes mapped to the set of generated random path
1686        indices within ``paths``.
1687
1688    Returns
1689    -------
1690    paths : generator of lists
1691        Generator of `sample_size` paths each with length `path_length`.
1692
1693    Examples
1694    --------
    Note that the return value is a generator; materialize it with ``list``
    to obtain the paths:

    >>> G = nx.star_graph(3)
    >>> random_paths = list(nx.generate_random_paths(G, 2))

    By passing a dictionary into `index_map`, it will build an
    inverted index mapping of nodes to the paths in which that node is present
    (the index is only filled in as the paths are consumed):

    >>> G = nx.star_graph(3)
    >>> index_map = {}
    >>> random_paths = list(nx.generate_random_paths(G, 3, index_map=index_map))
    >>> paths_containing_node_0 = [random_paths[path_idx] for path_idx in index_map.get(0, [])]
1707
1708    References
1709    ----------
1710    .. [1] Zhang, J., Tang, J., Ma, C., Tong, H., Jing, Y., & Li, J.
1711           Panther: Fast top-k similarity search on large networks.
1712           In Proceedings of the ACM SIGKDD International Conference
1713           on Knowledge Discovery and Data Mining (Vol. 2015-August, pp. 1445–1454).
1714           Association for Computing Machinery. https://doi.org/10.1145/2783258.2783267.
1715    """
1716    import numpy as np
1717
1718    # Calculate transition probabilities between
1719    # every pair of vertices according to Eq. (3)
1720    adj_mat = nx.to_numpy_array(G)
1721    inv_row_sums = np.reciprocal(adj_mat.sum(axis=1)).reshape(-1, 1)
1722    transition_probabilities = adj_mat * inv_row_sums
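    # Each row of ``adj_mat`` is divided by its row sum (the node's degree or
    # total edge weight), e.g. for ``nx.star_graph(3)`` the center row becomes
    # [0, 1/3, 1/3, 1/3] and each leaf row becomes [1, 0, 0, 0].  Note that a
    # node with no edges gives a zero row sum, so ``np.reciprocal`` would emit
    # a divide-by-zero warning here.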
1723
1724    node_map = np.array(G)
1725    num_nodes = G.number_of_nodes()
1726
1727    for path_index in range(sample_size):
1728        # Sample current vertex v = v_i uniformly at random
1729        node_index = np.random.randint(0, high=num_nodes)
1730        node = node_map[node_index]
1731
1732        # Add v into p_r and add p_r into the path set
1733        # of v, i.e., P_v
1734        path = [node]
1735
1736        # Build the inverted index (P_v) of vertices to paths
1737        if index_map is not None:
1738            if node in index_map:
1739                index_map[node].add(path_index)
1740            else:
1741                index_map[node] = {path_index}
1742
1743        starting_index = node_index
1744        for _ in range(path_length):
1745            # Randomly sample a neighbor (v_j) according
1746            # to transition probabilities from ``node`` (v) to its neighbors
1747            neighbor_index = np.random.choice(
1748                num_nodes, p=transition_probabilities[starting_index]
1749            )
1750
1751            # Set current vertex (v = v_j)
1752            starting_index = neighbor_index
1753
1754            # Add v into p_r
1755            neighbor_node = node_map[neighbor_index]
1756            path.append(neighbor_node)
1757
1758            # Add p_r into P_v
1759            if index_map is not None:
1760                if neighbor_node in index_map:
1761                    index_map[neighbor_node].add(path_index)
1762                else:
1763                    index_map[neighbor_node] = {path_index}
1764
1765        yield path
1766