1import numpy as np
2from scipy.sparse import csgraph as cg
3from scipy import sparse as sp
4
5try:
6    import sklearn.metrics as sk
7    import sklearn.metrics.pairwise as skp
8    from sklearn.preprocessing import LabelEncoder
9    import pandas as pd
10    HAS_REQUIREMENTS = True
11except ImportError as e:
12    HAS_REQUIREMENTS = False
13
14def _raise_initial_error():
15    missing = []
16    try:
17        import sklearn
18    except ImportError:
19        missing.append('scikit-learn')
20    try:
21        import pandas
22    except ImportError:
23        missing.append('pandas')
24    raise ImportError("this function requires scikit-learn and "
25                      "pandas to be installed. Missing {}".format(','.join(missing)))
26
27__all__ = ['path_silhouette', 'boundary_silhouette', 'silhouette_alist', 'nearest_label']
28
29def path_silhouette(data, labels, W, D=None,
30                    metric=skp.euclidean_distances,
31                    closest=False, return_nbfc=False,
32                    return_nbfc_score=False,
33                    return_paths=False,
34                    directed=False):
35    """
36    Compute a path silhouette for all observations  :cite:`wolf2019geosilhouettes,Rousseeuw1987`.
37
38
39    Parameters
40    -----------
41    data    :   np.ndarray (N,P)
42                matrix of data with N observations and P covariates.
43    labels  :   np.ndarray (N,)
44                flat vector of the L labels assigned over N observations.
45    W       :   pysal.W object
46                spatial weights object reflecting the spatial connectivity
47                in the problem under analysis
48    D       :   np.ndarray (N,N)
49                a precomputed distance matrix to apply over W. If passed,
50                takes precedence over data, and data is ignored.
51    metric  :   callable
52                function mapping the (N,P) data into an (N,N) dissimilarity matrix,
53                like that found in scikit.metrics.pairwise or scipy.spatial.distance
54    closest :   bool
55                whether or not to consider the observation "connected" when it
56                is first connected to the cluster, or considering the path cost
57                to transit through the cluster. If True, the path cost is assessed
58                between i and the path-closest j in each cluster. If False, the path
59                cost is assessed as the average of path costs between i and all j
60                in each cluster
61    return_nbfc     :   bool
62                        Whether or not to return the label of the next best fit
63                        cluster
64    return_nbfc_score:  bool
65                        Whether or not to return the score of the next best fit
66                        cluster.
67    return_paths    :   bool
68                        Whether or not to return the matrix of shortest path
69                        lengths after having computed them.
70    directed    :   bool
71                    whether to consider the weights matrix as directed or undirected.
72                    If directed, asymmetry in the input W is heeded. If not,
73                    asymmetry is ignored.
74
75    Returns
76    --------
77    An (N_obs,) array of the path silhouette values for each observation.
78    """
79    if not HAS_REQUIREMENTS:
80        _raise_initial_error()
81
82    if D is None:
83        D = metric(data)
84    #polymorphic for sparse & dense input
85    assert 0 == (D < 0).sum(), "Distance metric has negative values, which is not supported"
86    off_diag_zeros = (D + np.eye(D.shape[0])) == 0
87    D[off_diag_zeros] = -1
88    Wm = sp.csr_matrix(W.sparse)
89    DW = sp.csr_matrix(Wm.multiply(D))
90    DW.eliminate_zeros()
91    DW[DW < 0] = 0
92    assert 0 == (DW < 0).sum()
93    all_pairs = cg.shortest_path(DW, directed=directed)
94    labels = np.asarray(labels)
95    if W.n_components > 1:
96        from libpysal.weights.util import WSP
97        psils_ = np.empty(W.n, dtype=float)
98        closest_connecting_label_ = np.empty(W.n, dtype=labels.dtype)
99        closest_connection_score_ = np.empty(W.n, dtype=labels.dtype)
100        for component in np.unique(W.component_labels):
101            this_component_mask = np.nonzero(W.component_labels == component)[0]
102            subgraph = W.sparse[this_component_mask.reshape(-1,1), # these rows
103                                this_component_mask.reshape(1,-1)] #these columns
104            subgraph_W = WSP(subgraph).to_W()
105            assert subgraph_W.n_components == 1
106            # DW operation is idempotent
107            subgraph_D = DW[this_component_mask.reshape(-1,1), # these rows
108                            this_component_mask.reshape(1,-1)] #these columns
109            subgraph_labels = labels[this_component_mask]
110            n_subgraph_labels = len(np.unique(subgraph_labels))
111            if not (2 < n_subgraph_labels < (subgraph_W.n-1)) :
112                psils = subgraph_solutions = [0]*subgraph_W.n
113                closest_connecting_label = [np.nan]*subgraph_W.n
114                closest_connection_score = [np.inf]*subgraph_W.n
115            else:
116                subgraph_solutions = path_silhouette(data=None, labels=subgraph_labels, W=subgraph_W, D=subgraph_D,
117                                                     metric=metric, closest=closest, return_nbfc=return_nbfc,
118                                                     return_nbfc_score=return_nbfc_score,
119                                                     return_paths=return_paths,
120                                                     directed=directed)
121            # always throw away all_pairs, since we already have it built
122                if (return_nbfc or return_nbfc_score) and return_paths:
123                    if return_nbfc_score:
124                        (psils, closest_connecting_label,
125                                closest_connection_score,
126                                      _) = subgraph_solutions
127                    else:
128                        psils, closest_connecting_label, _ = subgraph_solutions
129                elif return_nbfc_score:
130                    psils, closest_connecting_label, closest_connection_score = subgraph_solutions
131                elif return_nbfc:
132                    psils, closest_connecting_label = subgraph_solutions
133                elif return_paths:
134                    psils, _ = subgraph_solutions
135                else:
136                    psils = subgraph_solutions
137            if return_nbfc:
138                closest_connecting_label_[this_component_mask] = closest_connecting_label
139            if return_nbfc_score:
140                closest_connection_score_[this_component_mask] = closest_connection_score
141            psils_[this_component_mask] = psils
142        closest_connection_score = closest_connection_score_
143        closest_connecting_label = closest_connecting_label_
144        psils = psils_
145    # Single Connected Component
146    elif closest is False:
147        psils = sk.silhouette_samples(all_pairs, labels, metric='precomputed')
148        if return_nbfc or return_nbfc_score:
149            closest_connecting_label = []
150            closest_connection_score = []
151            for i, label in enumerate(labels):
152                row = all_pairs[i].copy()
153                in_label = labels == label
154                masked_label = row.copy() # for observations in the row
155                masked_label[in_label] = np.inf # make those in cluster infinite
156                nearest_not_in_cluster = np.argmin(masked_label) # find the closest
157                nearest_not_in_cluster_label = labels[nearest_not_in_cluster] #label
158                nearest_not_in_cluster_score = masked_label[nearest_not_in_cluster]
159                closest_connecting_label.append(nearest_not_in_cluster_label)
160                closest_connection_score.append(nearest_not_in_cluster_score)
161    else:
162        psils = []
163        closest_connecting_label = []
164        closest_connection_score = []
165        for i,label in enumerate(labels):
166            row = all_pairs[i]
167            in_label = labels == label
168            #required to make argmin pertain to N, not N - len(in_label)
169            masked_label = row.copy()
170            masked_label[in_label] = np.inf
171            nearest_not_in_cluster = np.argmin(masked_label)
172            nearest_not_in_cluster_score = row[nearest_not_in_cluster]
173            nearest_not_in_cluster_label = labels[nearest_not_in_cluster]
174
175            average_interconnect_in_cluster = row[in_label].mean()
176            psil = nearest_not_in_cluster_score - average_interconnect_in_cluster
177            psil /= np.maximum(nearest_not_in_cluster_score, average_interconnect_in_cluster)
178            psils.append(psil)
179            closest_connecting_label.append(nearest_not_in_cluster_label)
180            closest_connection_score.append(nearest_not_in_cluster_score)
181        psils = np.asarray(psils)
182    if (return_nbfc or return_nbfc_score) and return_paths:
183        if return_nbfc_score:
184            out = (psils, np.asarray(closest_connecting_label),
185                          np.asarray(closest_connection_score),
186                          all_pairs )
187        else:
188            out = psils, np.asarray(closest_connecting_label), all_pairs
189    elif return_nbfc_score:
190        out = psils, np.asarray(closest_connecting_label),\
191                     np.asarray(closest_connection_score)
192    elif return_nbfc:
193        out = psils, np.asarray(closest_connecting_label)
194    elif return_paths:
195        out = psils, all_pairs
196    else:
197        out = psils
198    return out
199
200def boundary_silhouette(data, labels, W, metric=skp.euclidean_distances):
201    """
202    Compute the observation-level boundary silhouette score :cite:`wolf2019geosilhouettes`.
203
204    Arguments
205    ---------
206    data    :   (N_obs,P) numpy array
207                an array of covariates to analyze. Each row should be one
208                observation, and each clumn should be one feature.
209    labels  :   (N_obs,) array of labels
210                the labels corresponding to the group each observation is assigned.
211    W       :   pysal.weights.W object
212                a spatial weights object containing the connectivity structure
213                for the data
214    metric  :   callable, array,
215                a function that takes an argument (data) and returns the all-pairs
216                distances/dissimilarity between observations.
217
218    Returns
219    -------
220    (N_obs,) array of boundary silhouette values for each observation
221
222    Notes
223    -----
224    The boundary silhouette is the silhouette score using only spatially-proximate
225    clusters as candidates for the next-best-fit distance function (the
226    b(i) function in :cite:`Rousseeuw1987`.
227    This restricts the next-best-fit cluster to be the set of clusters on which
228    an observation neighbors.
229    So, instead of considering *all* clusters when finding the next-best-fit cluster,
230    only clusters that `i` borders are considered.
231    This is supposed to model the fact that, in spatially-constrained clustering,
232    observation i can only be reassigned from cluster c to cluster k if some observation
233    j neighbors i and also resides in k.
234
235    If an observation only neighbors its own cluster, i.e. is not on the boundary
236     of a cluster, this value is zero.
237
238    If a cluster has exactly one observation, this value is zero.
239
240    If an observation is on the boundary of more than one cluster, then the
241    best candidate is chosen from the set of clusters on which the observation borders.
242
243    metric is a callable mapping an (N,P) data into an (N,N) distance matrix OR
244    an (N,N) distance matrix already.
245    """
246    if not HAS_REQUIREMENTS:
247        _raise_initial_error()
248
249    alist = W.to_adjlist()
250    labels = np.asarray(labels)
251    if callable(metric):
252        full_distances = metric(data)
253    elif isinstance(metric, np.ndarray):
254        n_obs = W.n
255        if metric.shape == (n_obs, n_obs):
256            full_distances = metric
257        else:
258            raise ValueError('Precomputed metric is supplied, but is not the right shape.'
259                             ' The dissimilarity matrix should be of shape ({},{}), but was'
260                             ' of shape ({},{})'.format(W.n, W.n, *metric.shape))
261    else:
262        raise ValueError('The provided metric is neither a dissmilarity function'
263                         ' nor a dissimilarity matrix.')
264    assert 0 == (full_distances < 0).sum(), ("Distance metric has negative values, "
265                                             "which is not supported")
266    label_frame = pd.DataFrame(labels, index=W.id_order, columns=['label'])
267    alist = alist.merge(label_frame, left_on='focal', right_index=True, how='left')\
268                 .merge(label_frame, left_on='neighbor', right_index=True, how='left',
269                        suffixes=('_focal', '_neighbor'))
270    alist['boundary'] = alist.label_focal != alist.label_neighbor
271    focals = alist.groupby('focal')
272    bmask = focals.boundary.any()
273    result = []
274    np.seterr(all='raise')
275    for i, (ix,bnd) in enumerate(bmask.iteritems()):
276        if not bnd:
277            result.append(np.array([0]))
278            continue
279        sil_score = np.array([np.inf])
280        label = labels[i]
281        focal_mask = np.nonzero(labels == label)[0]
282        if len(focal_mask) == 1: #the candidate is singleton
283            result.append(np.array([0]))
284            continue
285        neighbors = alist.query("focal == {}".format(ix)).label_neighbor
286        mean_dissim = full_distances[i,focal_mask].sum() / (len(focal_mask)-1)
287        if not np.isfinite(mean_dissim).all():
288            raise ValueError('A non-finite mean dissimilarity between groups'
289                             ' and the boundary observation occurred. Please ensure'
290                             ' the data & labels are formatted and shaped correctly.')
291        neighbor_score = np.array([np.inf])
292        for neighbor in set(neighbors).difference([label]):
293            other_mask = np.nonzero(labels == neighbor)[0]
294            other_score = full_distances[i,other_mask].mean()
295            neighbor_score = np.minimum(neighbor_score, other_score, neighbor_score)
296            if neighbor_score < 0:
297                raise ValueError('A negative neighborhood similarity value occurred. '
298                                 'This should not happen. Please create a bug report on'
299                                 'https://github.com/pysal/esda/issues')
300        sil_score = (neighbor_score - mean_dissim) / np.maximum(neighbor_score,
301                                                                mean_dissim)
302        result.append(sil_score)
303    if len(result) != len(labels):
304        raise ValueError('The number of boundary silhouettes does not match the number of'
305                         ' observations.'
306                         'This should not happen. Please create a bug report on'
307                         'https://github.com/pysal/esda/issues')
308    return np.asarray(result).squeeze()
309
310def silhouette_alist(data, labels, alist, indices=None,
311                     metric=skp.euclidean_distances):
312    """
313    Compute the silhouette for each edge in an adjacency graph. Given the alist
314    containing `focal` id, `neighbor` id, and `label_focal`, and `label_neighbor`,
315    this computes:
316
317    `d(i,label_neighbor) - d(i,label_focal) / (max(d(i,label_neighbor), d(i,label_focal)))`
318
319    Arguments
320    ---------
321    data : (N,P) array to cluster on or DataFrame indexed on the same values as
322           that in alist.focal/alist.neighbor
323    labels: (N,) array containing classifications, indexed on the same values
324                 as that in alist.focal/alist.neighbor
325    alist: adjacency list containing columns focal & neighbor,
326           describing one edge of the graph.
327    indices: (N,) array containing the "name" for observations in
328           alist to be linked to data. indices should be:
329           1. aligned with data by iteration order
330           2. include all values in the alist.focal set.
331           if alist.focal and alist.neighbor are strings, then indices should be
332           a list/array of strings aligned with the rows of data.
333           if not provided and labels is a series/dataframe,
334           then its index will be used.
335    metric  :   callable, array,
336                a function that takes an argument (data) and returns the all-pairs
337                distances/dissimilarity between observations.
338
339    Results
340    -------
341    pandas.DataFrame, copy of the adjacency list `alist`, with an additional
342    column called `silhouette` that contains the pseudo-silhouette values
343    expressing the relative dissimilarity between neighboring observations.
344    """
345    if not HAS_REQUIREMENTS:
346        _raise_initial_error()
347
348    n_obs = data.shape[0]
349    if callable(metric):
350        full_distances = metric(data)
351    elif isinstance(metric, np.ndarray):
352        if metric.shape == (n_obs, n_obs):
353            full_distances = metric
354    if isinstance(data, pd.DataFrame):
355        indices = data.index
356    if isinstance(labels, (pd.DataFrame, pd.Series)) and indices is None:
357        indices = labels.index
358    elif indices is not None \
359        and not isinstance(labels, (pd.DataFrame, pd.Series)):
360        labels = pd.Series(labels, index=indices)
361    elif indices is None \
362        and not isinstance(labels, (pd.DataFrame, pd.Series)):
363        indices = np.arange(len(labels))
364        labels = pd.Series(labels, index=indices)
365    if isinstance(labels, pd.DataFrame):
366        labels = pd.Series(labels.values, index=labels.index)
367    assert indices is not None
368    assert isinstance(labels, pd.Series)
369    labels = labels.to_frame("label")
370
371    result = alist.sort_values('focal').copy(deep=True)
372
373    result = result.merge(labels, left_on='focal', right_index=True, how='left')\
374                   .merge(labels, left_on='neighbor', right_index=True, how='left',
375                          suffixes=('_focal','_neighbor'))
376    self_dcache = dict()
377    sils = []
378    indices = list(indices)
379    for i_alist, row in result.iterrows():
380        name = row.focal
381        label = row.label_focal
382        neighbor_label = row.label_neighbor
383        if neighbor_label == label:
384            sils.append(0)
385            continue
386        i_Xc = indices.index(name)
387        mask = labels == label
388        mask = np.nonzero(mask.values)[0]
389        within_cluster = self_dcache.get((i_Xc,label), full_distances[i_Xc,mask].mean())
390        self_dcache[(i_Xc,label)] = within_cluster
391        neighbor_mask = labels == neighbor_label
392        neighbor_mask = np.nonzero(neighbor_mask.values)[0]
393        if len(neighbor_mask) == 0:
394            sils.append(0)
395            warn('A link ({},{}) has been found to have an empty set of neighbors. '
396                 ' This may happen when a label assignment is missing for the neighbor unit.'
397                 ' Check that no labels are missing.'.format(row.focal, row.neighbor))
398            continue
399        outer_distance = full_distances[i_Xc,neighbor_mask].mean()
400        sils.append((outer_distance - within_cluster) / np.maximum(outer_distance, within_cluster))
401    result['silhouette'] = sils
402    return result.sort_values("focal").reset_index(drop=True)
403
404def nearest_label(data, labels,
405                  metric=skp.euclidean_distances,
406                  return_distance=False, keep_self=False):
407    """
408    Find the nearest label in attribute space.
409
410    Given the data and a set of labels in labels, this finds the label
411    whose mean center is closest to the observation in data.
412
413    Arguments
414    ---------
415    data : (N,P) array to cluster on or DataFrame indexed on the same values as
416           that in alist.focal/alist.neighbor
417    labels: (N,) array containing classifications, indexed on the same values
418                 as that in alist.focal/alist.neighbor
419    metric  :   callable, array,
420                a function that takes an argument (data) and returns the all-pairs
421                distances/dissimilarity between observations.
422    return_distance: bool
423                     Whether to return the distance from the observation to its nearest
424                     cluster in feature space. If True, the tuple of (nearest_label, dissim)
425                     is returned. If False, only the nearest_label array is returned.
426    keep_self:  bool
427                whether to allow observations to use their current cluster as their
428                nearest label. If True, an observation's existing cluster assignment can
429                also be the cluster it is closest to. If False, an observation's existing
430                cluster assignment cannot be the cluster it is closest to. This would mean
431                the function computes the nearest *alternative* cluster.
432
433    Returns
434    -------
435    (N_obs,) array of assignments reflect each observation's nearest label.
436
437    If return_distance is True, a tuple of ((N,) and (N,)) where the first
438        array is the assignment, and the second is the distance to the centroid
439        of that assignment.
440    """
441    if not HAS_REQUIREMENTS:
442        _raise_initial_error()
443
444    if callable(metric):
445        dissim = metric(data)
446    elif metric.lower == "precomputed":
447        assert data.shape == (labels.shape[0],labels.shape[0]), "dissimilarity matrix is malformed!"
448        dissim = data
449    elif isinstance(metric, np.ndarray):
450        assert metric.shape == (labels.shape[0],labels.shape[0]), "dissimilarity matrix is malformed!"
451        dissim = metric
452    unique_labels = np.unique(labels)
453    nearest_label = np.empty(labels.shape, dtype=labels.dtype)
454    nearest_label_dissim = np.empty(labels.shape)
455    for label in unique_labels:
456        this_label_mask = labels == label
457        n_in_label = this_label_mask.sum()
458        this_label_mask = np.nonzero(this_label_mask)[0]
459        next_best_fit = np.ones(this_label_mask.shape) * np.inf
460        next_best_label = np.empty(this_label_mask.shape, dtype=labels.dtype)
461        for neighbor in unique_labels:
462            if (neighbor == label) & (not keep_self):
463                continue
464            neighbor_label_mask = labels == neighbor
465            n_in_neighbor = neighbor_label_mask.sum()
466            neighbor_label_mask = np.nonzero(neighbor_label_mask)[0].reshape(1,-1)
467            # Need to account for the fact that the self-distance
468            # is not included in the silhouette; in small clusters,
469            # this extra zero can bring down the average, resulting in a case
470            # where the silhouette is negative, but the "nearest" cluster would
471            # be the current cluster if we take averages including i in C.
472            chunk = dissim[this_label_mask.reshape(-1,1), #these rows
473                           neighbor_label_mask] #and these columns
474            neighbor_distance = chunk.sum(axis=1) / np.maximum(n_in_neighbor-1, 1) #and sum across rows
475            next_best_label[neighbor_distance < next_best_fit] = neighbor
476            np.minimum(next_best_fit, neighbor_distance, next_best_fit)
477        nearest_label[this_label_mask] = next_best_label
478        nearest_label_dissim[this_label_mask] = next_best_fit
479    if return_distance:
480        return nearest_label, nearest_label_dissim
481    else:
482        return nearest_label
483