1import numpy as np 2from scipy.sparse import csgraph as cg 3from scipy import sparse as sp 4 5try: 6 import sklearn.metrics as sk 7 import sklearn.metrics.pairwise as skp 8 from sklearn.preprocessing import LabelEncoder 9 import pandas as pd 10 HAS_REQUIREMENTS = True 11except ImportError as e: 12 HAS_REQUIREMENTS = False 13 14def _raise_initial_error(): 15 missing = [] 16 try: 17 import sklearn 18 except ImportError: 19 missing.append('scikit-learn') 20 try: 21 import pandas 22 except ImportError: 23 missing.append('pandas') 24 raise ImportError("this function requires scikit-learn and " 25 "pandas to be installed. Missing {}".format(','.join(missing))) 26 27__all__ = ['path_silhouette', 'boundary_silhouette', 'silhouette_alist', 'nearest_label'] 28 29def path_silhouette(data, labels, W, D=None, 30 metric=skp.euclidean_distances, 31 closest=False, return_nbfc=False, 32 return_nbfc_score=False, 33 return_paths=False, 34 directed=False): 35 """ 36 Compute a path silhouette for all observations :cite:`wolf2019geosilhouettes,Rousseeuw1987`. 37 38 39 Parameters 40 ----------- 41 data : np.ndarray (N,P) 42 matrix of data with N observations and P covariates. 43 labels : np.ndarray (N,) 44 flat vector of the L labels assigned over N observations. 45 W : pysal.W object 46 spatial weights object reflecting the spatial connectivity 47 in the problem under analysis 48 D : np.ndarray (N,N) 49 a precomputed distance matrix to apply over W. If passed, 50 takes precedence over data, and data is ignored. 51 metric : callable 52 function mapping the (N,P) data into an (N,N) dissimilarity matrix, 53 like that found in scikit.metrics.pairwise or scipy.spatial.distance 54 closest : bool 55 whether or not to consider the observation "connected" when it 56 is first connected to the cluster, or considering the path cost 57 to transit through the cluster. If True, the path cost is assessed 58 between i and the path-closest j in each cluster. If False, the path 59 cost is assessed as the average of path costs between i and all j 60 in each cluster 61 return_nbfc : bool 62 Whether or not to return the label of the next best fit 63 cluster 64 return_nbfc_score: bool 65 Whether or not to return the score of the next best fit 66 cluster. 67 return_paths : bool 68 Whether or not to return the matrix of shortest path 69 lengths after having computed them. 70 directed : bool 71 whether to consider the weights matrix as directed or undirected. 72 If directed, asymmetry in the input W is heeded. If not, 73 asymmetry is ignored. 74 75 Returns 76 -------- 77 An (N_obs,) array of the path silhouette values for each observation. 78 """ 79 if not HAS_REQUIREMENTS: 80 _raise_initial_error() 81 82 if D is None: 83 D = metric(data) 84 #polymorphic for sparse & dense input 85 assert 0 == (D < 0).sum(), "Distance metric has negative values, which is not supported" 86 off_diag_zeros = (D + np.eye(D.shape[0])) == 0 87 D[off_diag_zeros] = -1 88 Wm = sp.csr_matrix(W.sparse) 89 DW = sp.csr_matrix(Wm.multiply(D)) 90 DW.eliminate_zeros() 91 DW[DW < 0] = 0 92 assert 0 == (DW < 0).sum() 93 all_pairs = cg.shortest_path(DW, directed=directed) 94 labels = np.asarray(labels) 95 if W.n_components > 1: 96 from libpysal.weights.util import WSP 97 psils_ = np.empty(W.n, dtype=float) 98 closest_connecting_label_ = np.empty(W.n, dtype=labels.dtype) 99 closest_connection_score_ = np.empty(W.n, dtype=labels.dtype) 100 for component in np.unique(W.component_labels): 101 this_component_mask = np.nonzero(W.component_labels == component)[0] 102 subgraph = W.sparse[this_component_mask.reshape(-1,1), # these rows 103 this_component_mask.reshape(1,-1)] #these columns 104 subgraph_W = WSP(subgraph).to_W() 105 assert subgraph_W.n_components == 1 106 # DW operation is idempotent 107 subgraph_D = DW[this_component_mask.reshape(-1,1), # these rows 108 this_component_mask.reshape(1,-1)] #these columns 109 subgraph_labels = labels[this_component_mask] 110 n_subgraph_labels = len(np.unique(subgraph_labels)) 111 if not (2 < n_subgraph_labels < (subgraph_W.n-1)) : 112 psils = subgraph_solutions = [0]*subgraph_W.n 113 closest_connecting_label = [np.nan]*subgraph_W.n 114 closest_connection_score = [np.inf]*subgraph_W.n 115 else: 116 subgraph_solutions = path_silhouette(data=None, labels=subgraph_labels, W=subgraph_W, D=subgraph_D, 117 metric=metric, closest=closest, return_nbfc=return_nbfc, 118 return_nbfc_score=return_nbfc_score, 119 return_paths=return_paths, 120 directed=directed) 121 # always throw away all_pairs, since we already have it built 122 if (return_nbfc or return_nbfc_score) and return_paths: 123 if return_nbfc_score: 124 (psils, closest_connecting_label, 125 closest_connection_score, 126 _) = subgraph_solutions 127 else: 128 psils, closest_connecting_label, _ = subgraph_solutions 129 elif return_nbfc_score: 130 psils, closest_connecting_label, closest_connection_score = subgraph_solutions 131 elif return_nbfc: 132 psils, closest_connecting_label = subgraph_solutions 133 elif return_paths: 134 psils, _ = subgraph_solutions 135 else: 136 psils = subgraph_solutions 137 if return_nbfc: 138 closest_connecting_label_[this_component_mask] = closest_connecting_label 139 if return_nbfc_score: 140 closest_connection_score_[this_component_mask] = closest_connection_score 141 psils_[this_component_mask] = psils 142 closest_connection_score = closest_connection_score_ 143 closest_connecting_label = closest_connecting_label_ 144 psils = psils_ 145 # Single Connected Component 146 elif closest is False: 147 psils = sk.silhouette_samples(all_pairs, labels, metric='precomputed') 148 if return_nbfc or return_nbfc_score: 149 closest_connecting_label = [] 150 closest_connection_score = [] 151 for i, label in enumerate(labels): 152 row = all_pairs[i].copy() 153 in_label = labels == label 154 masked_label = row.copy() # for observations in the row 155 masked_label[in_label] = np.inf # make those in cluster infinite 156 nearest_not_in_cluster = np.argmin(masked_label) # find the closest 157 nearest_not_in_cluster_label = labels[nearest_not_in_cluster] #label 158 nearest_not_in_cluster_score = masked_label[nearest_not_in_cluster] 159 closest_connecting_label.append(nearest_not_in_cluster_label) 160 closest_connection_score.append(nearest_not_in_cluster_score) 161 else: 162 psils = [] 163 closest_connecting_label = [] 164 closest_connection_score = [] 165 for i,label in enumerate(labels): 166 row = all_pairs[i] 167 in_label = labels == label 168 #required to make argmin pertain to N, not N - len(in_label) 169 masked_label = row.copy() 170 masked_label[in_label] = np.inf 171 nearest_not_in_cluster = np.argmin(masked_label) 172 nearest_not_in_cluster_score = row[nearest_not_in_cluster] 173 nearest_not_in_cluster_label = labels[nearest_not_in_cluster] 174 175 average_interconnect_in_cluster = row[in_label].mean() 176 psil = nearest_not_in_cluster_score - average_interconnect_in_cluster 177 psil /= np.maximum(nearest_not_in_cluster_score, average_interconnect_in_cluster) 178 psils.append(psil) 179 closest_connecting_label.append(nearest_not_in_cluster_label) 180 closest_connection_score.append(nearest_not_in_cluster_score) 181 psils = np.asarray(psils) 182 if (return_nbfc or return_nbfc_score) and return_paths: 183 if return_nbfc_score: 184 out = (psils, np.asarray(closest_connecting_label), 185 np.asarray(closest_connection_score), 186 all_pairs ) 187 else: 188 out = psils, np.asarray(closest_connecting_label), all_pairs 189 elif return_nbfc_score: 190 out = psils, np.asarray(closest_connecting_label),\ 191 np.asarray(closest_connection_score) 192 elif return_nbfc: 193 out = psils, np.asarray(closest_connecting_label) 194 elif return_paths: 195 out = psils, all_pairs 196 else: 197 out = psils 198 return out 199 200def boundary_silhouette(data, labels, W, metric=skp.euclidean_distances): 201 """ 202 Compute the observation-level boundary silhouette score :cite:`wolf2019geosilhouettes`. 203 204 Arguments 205 --------- 206 data : (N_obs,P) numpy array 207 an array of covariates to analyze. Each row should be one 208 observation, and each clumn should be one feature. 209 labels : (N_obs,) array of labels 210 the labels corresponding to the group each observation is assigned. 211 W : pysal.weights.W object 212 a spatial weights object containing the connectivity structure 213 for the data 214 metric : callable, array, 215 a function that takes an argument (data) and returns the all-pairs 216 distances/dissimilarity between observations. 217 218 Returns 219 ------- 220 (N_obs,) array of boundary silhouette values for each observation 221 222 Notes 223 ----- 224 The boundary silhouette is the silhouette score using only spatially-proximate 225 clusters as candidates for the next-best-fit distance function (the 226 b(i) function in :cite:`Rousseeuw1987`. 227 This restricts the next-best-fit cluster to be the set of clusters on which 228 an observation neighbors. 229 So, instead of considering *all* clusters when finding the next-best-fit cluster, 230 only clusters that `i` borders are considered. 231 This is supposed to model the fact that, in spatially-constrained clustering, 232 observation i can only be reassigned from cluster c to cluster k if some observation 233 j neighbors i and also resides in k. 234 235 If an observation only neighbors its own cluster, i.e. is not on the boundary 236 of a cluster, this value is zero. 237 238 If a cluster has exactly one observation, this value is zero. 239 240 If an observation is on the boundary of more than one cluster, then the 241 best candidate is chosen from the set of clusters on which the observation borders. 242 243 metric is a callable mapping an (N,P) data into an (N,N) distance matrix OR 244 an (N,N) distance matrix already. 245 """ 246 if not HAS_REQUIREMENTS: 247 _raise_initial_error() 248 249 alist = W.to_adjlist() 250 labels = np.asarray(labels) 251 if callable(metric): 252 full_distances = metric(data) 253 elif isinstance(metric, np.ndarray): 254 n_obs = W.n 255 if metric.shape == (n_obs, n_obs): 256 full_distances = metric 257 else: 258 raise ValueError('Precomputed metric is supplied, but is not the right shape.' 259 ' The dissimilarity matrix should be of shape ({},{}), but was' 260 ' of shape ({},{})'.format(W.n, W.n, *metric.shape)) 261 else: 262 raise ValueError('The provided metric is neither a dissmilarity function' 263 ' nor a dissimilarity matrix.') 264 assert 0 == (full_distances < 0).sum(), ("Distance metric has negative values, " 265 "which is not supported") 266 label_frame = pd.DataFrame(labels, index=W.id_order, columns=['label']) 267 alist = alist.merge(label_frame, left_on='focal', right_index=True, how='left')\ 268 .merge(label_frame, left_on='neighbor', right_index=True, how='left', 269 suffixes=('_focal', '_neighbor')) 270 alist['boundary'] = alist.label_focal != alist.label_neighbor 271 focals = alist.groupby('focal') 272 bmask = focals.boundary.any() 273 result = [] 274 np.seterr(all='raise') 275 for i, (ix,bnd) in enumerate(bmask.iteritems()): 276 if not bnd: 277 result.append(np.array([0])) 278 continue 279 sil_score = np.array([np.inf]) 280 label = labels[i] 281 focal_mask = np.nonzero(labels == label)[0] 282 if len(focal_mask) == 1: #the candidate is singleton 283 result.append(np.array([0])) 284 continue 285 neighbors = alist.query("focal == {}".format(ix)).label_neighbor 286 mean_dissim = full_distances[i,focal_mask].sum() / (len(focal_mask)-1) 287 if not np.isfinite(mean_dissim).all(): 288 raise ValueError('A non-finite mean dissimilarity between groups' 289 ' and the boundary observation occurred. Please ensure' 290 ' the data & labels are formatted and shaped correctly.') 291 neighbor_score = np.array([np.inf]) 292 for neighbor in set(neighbors).difference([label]): 293 other_mask = np.nonzero(labels == neighbor)[0] 294 other_score = full_distances[i,other_mask].mean() 295 neighbor_score = np.minimum(neighbor_score, other_score, neighbor_score) 296 if neighbor_score < 0: 297 raise ValueError('A negative neighborhood similarity value occurred. ' 298 'This should not happen. Please create a bug report on' 299 'https://github.com/pysal/esda/issues') 300 sil_score = (neighbor_score - mean_dissim) / np.maximum(neighbor_score, 301 mean_dissim) 302 result.append(sil_score) 303 if len(result) != len(labels): 304 raise ValueError('The number of boundary silhouettes does not match the number of' 305 ' observations.' 306 'This should not happen. Please create a bug report on' 307 'https://github.com/pysal/esda/issues') 308 return np.asarray(result).squeeze() 309 310def silhouette_alist(data, labels, alist, indices=None, 311 metric=skp.euclidean_distances): 312 """ 313 Compute the silhouette for each edge in an adjacency graph. Given the alist 314 containing `focal` id, `neighbor` id, and `label_focal`, and `label_neighbor`, 315 this computes: 316 317 `d(i,label_neighbor) - d(i,label_focal) / (max(d(i,label_neighbor), d(i,label_focal)))` 318 319 Arguments 320 --------- 321 data : (N,P) array to cluster on or DataFrame indexed on the same values as 322 that in alist.focal/alist.neighbor 323 labels: (N,) array containing classifications, indexed on the same values 324 as that in alist.focal/alist.neighbor 325 alist: adjacency list containing columns focal & neighbor, 326 describing one edge of the graph. 327 indices: (N,) array containing the "name" for observations in 328 alist to be linked to data. indices should be: 329 1. aligned with data by iteration order 330 2. include all values in the alist.focal set. 331 if alist.focal and alist.neighbor are strings, then indices should be 332 a list/array of strings aligned with the rows of data. 333 if not provided and labels is a series/dataframe, 334 then its index will be used. 335 metric : callable, array, 336 a function that takes an argument (data) and returns the all-pairs 337 distances/dissimilarity between observations. 338 339 Results 340 ------- 341 pandas.DataFrame, copy of the adjacency list `alist`, with an additional 342 column called `silhouette` that contains the pseudo-silhouette values 343 expressing the relative dissimilarity between neighboring observations. 344 """ 345 if not HAS_REQUIREMENTS: 346 _raise_initial_error() 347 348 n_obs = data.shape[0] 349 if callable(metric): 350 full_distances = metric(data) 351 elif isinstance(metric, np.ndarray): 352 if metric.shape == (n_obs, n_obs): 353 full_distances = metric 354 if isinstance(data, pd.DataFrame): 355 indices = data.index 356 if isinstance(labels, (pd.DataFrame, pd.Series)) and indices is None: 357 indices = labels.index 358 elif indices is not None \ 359 and not isinstance(labels, (pd.DataFrame, pd.Series)): 360 labels = pd.Series(labels, index=indices) 361 elif indices is None \ 362 and not isinstance(labels, (pd.DataFrame, pd.Series)): 363 indices = np.arange(len(labels)) 364 labels = pd.Series(labels, index=indices) 365 if isinstance(labels, pd.DataFrame): 366 labels = pd.Series(labels.values, index=labels.index) 367 assert indices is not None 368 assert isinstance(labels, pd.Series) 369 labels = labels.to_frame("label") 370 371 result = alist.sort_values('focal').copy(deep=True) 372 373 result = result.merge(labels, left_on='focal', right_index=True, how='left')\ 374 .merge(labels, left_on='neighbor', right_index=True, how='left', 375 suffixes=('_focal','_neighbor')) 376 self_dcache = dict() 377 sils = [] 378 indices = list(indices) 379 for i_alist, row in result.iterrows(): 380 name = row.focal 381 label = row.label_focal 382 neighbor_label = row.label_neighbor 383 if neighbor_label == label: 384 sils.append(0) 385 continue 386 i_Xc = indices.index(name) 387 mask = labels == label 388 mask = np.nonzero(mask.values)[0] 389 within_cluster = self_dcache.get((i_Xc,label), full_distances[i_Xc,mask].mean()) 390 self_dcache[(i_Xc,label)] = within_cluster 391 neighbor_mask = labels == neighbor_label 392 neighbor_mask = np.nonzero(neighbor_mask.values)[0] 393 if len(neighbor_mask) == 0: 394 sils.append(0) 395 warn('A link ({},{}) has been found to have an empty set of neighbors. ' 396 ' This may happen when a label assignment is missing for the neighbor unit.' 397 ' Check that no labels are missing.'.format(row.focal, row.neighbor)) 398 continue 399 outer_distance = full_distances[i_Xc,neighbor_mask].mean() 400 sils.append((outer_distance - within_cluster) / np.maximum(outer_distance, within_cluster)) 401 result['silhouette'] = sils 402 return result.sort_values("focal").reset_index(drop=True) 403 404def nearest_label(data, labels, 405 metric=skp.euclidean_distances, 406 return_distance=False, keep_self=False): 407 """ 408 Find the nearest label in attribute space. 409 410 Given the data and a set of labels in labels, this finds the label 411 whose mean center is closest to the observation in data. 412 413 Arguments 414 --------- 415 data : (N,P) array to cluster on or DataFrame indexed on the same values as 416 that in alist.focal/alist.neighbor 417 labels: (N,) array containing classifications, indexed on the same values 418 as that in alist.focal/alist.neighbor 419 metric : callable, array, 420 a function that takes an argument (data) and returns the all-pairs 421 distances/dissimilarity between observations. 422 return_distance: bool 423 Whether to return the distance from the observation to its nearest 424 cluster in feature space. If True, the tuple of (nearest_label, dissim) 425 is returned. If False, only the nearest_label array is returned. 426 keep_self: bool 427 whether to allow observations to use their current cluster as their 428 nearest label. If True, an observation's existing cluster assignment can 429 also be the cluster it is closest to. If False, an observation's existing 430 cluster assignment cannot be the cluster it is closest to. This would mean 431 the function computes the nearest *alternative* cluster. 432 433 Returns 434 ------- 435 (N_obs,) array of assignments reflect each observation's nearest label. 436 437 If return_distance is True, a tuple of ((N,) and (N,)) where the first 438 array is the assignment, and the second is the distance to the centroid 439 of that assignment. 440 """ 441 if not HAS_REQUIREMENTS: 442 _raise_initial_error() 443 444 if callable(metric): 445 dissim = metric(data) 446 elif metric.lower == "precomputed": 447 assert data.shape == (labels.shape[0],labels.shape[0]), "dissimilarity matrix is malformed!" 448 dissim = data 449 elif isinstance(metric, np.ndarray): 450 assert metric.shape == (labels.shape[0],labels.shape[0]), "dissimilarity matrix is malformed!" 451 dissim = metric 452 unique_labels = np.unique(labels) 453 nearest_label = np.empty(labels.shape, dtype=labels.dtype) 454 nearest_label_dissim = np.empty(labels.shape) 455 for label in unique_labels: 456 this_label_mask = labels == label 457 n_in_label = this_label_mask.sum() 458 this_label_mask = np.nonzero(this_label_mask)[0] 459 next_best_fit = np.ones(this_label_mask.shape) * np.inf 460 next_best_label = np.empty(this_label_mask.shape, dtype=labels.dtype) 461 for neighbor in unique_labels: 462 if (neighbor == label) & (not keep_self): 463 continue 464 neighbor_label_mask = labels == neighbor 465 n_in_neighbor = neighbor_label_mask.sum() 466 neighbor_label_mask = np.nonzero(neighbor_label_mask)[0].reshape(1,-1) 467 # Need to account for the fact that the self-distance 468 # is not included in the silhouette; in small clusters, 469 # this extra zero can bring down the average, resulting in a case 470 # where the silhouette is negative, but the "nearest" cluster would 471 # be the current cluster if we take averages including i in C. 472 chunk = dissim[this_label_mask.reshape(-1,1), #these rows 473 neighbor_label_mask] #and these columns 474 neighbor_distance = chunk.sum(axis=1) / np.maximum(n_in_neighbor-1, 1) #and sum across rows 475 next_best_label[neighbor_distance < next_best_fit] = neighbor 476 np.minimum(next_best_fit, neighbor_distance, next_best_fit) 477 nearest_label[this_label_mask] = next_best_label 478 nearest_label_dissim[this_label_mask] = next_best_fit 479 if return_distance: 480 return nearest_label, nearest_label_dissim 481 else: 482 return nearest_label 483