1import numpy as np 2import pandas as pd 3import warnings 4from scipy import sparse 5from sklearn.base import BaseEstimator 6from libpysal import weights 7from esda.crand import ( 8 crand as _crand_plus, 9 njit as _njit, 10 _prepare_univariate, 11 _prepare_bivariate, 12) 13 14 15PERMUTATIONS = 999 16 17 18class Join_Counts_Local_BV(BaseEstimator): 19 20 """Univariate Local Join Count Statistic""" 21 22 def __init__( 23 self, 24 connectivity=None, 25 permutations=PERMUTATIONS, 26 n_jobs=1, 27 keep_simulations=True, 28 seed=None, 29 ): 30 """ 31 Initialize a Local_Join_Counts_BV estimator 32 Arguments 33 --------- 34 connectivity : scipy.sparse matrix object 35 the connectivity structure describing 36 the relationships between observed units. 37 Need not be row-standardized. 38 permutations : int 39 number of random permutations for calculation of pseudo 40 p_values 41 n_jobs : int 42 Number of cores to be used in the conditional randomisation. If -1, 43 all available cores are used. 44 keep_simulations : Boolean 45 (default=True) 46 If True, the entire matrix of replications under the null 47 is stored in memory and accessible; otherwise, replications 48 are not saved 49 seed : None/int 50 Seed to ensure reproducibility of conditional randomizations. 51 Must be set here, and not outside of the function, since numba 52 does not correctly interpret external seeds 53 nor numpy.random.RandomState instances. 54 55 """ 56 57 self.connectivity = connectivity 58 self.permutations = permutations 59 self.n_jobs = n_jobs 60 self.keep_simulations = keep_simulations 61 self.seed = seed 62 63 def fit(self, x, z, case="CLC", n_jobs=1, permutations=999): 64 """ 65 Arguments 66 --------- 67 x : numpy.ndarray 68 array containing binary (0/1) data 69 z : numpy.ndarray 70 array containing binary (0/1) data 71 Returns 72 ------- 73 the fitted estimator. 74 75 Notes 76 ----- 77 Technical details and derivations can be found in :cite:`AnselinLi2019`. 78 79 Examples 80 -------- 81 >>> import libpysal 82 >>> w = libpysal.weights.lat2W(4, 4) 83 >>> x = np.ones(16) 84 >>> x[0:8] = 0 85 >>> z = [0,1,0,1,1,1,1,1,0,0,1,1,0,0,1,1] 86 >>> LJC_BV_C1 = Local_Join_Counts_BV(connectivity=w).fit(x, z, case="BJC") 87 >>> LJC_BV_C2 = Local_Join_Counts_BV(connectivity=w).fit(x, z, case="CLC") 88 >>> LJC_BV_C1.LJC 89 >>> LJC_BV_C1.p_sim 90 >>> LJC_BV_C2.LJC 91 >>> LJC_BV_C2.p_sim 92 93 Commpop data replicating GeoDa tutorial (Case 1) 94 >>> import libpysal 95 >>> import geopandas as gpd 96 >>> commpop = gpd.read_file("https://github.com/jeffcsauer/GSOC2020/raw/master/validation/data/commpop.gpkg") 97 >>> w = libpysal.weights.Queen.from_dataframe(commpop) 98 >>> LJC_BV_Case1 = Local_Join_Counts_BV(connectivity=w).fit(commpop['popneg'], commpop['popplus'], case='BJC') 99 >>> LJC_BV_Case1.LJC 100 >>> LJC_BV_Case1.p_sim 101 102 Guerry data replicating GeoDa tutorial (Case 2) 103 >>> import libpysal 104 >>> import geopandas as gpd 105 >>> guerry = libpysal.examples.load_example('Guerry') 106 >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp')) 107 >>> guerry_ds['infq5'] = 0 108 >>> guerry_ds['donq5'] = 0 109 >>> guerry_ds.loc[(guerry_ds['Infants'] > 23574), 'infq5'] = 1 110 >>> guerry_ds.loc[(guerry_ds['Donatns'] > 10973), 'donq5'] = 1 111 >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds) 112 >>> LJC_BV_Case2 = Local_Join_Counts_BV(connectivity=w).fit(guerry_ds['infq5'], guerry_ds['donq5'], case='CLC') 113 >>> LJC_BV_Case2.LJC 114 >>> LJC_BV_Case2.p_sim 115 """ 116 # Need to ensure that the np.array() are of 117 # dtype='float' for numba 118 x = np.array(x, dtype="float") 119 z = np.array(z, dtype="float") 120 121 w = self.connectivity 122 # Fill the diagonal with 0s 123 w = weights.util.fill_diagonal(w, val=0) 124 w.transform = "b" 125 126 self.x = x 127 self.z = z 128 self.n = len(x) 129 self.w = w 130 self.case = case 131 132 keep_simulations = self.keep_simulations 133 n_jobs = self.n_jobs 134 seed = self.seed 135 136 self.LJC = self._statistic(x, z, w, case=case) 137 138 if permutations: 139 if case == "BJC": 140 self.p_sim, self.rjoins = _crand_plus( 141 z=np.column_stack((x, z)), 142 w=self.w, 143 observed=self.LJC, 144 permutations=permutations, 145 keep=True, 146 n_jobs=n_jobs, 147 stat_func=_ljc_bv_case1, 148 ) 149 # Set p-values for those with LJC of 0 to NaN 150 self.p_sim[self.LJC == 0] = "NaN" 151 elif case == "CLC": 152 self.p_sim, self.rjoins = _crand_plus( 153 z=np.column_stack((x, z)), 154 w=self.w, 155 observed=self.LJC, 156 permutations=permutations, 157 keep=True, 158 n_jobs=n_jobs, 159 stat_func=_ljc_bv_case2, 160 ) 161 # Set p-values for those with LJC of 0 to NaN 162 self.p_sim[self.LJC == 0] = "NaN" 163 else: 164 raise NotImplementedError( 165 f"The requested LJC method ({case}) \ 166 is not currently supported!" 167 ) 168 169 return self 170 171 @staticmethod 172 def _statistic(x, z, w, case): 173 # Create adjacency list. Note that remove_symmetric=False - this is 174 # different from the esda.Join_Counts() function. 175 adj_list = w.to_adjlist(remove_symmetric=False) 176 177 # First, set up a series that maps the values 178 # to the weights table 179 zseries_x = pd.Series(x, index=w.id_order) 180 zseries_z = pd.Series(z, index=w.id_order) 181 182 # Map the values to the focal (i) values 183 focal_x = zseries_x.loc[adj_list.focal].values 184 focal_z = zseries_z.loc[adj_list.focal].values 185 186 # Map the values to the neighbor (j) values 187 neighbor_x = zseries_x.loc[adj_list.neighbor].values 188 neighbor_z = zseries_z.loc[adj_list.neighbor].values 189 190 if case == "BJC": 191 BJC = ( 192 (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1) 193 ) 194 adj_list_BJC = pd.DataFrame( 195 adj_list.focal.values, BJC.astype("uint8") 196 ).reset_index() 197 adj_list_BJC.columns = ["BJC", "ID"] 198 adj_list_BJC = adj_list_BJC.groupby(by="ID").sum() 199 return np.array(adj_list_BJC.BJC.values, dtype="float") 200 elif case == "CLC": 201 CLC = ( 202 (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1) 203 ) 204 adj_list_CLC = pd.DataFrame( 205 adj_list.focal.values, CLC.astype("uint8") 206 ).reset_index() 207 adj_list_CLC.columns = ["CLC", "ID"] 208 adj_list_CLC = adj_list_CLC.groupby(by="ID").sum() 209 return np.array(adj_list_CLC.CLC.values, dtype="float") 210 else: 211 raise NotImplementedError( 212 f"The requested LJC method ({case}) \ 213 is not currently supported!" 214 ) 215 216 217# -------------------------------------------------------------- 218# Conditional Randomization Function Implementations 219# -------------------------------------------------------------- 220 221# Note: scaling not used 222 223 224@_njit(fastmath=True) 225def _ljc_bv_case1(i, z, permuted_ids, weights_i, scaling): 226 zx = z[:, 0] 227 zy = z[:, 1] 228 self_weight = weights_i[0] 229 other_weights = weights_i[1:] 230 zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, other_weights) 231 return zx[i] * (zyrand @ other_weights) 232 233 234@_njit(fastmath=True) 235def _ljc_bv_case2(i, z, permuted_ids, weights_i, scaling): 236 zx = z[:, 0] 237 zy = z[:, 1] 238 self_weight = weights_i[0] 239 other_weights = weights_i[1:] 240 zxi, zxrand, zyi, zyrand = _prepare_bivariate(i, z, permuted_ids, other_weights) 241 zf = zxrand * zyrand 242 return zy[i] * (zf @ other_weights) 243