1import numpy as np
2import pandas as pd
3import warnings
4from scipy import sparse
5from sklearn.base import BaseEstimator
6from libpysal import weights
7from esda.crand import (
8    crand as _crand_plus,
9    njit as _njit,
10    _prepare_univariate,
11    _prepare_bivariate,
12)
13
14
15PERMUTATIONS = 999
16
17
18class Join_Counts_Local_BV(BaseEstimator):
19
20    """Univariate Local Join Count Statistic"""
21
22    def __init__(
23        self,
24        connectivity=None,
25        permutations=PERMUTATIONS,
26        n_jobs=1,
27        keep_simulations=True,
28        seed=None,
29    ):
30        """
31        Initialize a Local_Join_Counts_BV estimator
32        Arguments
33        ---------
34        connectivity     : scipy.sparse matrix object
35                           the connectivity structure describing
36                           the relationships between observed units.
37                           Need not be row-standardized.
38        permutations     : int
39                           number of random permutations for calculation of pseudo
40                           p_values
41        n_jobs           : int
42                           Number of cores to be used in the conditional randomisation. If -1,
43                           all available cores are used.
44        keep_simulations : Boolean
45                           (default=True)
46                           If True, the entire matrix of replications under the null
47                           is stored in memory and accessible; otherwise, replications
48                           are not saved
49        seed             : None/int
50                           Seed to ensure reproducibility of conditional randomizations.
51                           Must be set here, and not outside of the function, since numba
52                           does not correctly interpret external seeds
53                           nor numpy.random.RandomState instances.
54
55        """
56
57        self.connectivity = connectivity
58        self.permutations = permutations
59        self.n_jobs = n_jobs
60        self.keep_simulations = keep_simulations
61        self.seed = seed
62
63    def fit(self, x, z, case="CLC", n_jobs=1, permutations=999):
64        """
65        Arguments
66        ---------
67        x                : numpy.ndarray
68                           array containing binary (0/1) data
69        z                : numpy.ndarray
70                           array containing binary (0/1) data
71        Returns
72        -------
73        the fitted estimator.
74
75        Notes
76        -----
77        Technical details and derivations can be found in :cite:`AnselinLi2019`.
78
79        Examples
80        --------
81        >>> import libpysal
82        >>> w = libpysal.weights.lat2W(4, 4)
83        >>> x = np.ones(16)
84        >>> x[0:8] = 0
85        >>> z = [0,1,0,1,1,1,1,1,0,0,1,1,0,0,1,1]
86        >>> LJC_BV_C1 = Local_Join_Counts_BV(connectivity=w).fit(x, z, case="BJC")
87        >>> LJC_BV_C2 = Local_Join_Counts_BV(connectivity=w).fit(x, z, case="CLC")
88        >>> LJC_BV_C1.LJC
89        >>> LJC_BV_C1.p_sim
90        >>> LJC_BV_C2.LJC
91        >>> LJC_BV_C2.p_sim
92
93        Commpop data replicating GeoDa tutorial (Case 1)
94        >>> import libpysal
95        >>> import geopandas as gpd
96        >>> commpop = gpd.read_file("https://github.com/jeffcsauer/GSOC2020/raw/master/validation/data/commpop.gpkg")
97        >>> w = libpysal.weights.Queen.from_dataframe(commpop)
98        >>> LJC_BV_Case1 = Local_Join_Counts_BV(connectivity=w).fit(commpop['popneg'], commpop['popplus'], case='BJC')
99        >>> LJC_BV_Case1.LJC
100        >>> LJC_BV_Case1.p_sim
101
102        Guerry data replicating GeoDa tutorial (Case 2)
103        >>> import libpysal
104        >>> import geopandas as gpd
105        >>> guerry = libpysal.examples.load_example('Guerry')
106        >>> guerry_ds = gpd.read_file(guerry.get_path('Guerry.shp'))
107        >>> guerry_ds['infq5'] = 0
108        >>> guerry_ds['donq5'] = 0
109        >>> guerry_ds.loc[(guerry_ds['Infants'] > 23574), 'infq5'] = 1
110        >>> guerry_ds.loc[(guerry_ds['Donatns'] > 10973), 'donq5'] = 1
111        >>> w = libpysal.weights.Queen.from_dataframe(guerry_ds)
112        >>> LJC_BV_Case2 = Local_Join_Counts_BV(connectivity=w).fit(guerry_ds['infq5'], guerry_ds['donq5'], case='CLC')
113        >>> LJC_BV_Case2.LJC
114        >>> LJC_BV_Case2.p_sim
115        """
116        # Need to ensure that the np.array() are of
117        # dtype='float' for numba
118        x = np.array(x, dtype="float")
119        z = np.array(z, dtype="float")
120
121        w = self.connectivity
122        # Fill the diagonal with 0s
123        w = weights.util.fill_diagonal(w, val=0)
124        w.transform = "b"
125
126        self.x = x
127        self.z = z
128        self.n = len(x)
129        self.w = w
130        self.case = case
131
132        keep_simulations = self.keep_simulations
133        n_jobs = self.n_jobs
134        seed = self.seed
135
136        self.LJC = self._statistic(x, z, w, case=case)
137
138        if permutations:
139            if case == "BJC":
140                self.p_sim, self.rjoins = _crand_plus(
141                    z=np.column_stack((x, z)),
142                    w=self.w,
143                    observed=self.LJC,
144                    permutations=permutations,
145                    keep=True,
146                    n_jobs=n_jobs,
147                    stat_func=_ljc_bv_case1,
148                )
149                # Set p-values for those with LJC of 0 to NaN
150                self.p_sim[self.LJC == 0] = "NaN"
151            elif case == "CLC":
152                self.p_sim, self.rjoins = _crand_plus(
153                    z=np.column_stack((x, z)),
154                    w=self.w,
155                    observed=self.LJC,
156                    permutations=permutations,
157                    keep=True,
158                    n_jobs=n_jobs,
159                    stat_func=_ljc_bv_case2,
160                )
161                # Set p-values for those with LJC of 0 to NaN
162                self.p_sim[self.LJC == 0] = "NaN"
163            else:
164                raise NotImplementedError(
165                    f"The requested LJC method ({case}) \
166                is not currently supported!"
167                )
168
169        return self
170
171    @staticmethod
172    def _statistic(x, z, w, case):
173        # Create adjacency list. Note that remove_symmetric=False - this is
174        # different from the esda.Join_Counts() function.
175        adj_list = w.to_adjlist(remove_symmetric=False)
176
177        # First, set up a series that maps the values
178        # to the weights table
179        zseries_x = pd.Series(x, index=w.id_order)
180        zseries_z = pd.Series(z, index=w.id_order)
181
182        # Map the values to the focal (i) values
183        focal_x = zseries_x.loc[adj_list.focal].values
184        focal_z = zseries_z.loc[adj_list.focal].values
185
186        # Map the values to the neighbor (j) values
187        neighbor_x = zseries_x.loc[adj_list.neighbor].values
188        neighbor_z = zseries_z.loc[adj_list.neighbor].values
189
190        if case == "BJC":
191            BJC = (
192                (focal_x == 1) & (focal_z == 0) & (neighbor_x == 0) & (neighbor_z == 1)
193            )
194            adj_list_BJC = pd.DataFrame(
195                adj_list.focal.values, BJC.astype("uint8")
196            ).reset_index()
197            adj_list_BJC.columns = ["BJC", "ID"]
198            adj_list_BJC = adj_list_BJC.groupby(by="ID").sum()
199            return np.array(adj_list_BJC.BJC.values, dtype="float")
200        elif case == "CLC":
201            CLC = (
202                (focal_x == 1) & (focal_z == 1) & (neighbor_x == 1) & (neighbor_z == 1)
203            )
204            adj_list_CLC = pd.DataFrame(
205                adj_list.focal.values, CLC.astype("uint8")
206            ).reset_index()
207            adj_list_CLC.columns = ["CLC", "ID"]
208            adj_list_CLC = adj_list_CLC.groupby(by="ID").sum()
209            return np.array(adj_list_CLC.CLC.values, dtype="float")
210        else:
211            raise NotImplementedError(
212                f"The requested LJC method ({case}) \
213            is not currently supported!"
214            )
215
216
217# --------------------------------------------------------------
218# Conditional Randomization Function Implementations
219# --------------------------------------------------------------
220
221# Note: scaling not used
222
223
224@_njit(fastmath=True)
225def _ljc_bv_case1(i, z, permuted_ids, weights_i, scaling):
226    zx = z[:, 0]
227    zy = z[:, 1]
228    self_weight = weights_i[0]
229    other_weights = weights_i[1:]
230    zyi, zyrand = _prepare_univariate(i, zy, permuted_ids, other_weights)
231    return zx[i] * (zyrand @ other_weights)
232
233
234@_njit(fastmath=True)
235def _ljc_bv_case2(i, z, permuted_ids, weights_i, scaling):
236    zx = z[:, 0]
237    zy = z[:, 1]
238    self_weight = weights_i[0]
239    other_weights = weights_i[1:]
240    zxi, zxrand, zyi, zyrand = _prepare_bivariate(i, z, permuted_ids, other_weights)
241    zf = zxrand * zyrand
242    return zy[i] * (zf @ other_weights)
243