1"""
2ML Estimation of Spatial Error Model
3"""
4
5__author__ = "Luc Anselin luc.anselin@asu.edu,\
6              Serge Rey srey@asu.edu, \
7              Levi Wolf levi.john.wolf@asu.edu"
8
9import numpy as np
10import numpy.linalg as la
11from scipy import sparse as sp
12from scipy.sparse.linalg import splu as SuperLU
13from .utils import RegressionPropsY, RegressionPropsVM, set_warn
14from . import diagnostics as DIAG
15from . import user_output as USER
16from . import summary_output as SUMMARY
17from . import regimes as REGI
18from .w_utils import symmetrize
19try:
20    from scipy.optimize import minimize_scalar
21    minimize_scalar_available = True
22except ImportError:
23    minimize_scalar_available = False
24from .sputils import spdot, spfill_diagonal, spinv
25from libpysal import weights
26
27__all__ = ["ML_Error"]
28
29
30class BaseML_Error(RegressionPropsY, RegressionPropsVM, REGI.Regimes_Frame):
31
32    """
33    ML estimation of the spatial error model (note no consistency
34    checks, diagnostics or constants added): :cite:`Anselin1988`
35
36    Parameters
37    ----------
38    y            : array
39                   nx1 array for dependent variable
40    x            : array
41                   Two dimensional array with n rows and one column for each
42                   independent (exogenous) variable, excluding the constant
43    w            : Sparse matrix
44                   Spatial weights sparse matrix
45    method       : string
46                   if 'full', brute force calculation (full matrix expressions)
47                   if 'ord', Ord eigenvalue calculation
48                   if 'LU', LU decomposition for sparse matrices
49    epsilon      : float
50                   tolerance criterion in mimimize_scalar function and inverse_product
51    regimes_att  : dictionary
52                   Dictionary containing elements to be used in case of a regimes model,
53                   i.e. 'x' before regimes, 'regimes' list and 'cols2regi'
54
55
56    Attributes
57    ----------
58    betas        : array
59                   kx1 array of estimated coefficients
60    lam          : float
61                   estimate of spatial autoregressive coefficient
62    u            : array
63                   nx1 array of residuals
64    e_filtered   : array
65                   spatially filtered residuals
66    predy        : array
67                   nx1 array of predicted y values
68    n            : integer
69                   Number of observations
70    k            : integer
71                   Number of variables for which coefficients are estimated
72                   (including the constant, excluding the rho)
73    y            : array
74                   nx1 array for dependent variable
75    x            : array
76                   Two dimensional array with n rows and one column for each
77                   independent (exogenous) variable, including the constant
78    method       : string
79                   log Jacobian method
80                   if 'full': brute force (full matrix computations)
81                   if 'ord' : Ord eigenvalue method
82    epsilon      : float
83                   tolerance criterion used in minimize_scalar function and inverse_product
84    mean_y       : float
85                   Mean of dependent variable
86    std_y        : float
87                   Standard deviation of dependent variable
88    vm           : array
89                   Variance covariance matrix (k+1 x k+1) - includes lambda
90    vm1          : array
91                   2x2 array of variance covariance for lambda, sigma
92    sig2         : float
93                   Sigma squared used in computations
94    logll        : float
95                   maximized log-likelihood (including constant terms)
96
97    Examples
98    --------
99    >>> import numpy as np
100    >>> import libpysal
101    >>> from libpysal import examples
102    >>> from libpysal.examples import load_example
103    >>> import spreg
104    >>> np.set_printoptions(suppress=True) #prevent scientific format
105    >>> south = load_example('South')
106    >>> db = libpysal.io.open(south.get_path("south.dbf"),'r')
107    >>> y_name = "HR90"
108    >>> y = np.array(db.by_col(y_name))
109    >>> y.shape = (len(y),1)
110    >>> x_names = ["RD90","PS90","UE90","DV90"]
111    >>> x = np.array([db.by_col(var) for var in x_names]).T
112    >>> x = np.hstack((np.ones((len(y),1)),x))
113    >>> w = libpysal.weights.Queen.from_shapefile(south.get_path("south.shp"))
114    >>> w.transform = 'r'
115    >>> mlerr = spreg.ml.error.BaseML_Error(y,x,w) #doctest: +SKIP
116    >>> "{0:.6f}".format(mlerr.lam) #doctest: +SKIP
117    '0.299078'
118    >>> np.around(mlerr.betas, decimals=4) #doctest: +SKIP
119    array([[ 6.1492],
120           [ 4.4024],
121           [ 1.7784],
122           [-0.3781],
123           [ 0.4858],
124           [ 0.2991]])
125    >>> "{0:.6f}".format(mlerr.mean_y) #doctest: +SKIP
126    '9.549293'
127    >>> "{0:.6f}".format(mlerr.std_y) #doctest: +SKIP
128    '7.038851'
129    >>> np.diag(mlerr.vm) #doctest: +SKIP
130    array([ 1.06476526,  0.05548248,  0.04544514,  0.00614425,  0.01481356,
131            0.00143001])
132    >>> "{0:.6f}".format(mlerr.sig2[0][0]) #doctest: +SKIP
133    '32.406854'
134    >>> "{0:.6f}".format(mlerr.logll) #doctest: +SKIP
135    '-4471.407067'
136    >>> mlerr1 = BaseML_Error(y,x,w,method='ord') #doctest: +SKIP
137    >>> "{0:.6f}".format(mlerr1.lam) #doctest: +SKIP
138    '0.299078'
139    >>> np.around(mlerr1.betas, decimals=4) #doctest: +SKIP
140    array([[ 6.1492],
141           [ 4.4024],
142           [ 1.7784],
143           [-0.3781],
144           [ 0.4858],
145           [ 0.2991]])
146    >>> "{0:.6f}".format(mlerr1.mean_y) #doctest: +SKIP
147    '9.549293'
148    >>> "{0:.6f}".format(mlerr1.std_y) #doctest: +SKIP
149    '7.038851'
150    >>> np.around(np.diag(mlerr1.vm), decimals=4) #doctest: +SKIP
151    array([ 1.0648,  0.0555,  0.0454,  0.0061,  0.0148,  0.0014])
152    >>> "{0:.4f}".format(mlerr1.sig2[0][0]) #doctest: +SKIP
153    '32.4069'
154    >>> "{0:.4f}".format(mlerr1.logll) #doctest: +SKIP
155    '-4471.4071'
156
157    """
158
159    def __init__(self, y, x, w, method='full', epsilon=0.0000001, regimes_att=None):
160        # set up main regression variables and spatial filters
161        self.y = y
162        if regimes_att:
163            self.x = x.toarray()
164        else:
165            self.x = x
166        self.n, self.k = self.x.shape
167        self.method = method
168        self.epsilon = epsilon
169
170        #W = w.full()[0] #wait to build pending what is needed
171        #Wsp = w.sparse
172
173        ylag = weights.lag_spatial(w, self.y)
174        xlag = self.get_x_lag(w, regimes_att)
175
176        # call minimizer using concentrated log-likelihood to get lambda
177        methodML = method.upper()
178        if methodML in ['FULL', 'LU', 'ORD']:
179            if methodML == 'FULL':
180                W = w.full()[0]      # need dense here
181                res = minimize_scalar(err_c_loglik, 0.0, bounds=(-1.0, 1.0),
182                                      args=(self.n, self.y, ylag, self.x,
183                                            xlag, W), method='bounded',
184                                      tol=epsilon)
185            elif methodML == 'LU':
186                I = sp.identity(w.n)
187                Wsp = w.sparse   # need sparse here
188                res = minimize_scalar(err_c_loglik_sp, 0.0, bounds=(-1.0,1.0),
189                                      args=(self.n, self.y, ylag,
190                                            self.x, xlag, I, Wsp),
191                                      method='bounded', tol=epsilon)
192                W = Wsp
193            elif methodML == 'ORD':
194                # check on symmetry structure
195                if w.asymmetry(intrinsic=False) == []:
196                    ww = symmetrize(w)
197                    WW = np.array(ww.todense())
198                    evals = la.eigvalsh(WW)
199                    W = WW
200                else:
201                    W = w.full()[0]      # need dense here
202                    evals = la.eigvals(W)
203                res = minimize_scalar(
204                    err_c_loglik_ord, 0.0, bounds=(-1.0, 1.0),
205                    args=(self.n, self.y, ylag, self.x,
206                          xlag, evals), method='bounded',
207                    tol=epsilon)
208        else:
209            raise Exception("{0} is an unsupported method".format(method))
210
211        self.lam = res.x
212
213        # compute full log-likelihood, including constants
214        ln2pi = np.log(2.0 * np.pi)
215        llik = -res.fun - self.n / 2.0 * ln2pi - self.n / 2.0
216
217        self.logll = llik
218
219        # b, residuals and predicted values
220
221        ys = self.y - self.lam * ylag
222        xs = self.x - self.lam * xlag
223        xsxs = np.dot(xs.T, xs)
224        xsxsi = np.linalg.inv(xsxs)
225        xsys = np.dot(xs.T, ys)
226        b = np.dot(xsxsi, xsys)
227
228        self.betas = np.vstack((b, self.lam))
229
230        self.u = y - np.dot(self.x, b)
231        self.predy = self.y - self.u
232
233        # residual variance
234
235        self.e_filtered = self.u - self.lam * weights.lag_spatial(w, self.u)
236        self.sig2 = np.dot(self.e_filtered.T, self.e_filtered) / self.n
237
238        # variance-covariance matrix betas
239
240        varb = self.sig2 * xsxsi
241
242        # variance-covariance matrix lambda, sigma
243
244        a = -self.lam * W
245        spfill_diagonal(a, 1.0)
246        ai = spinv(a)
247        wai = spdot(W, ai)
248        tr1 = wai.diagonal().sum()
249
250        wai2 = spdot(wai, wai)
251        tr2 = wai2.diagonal().sum()
252
253        waiTwai = spdot(wai.T, wai)
254        tr3 = waiTwai.diagonal().sum()
255
256        v1 = np.vstack((tr2 + tr3,
257                        tr1 / self.sig2))
258        v2 = np.vstack((tr1 / self.sig2,
259                        self.n / (2.0 * self.sig2 ** 2)))
260
261        v = np.hstack((v1, v2))
262
263        self.vm1 = np.linalg.inv(v)
264
265        # create variance matrix for beta, lambda
266        vv = np.hstack((varb, np.zeros((self.k, 1))))
267        vv1 = np.hstack(
268            (np.zeros((1, self.k)), self.vm1[0, 0] * np.ones((1, 1))))
269
270        self.vm = np.vstack((vv, vv1))
271
272
273    def get_x_lag(self, w, regimes_att):
274        if regimes_att:
275            xlag = weights.lag_spatial(w, regimes_att['x'])
276            xlag = REGI.Regimes_Frame.__init__(self, xlag,
277                                               regimes_att['regimes'], constant_regi=None, cols2regi=regimes_att['cols2regi'])[0]
278            xlag = xlag.toarray()
279        else:
280            xlag = weights.lag_spatial(w, self.x)
281        return xlag
282
283
284class ML_Error(BaseML_Error):
285
286    """
287    ML estimation of the spatial error model with all results and diagnostics;
288    :cite:`Anselin1988`
289
290    Parameters
291    ----------
292    y            : array
293                   nx1 array for dependent variable
294    x            : array
295                   Two dimensional array with n rows and one column for each
296                   independent (exogenous) variable, excluding the constant
297    w            : Sparse matrix
298                   Spatial weights sparse matrix
299    method       : string
300                   if 'full', brute force calculation (full matrix expressions)
301                   if 'ord', Ord eigenvalue method
302                   if 'LU', LU sparse matrix decomposition
303    epsilon      : float
304                   tolerance criterion in mimimize_scalar function and inverse_product
305    vm           : boolean
306                   if True, include variance-covariance matrix in summary
307                   results
308    name_y       : string
309                   Name of dependent variable for use in output
310    name_x       : list of strings
311                   Names of independent variables for use in output
312    name_w       : string
313                   Name of weights matrix for use in output
314    name_ds      : string
315                   Name of dataset for use in output
316
317    Attributes
318    ----------
319    betas        : array
320                   (k+1)x1 array of estimated coefficients (rho first)
321    lam          : float
322                   estimate of spatial autoregressive coefficient
323    u            : array
324                   nx1 array of residuals
325    e_filtered   : array
326                   nx1 array of spatially filtered residuals
327    predy        : array
328                   nx1 array of predicted y values
329    n            : integer
330                   Number of observations
331    k            : integer
332                   Number of variables for which coefficients are estimated
333                   (including the constant, excluding lambda)
334    y            : array
335                   nx1 array for dependent variable
336    x            : array
337                   Two dimensional array with n rows and one column for each
338                   independent (exogenous) variable, including the constant
339    method       : string
340                   log Jacobian method
341                   if 'full': brute force (full matrix computations)
342    epsilon      : float
343                   tolerance criterion used in minimize_scalar function and inverse_product
344    mean_y       : float
345                   Mean of dependent variable
346    std_y        : float
347                   Standard deviation of dependent variable
348    varb         : array
349                   Variance covariance matrix (k+1 x k+1) - includes var(lambda)
350    vm1          : array
351                   variance covariance matrix for lambda, sigma (2 x 2)
352    sig2         : float
353                   Sigma squared used in computations
354    logll        : float
355                   maximized log-likelihood (including constant terms)
356    pr2          : float
357                   Pseudo R squared (squared correlation between y and ypred)
358    utu          : float
359                   Sum of squared residuals
360    std_err      : array
361                   1xk array of standard errors of the betas
362    z_stat       : list of tuples
363                   z statistic; each tuple contains the pair (statistic,
364                   p-value), where each is a float
365    name_y       : string
366                   Name of dependent variable for use in output
367    name_x       : list of strings
368                   Names of independent variables for use in output
369    name_w       : string
370                   Name of weights matrix for use in output
371    name_ds      : string
372                   Name of dataset for use in output
373    title        : string
374                   Name of the regression method used
375
376    Examples
377    --------
378
379    >>> import numpy as np
380    >>> import libpysal
381    >>> from libpysal.examples import load_example
382    >>> from libpysal.weights import Queen
383    >>> from spreg import ML_Error
384    >>> np.set_printoptions(suppress=True) #prevent scientific format
385    >>> south = load_example('South')
386    >>> db = libpysal.io.open(south.get_path("south.dbf"),'r')
387    >>> y_name = "HR90"
388    >>> y = np.array(db.by_col(y_name))
389    >>> y.shape = (len(y),1)
390    >>> x_names = ["RD90","PS90","UE90","DV90"]
391    >>> x = np.array([db.by_col(var) for var in x_names]).T
392    >>> w = Queen.from_shapefile(south.get_path("south.shp"))
393    >>> w_name = "south_q.gal"
394    >>> w.transform = 'r'
395    >>> mlerr = ML_Error(y,x,w,name_y=y_name,name_x=x_names,\
396               name_w=w_name,name_ds=ds_name) #doctest: +SKIP
397    >>> np.around(mlerr.betas, decimals=4) #doctest: +SKIP
398    array([[ 6.1492],
399           [ 4.4024],
400           [ 1.7784],
401           [-0.3781],
402           [ 0.4858],
403           [ 0.2991]])
404    >>> "{0:.4f}".format(mlerr.lam) #doctest: +SKIP
405    '0.2991'
406    >>> "{0:.4f}".format(mlerr.mean_y) #doctest: +SKIP
407    '9.5493'
408    >>> "{0:.4f}".format(mlerr.std_y) #doctest: +SKIP
409    '7.0389'
410    >>> np.around(np.diag(mlerr.vm), decimals=4) #doctest: +SKIP
411    array([ 1.0648,  0.0555,  0.0454,  0.0061,  0.0148,  0.0014])
412    >>> np.around(mlerr.sig2, decimals=4) #doctest: +SKIP
413    array([[ 32.4069]])
414    >>> "{0:.4f}".format(mlerr.logll) #doctest: +SKIP
415    '-4471.4071'
416    >>> "{0:.4f}".format(mlerr.aic) #doctest: +SKIP
417    '8952.8141'
418    >>> "{0:.4f}".format(mlerr.schwarz) #doctest: +SKIP
419    '8979.0779'
420    >>> "{0:.4f}".format(mlerr.pr2) #doctest: +SKIP
421    '0.3058'
422    >>> "{0:.4f}".format(mlerr.utu) #doctest: +SKIP
423    '48534.9148'
424    >>> np.around(mlerr.std_err, decimals=4) #doctest: +SKIP
425    array([ 1.0319,  0.2355,  0.2132,  0.0784,  0.1217,  0.0378])
426    >>> np.around(mlerr.z_stat, decimals=4) #doctest: +SKIP
427    array([[  5.9593,   0.    ],
428           [ 18.6902,   0.    ],
429           [  8.3422,   0.    ],
430           [ -4.8233,   0.    ],
431           [  3.9913,   0.0001],
432           [  7.9089,   0.    ]])
433    >>> mlerr.name_y #doctest: +SKIP
434    'HR90'
435    >>> mlerr.name_x #doctest: +SKIP
436    ['CONSTANT', 'RD90', 'PS90', 'UE90', 'DV90', 'lambda']
437    >>> mlerr.name_w #doctest: +SKIP
438    'south_q.gal'
439    >>> mlerr.name_ds #doctest: +SKIP
440    'south.dbf'
441    >>> mlerr.title #doctest: +SKIP
442    'MAXIMUM LIKELIHOOD SPATIAL ERROR (METHOD = FULL)'
443
444
445    """
446
447    def __init__(self, y, x, w, method='full', epsilon=0.0000001,
448                 vm=False, name_y=None, name_x=None,
449                 name_w=None, name_ds=None):
450        n = USER.check_arrays(y, x)
451        y = USER.check_y(y, n)
452        USER.check_weights(w, y, w_required=True)
453        x_constant,name_x,warn = USER.check_constant(x,name_x)
454        set_warn(self, warn)
455        method = method.upper()
456        BaseML_Error.__init__(self, y=y, x=x_constant,
457                              w=w, method=method, epsilon=epsilon)
458        self.title = "MAXIMUM LIKELIHOOD SPATIAL ERROR" + \
459            " (METHOD = " + method + ")"
460        self.name_ds = USER.set_name_ds(name_ds)
461        self.name_y = USER.set_name_y(name_y)
462        self.name_x = USER.set_name_x(name_x, x)
463        self.name_x.append('lambda')
464        self.name_w = USER.set_name_w(name_w, w)
465        self.aic = DIAG.akaike(reg=self)
466        self.schwarz = DIAG.schwarz(reg=self)
467        SUMMARY.ML_Error(reg=self, w=w, vm=vm, spat_diag=False)
468
469
470def err_c_loglik(lam, n, y, ylag, x, xlag, W):
471    # concentrated log-lik for error model, no constants, brute force
472    ys = y - lam * ylag
473    xs = x - lam * xlag
474    ysys = np.dot(ys.T, ys)
475    xsxs = np.dot(xs.T, xs)
476    xsxsi = np.linalg.inv(xsxs)
477    xsys = np.dot(xs.T, ys)
478    x1 = np.dot(xsxsi, xsys)
479    x2 = np.dot(xsys.T, x1)
480    ee = ysys - x2
481    sig2 = ee[0][0] / n
482    nlsig2 = (n / 2.0) * np.log(sig2)
483    a = -lam * W
484    np.fill_diagonal(a, 1.0)
485    jacob = np.log(np.linalg.det(a))
486    # this is the negative of the concentrated log lik for minimization
487    clik = nlsig2 - jacob
488    return clik
489
490def err_c_loglik_sp(lam, n, y, ylag, x, xlag, I, Wsp):
491    # concentrated log-lik for error model, no constants, LU
492    if isinstance(lam, np.ndarray):
493        if lam.shape == (1,1):
494            lam = lam[0][0] #why does the interior value change?
495    ys = y - lam * ylag
496    xs = x - lam * xlag
497    ysys = np.dot(ys.T, ys)
498    xsxs = np.dot(xs.T, xs)
499    xsxsi = np.linalg.inv(xsxs)
500    xsys = np.dot(xs.T, ys)
501    x1 = np.dot(xsxsi, xsys)
502    x2 = np.dot(xsys.T, x1)
503    ee = ysys - x2
504    sig2 = ee[0][0] / n
505    nlsig2 = (n / 2.0) * np.log(sig2)
506    a = I - lam * Wsp
507    LU = SuperLU(a.tocsc())
508    jacob = np.sum(np.log(np.abs(LU.U.diagonal())))
509    # this is the negative of the concentrated log lik for minimization
510    clik = nlsig2 - jacob
511    return clik
512
513
514def err_c_loglik_ord(lam, n, y, ylag, x, xlag, evals):
515    # concentrated log-lik for error model, no constants, eigenvalues
516    ys = y - lam * ylag
517    xs = x - lam * xlag
518    ysys = np.dot(ys.T, ys)
519    xsxs = np.dot(xs.T, xs)
520    xsxsi = np.linalg.inv(xsxs)
521    xsys = np.dot(xs.T, ys)
522    x1 = np.dot(xsxsi, xsys)
523    x2 = np.dot(xsys.T, x1)
524    ee = ysys - x2
525    sig2 = ee[0][0] / n
526    nlsig2 = (n / 2.0) * np.log(sig2)
527    revals = lam * evals
528    jacob = np.log(1 - revals).sum()
529    if isinstance(jacob, complex):
530        jacob = jacob.real
531    # this is the negative of the concentrated log lik for minimization
532    clik = nlsig2 - jacob
533    return clik
534
535
536def _test():
537    import doctest
538    start_suppress = np.get_printoptions()['suppress']
539    np.set_printoptions(suppress=True)
540    doctest.testmod()
541    np.set_printoptions(suppress=start_suppress)
542