1"""Probit regression class and diagnostics."""
2
3__author__ = "Luc Anselin luc.anselin@asu.edu, Pedro V. Amaral pedro.amaral@asu.edu"
4
5import numpy as np
6import numpy.linalg as la
7import scipy.optimize as op
8from scipy.stats import norm, chi2
chisqprob = chi2.sf  # replacement for the removed scipy.stats.chisqprob
10import scipy.sparse as SP
11from . import user_output as USER
12from . import summary_output as SUMMARY
13from .utils import spdot, spbroadcast, set_warn
14
15__all__ = ["Probit"]
16
17
18class BaseProbit(object):
19
20    """
21    Probit class to do all the computations
22
23    Parameters
24    ----------
25
26    x           : array
27                  nxk array of independent variables (assumed to be aligned with y)
28    y           : array
29                  nx1 array of dependent binary variable
30    w           : W
31                  PySAL weights instance or spatial weights sparse matrix
32                  aligned with y
33    optim       : string
34                  Optimization method.
35                  Default: 'newton' (Newton-Raphson).
36                  Alternatives: 'ncg' (Newton-CG), 'bfgs' (BFGS algorithm)
37    scalem      : string
38                  Method to calculate the scale of the marginal effects.
39                  Default: 'phimean' (Mean of individual marginal effects)
40                  Alternative: 'xmean' (Marginal effects at variables mean)
41    maxiter     : int
42                  Maximum number of iterations until optimizer stops
43
44    Attributes
45    ----------
46
47    x           : array
48                  Two dimensional array with n rows and one column for each
49                  independent (exogenous) variable, including the constant
50    y           : array
51                  nx1 array of dependent variable
52    betas       : array
53                  kx1 array with estimated coefficients
54    predy       : array
55                  nx1 array of predicted y values
56    n           : int
57                  Number of observations
58    k           : int
59                  Number of variables
60    vm          : array
61                  Variance-covariance matrix (kxk)
62    z_stat      : list of tuples
63                  z statistic; each tuple contains the pair (statistic,
64                  p-value), where each is a float
65    xmean       : array
66                  Mean of the independent variables (kx1)
67    predpc      : float
68                  Percent of y correctly predicted
69    logl        : float
                  Log-Likelihood of the estimation
71    scalem      : string
72                  Method to calculate the scale of the marginal effects.
73    scale       : float
74                  Scale of the marginal effects.
    slopes      : array
                  Marginal effects of the independent variables ((k-1)x1)
                  Note: Disregards the presence of dummies.
    slopes_vm   : array
                  Variance-covariance matrix of the slopes ((k-1)x(k-1))
    LR          : tuple
                  Likelihood Ratio test of all coefficients = 0
                  (test statistic, p-value)
83    Pinkse_error: float
84                  Lagrange Multiplier test against spatial error correlation.
85                  Implemented as presented in :cite:`Pinkse2004`.
86    KP_error    : float
87                  Moran's I type test against spatial error correlation.
88                  Implemented as presented in  :cite:`Kelejian2001`.
89    PS_error    : float
90                  Lagrange Multiplier test against spatial error correlation.
91                  Implemented as presented in :cite:`Pinkse1998`.
    warning     : boolean
                  if True, the maximum number of iterations was exceeded or
                  the gradient and/or function calls did not change.
95
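    Notes
    -----
    As a sketch of the two ``scalem`` options (not a doctest, and assuming
    ``x`` already includes the constant term), the scale of the marginal
    effects is computed roughly as::

        scale = norm.pdf(x @ betas).mean()        # 'phimean' (default)
        scale = norm.pdf(x.mean(axis=0) @ betas)  # 'xmean'
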
96    Examples
97    --------
98    >>> import numpy as np
99    >>> import libpysal
100    >>> import spreg
101    >>> np.set_printoptions(suppress=True) #prevent scientific format
102    >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
103    >>> y = np.array([dbf.by_col('CRIME')]).T
104    >>> x = np.array([dbf.by_col('INC'), dbf.by_col('HOVAL')]).T
105    >>> x = np.hstack((np.ones(y.shape),x))
106    >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
107    >>> w.transform='r'
108    >>> model = spreg.probit.BaseProbit((y>40).astype(float), x, w=w)
109    >>> print(np.around(model.betas, decimals=6))
110    [[ 3.353811]
111     [-0.199653]
112     [-0.029514]]
113
114    >>> print(np.around(model.vm, decimals=6))
115    [[ 0.852814 -0.043627 -0.008052]
116     [-0.043627  0.004114 -0.000193]
117     [-0.008052 -0.000193  0.00031 ]]
118
119    >>> tests = np.array([['Pinkse_error','KP_error','PS_error']])
120    >>> stats = np.array([[model.Pinkse_error[0],model.KP_error[0],model.PS_error[0]]])
121    >>> pvalue = np.array([[model.Pinkse_error[1],model.KP_error[1],model.PS_error[1]]])
122    >>> print(np.hstack((tests.T,np.around(np.hstack((stats.T,pvalue.T)),6))))
123    [['Pinkse_error' '3.131719' '0.076783']
124     ['KP_error' '1.721312' '0.085194']
125     ['PS_error' '2.558166' '0.109726']]
126    """
127
128    def __init__(self, y, x, w=None, optim='newton', scalem='phimean', maxiter=100):
129        self.y = y
130        self.x = x
131        self.n, self.k = x.shape
132        self.optim = optim
133        self.scalem = scalem
134        self.w = w
135        self.maxiter = maxiter
136        par_est, self.warning = self.par_est()
137        self.betas = np.reshape(par_est[0], (self.k, 1))
138        self.logl = -float(par_est[1])
139
140    @property
141    def vm(self):
142        try:
143            return self._cache['vm']
144        except AttributeError:
145            self._cache = {}
146            H = self.hessian(self.betas)
147            self._cache['vm'] = -la.inv(H)
148        except KeyError:
149            H = self.hessian(self.betas)
150            self._cache['vm'] = -la.inv(H)
151        return self._cache['vm']
152
153    @vm.setter
154    def vm(self, val):
155        try:
156            self._cache['vm'] = val
157        except AttributeError:
158            self._cache = {}
159        self._cache['vm'] = val
160
    # Note: the try/except caching pattern used by the properties below is
    # repeated for each derived attribute; AttributeError means the _cache
    # dict does not exist yet, KeyError that this entry is not computed yet.
    @property
162    def z_stat(self):
        try:
            return self._cache['z_stat']
        except AttributeError:
            self._cache = {}
            variance = self.vm.diagonal()
            zStat = self.betas.reshape(len(self.betas),) / np.sqrt(variance)
            self._cache['z_stat'] = [(z, norm.sf(abs(z)) * 2) for z in zStat]
        except KeyError:
            variance = self.vm.diagonal()
            zStat = self.betas.reshape(len(self.betas),) / np.sqrt(variance)
            self._cache['z_stat'] = [(z, norm.sf(abs(z)) * 2) for z in zStat]
        return self._cache['z_stat']
181
182    @z_stat.setter
183    def z_stat(self, val):
184        try:
185            self._cache['z_stat'] = val
186        except AttributeError:
187            self._cache = {}
188        self._cache['z_stat'] = val
189
190    @property
191    def slopes_std_err(self):
192        try:
193            return self._cache['slopes_std_err']
194        except AttributeError:
195            self._cache = {}
196            self._cache['slopes_std_err'] = np.sqrt(self.slopes_vm.diagonal())
197        except KeyError:
198            self._cache['slopes_std_err'] = np.sqrt(self.slopes_vm.diagonal())
199        return self._cache['slopes_std_err']
200
201    @slopes_std_err.setter
202    def slopes_std_err(self, val):
203        try:
204            self._cache['slopes_std_err'] = val
205        except AttributeError:
206            self._cache = {}
207        self._cache['slopes_std_err'] = val
208
209    @property
210    def slopes_z_stat(self):
        try:
            return self._cache['slopes_z_stat']
        except AttributeError:
            self._cache = {}
            zStat = self.slopes.reshape(len(self.slopes),) / self.slopes_std_err
            self._cache['slopes_z_stat'] = [
                (z, norm.sf(abs(z)) * 2) for z in zStat]
        except KeyError:
            zStat = self.slopes.reshape(len(self.slopes),) / self.slopes_std_err
            self._cache['slopes_z_stat'] = [
                (z, norm.sf(abs(z)) * 2) for z in zStat]
        return self._cache['slopes_z_stat']
224
225    @slopes_z_stat.setter
226    def slopes_z_stat(self, val):
227        try:
228            self._cache['slopes_z_stat'] = val
229        except AttributeError:
230            self._cache = {}
231        self._cache['slopes_z_stat'] = val
232
233    @property
234    def xmean(self):
235        try:
236            return self._cache['xmean']
237        except AttributeError:
238            self._cache = {}
            # x may be a scipy sparse matrix, in which case the summed
            # result must be densified with .toarray() before reshaping
            try:
                self._cache['xmean'] = np.reshape(sum(self.x) / self.n, (self.k, 1))
            except Exception:
                self._cache['xmean'] = np.reshape(sum(self.x).toarray() / self.n, (self.k, 1))
        except KeyError:
            try:
                self._cache['xmean'] = np.reshape(sum(self.x) / self.n, (self.k, 1))
            except Exception:
                self._cache['xmean'] = np.reshape(sum(self.x).toarray() / self.n, (self.k, 1))
248        return self._cache['xmean']
249
250    @xmean.setter
251    def xmean(self, val):
252        try:
253            self._cache['xmean'] = val
254        except AttributeError:
255            self._cache = {}
256        self._cache['xmean'] = val
257
258    @property
259    def xb(self):
260        try:
261            return self._cache['xb']
262        except AttributeError:
263            self._cache = {}
264            self._cache['xb'] = spdot(self.x, self.betas)
265        except KeyError:
266            self._cache['xb'] = spdot(self.x, self.betas)
267        return self._cache['xb']
268
269    @xb.setter
270    def xb(self, val):
271        try:
272            self._cache['xb'] = val
273        except AttributeError:
274            self._cache = {}
275        self._cache['xb'] = val
276
277    @property
278    def predy(self):
279        try:
280            return self._cache['predy']
281        except AttributeError:
282            self._cache = {}
283            self._cache['predy'] = norm.cdf(self.xb)
284        except KeyError:
285            self._cache['predy'] = norm.cdf(self.xb)
286        return self._cache['predy']
287
288    @predy.setter
289    def predy(self, val):
290        try:
291            self._cache['predy'] = val
292        except AttributeError:
293            self._cache = {}
294        self._cache['predy'] = val
295
296    @property
297    def predpc(self):
298        try:
299            return self._cache['predpc']
300        except AttributeError:
301            self._cache = {}
            # count a prediction as correct when |y - predy| <= 0.5
            correct = (np.abs(self.y - self.predy) <= 0.5)
            self._cache['predpc'] = float(100.0 * np.sum(correct) / self.n)
        except KeyError:
            correct = (np.abs(self.y - self.predy) <= 0.5)
            self._cache['predpc'] = float(100.0 * np.sum(correct) / self.n)
317        return self._cache['predpc']
318
319    @predpc.setter
320    def predpc(self, val):
321        try:
322            self._cache['predpc'] = val
323        except AttributeError:
324            self._cache = {}
325        self._cache['predpc'] = val
326
327    @property
328    def phiy(self):
329        try:
330            return self._cache['phiy']
331        except AttributeError:
332            self._cache = {}
333            self._cache['phiy'] = norm.pdf(self.xb)
334        except KeyError:
335            self._cache['phiy'] = norm.pdf(self.xb)
336        return self._cache['phiy']
337
338    @phiy.setter
339    def phiy(self, val):
340        try:
341            self._cache['phiy'] = val
342        except AttributeError:
343            self._cache = {}
344        self._cache['phiy'] = val
345
346    @property
347    def scale(self):
348        try:
349            return self._cache['scale']
350        except AttributeError:
351            self._cache = {}
352            if self.scalem == 'phimean':
353                self._cache['scale'] = float(1.0 * np.sum(self.phiy) / self.n)
354            elif self.scalem == 'xmean':
355                self._cache['scale'] = float(norm.pdf(np.dot(self.xmean.T, self.betas)))
        except KeyError:
            if self.scalem == 'phimean':
                self._cache['scale'] = float(1.0 * np.sum(self.phiy) / self.n)
            elif self.scalem == 'xmean':
                self._cache['scale'] = float(norm.pdf(np.dot(self.xmean.T, self.betas)))
361        return self._cache['scale']
362
363    @scale.setter
364    def scale(self, val):
365        try:
366            self._cache['scale'] = val
367        except AttributeError:
368            self._cache = {}
369        self._cache['scale'] = val
370
371    @property
372    def slopes(self):
373        try:
374            return self._cache['slopes']
375        except AttributeError:
376            self._cache = {}
377            self._cache['slopes'] = self.betas[1:] * self.scale
378        except KeyError:
379            self._cache['slopes'] = self.betas[1:] * self.scale
380        return self._cache['slopes']
381
382    @slopes.setter
383    def slopes(self, val):
384        try:
385            self._cache['slopes'] = val
386        except AttributeError:
387            self._cache = {}
388        self._cache['slopes'] = val
389
390    @property
391    def slopes_vm(self):
392        try:
393            return self._cache['slopes_vm']
394        except AttributeError:
395            self._cache = {}
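            # delta method: the Jacobian of the marginal effects
            # scale * beta w.r.t. beta, evaluated at the regressors' mean
            # xmean, is scale * (I - (xmean'beta) * beta xmean')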
396            x = self.xmean
397            b = self.betas
398            dfdb = np.eye(self.k) - spdot(b.T, x) * spdot(b, x.T)
399            slopes_vm = (self.scale ** 2) * \
400                np.dot(np.dot(dfdb, self.vm), dfdb.T)
401            self._cache['slopes_vm'] = slopes_vm[1:, 1:]
402        except KeyError:
403            x = self.xmean
404            b = self.betas
405            dfdb = np.eye(self.k) - spdot(b.T, x) * spdot(b, x.T)
406            slopes_vm = (self.scale ** 2) * \
407                np.dot(np.dot(dfdb, self.vm), dfdb.T)
408            self._cache['slopes_vm'] = slopes_vm[1:, 1:]
409        return self._cache['slopes_vm']
410
411    @slopes_vm.setter
412    def slopes_vm(self, val):
413        try:
414            self._cache['slopes_vm'] = val
415        except AttributeError:
416            self._cache = {}
417        self._cache['slopes_vm'] = val
418
419    @property
420    def LR(self):
421        try:
422            return self._cache['LR']
423        except AttributeError:
424            self._cache = {}
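            # the restricted (constant-only) model has log-likelihood
            # n * (P*log(P) + (1-P)*log(1-P)), with P the sample share of 1s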
425            P = 1.0 * np.sum(self.y) / self.n
426            LR = float(
427                -2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P)) - self.logl))
428            self._cache['LR'] = (LR, chisqprob(LR, self.k))
429        except KeyError:
430            P = 1.0 * np.sum(self.y) / self.n
431            LR = float(
432                -2 * (self.n * (P * np.log(P) + (1 - P) * np.log(1 - P)) - self.logl))
433            self._cache['LR'] = (LR, chisqprob(LR, self.k))
434        return self._cache['LR']
435
436    @LR.setter
437    def LR(self, val):
438        try:
439            self._cache['LR'] = val
440        except AttributeError:
441            self._cache = {}
442        self._cache['LR'] = val
443
444    @property
445    def u_naive(self):
446        try:
447            return self._cache['u_naive']
448        except AttributeError:
449            self._cache = {}
450            self._cache['u_naive'] = self.y - self.predy
451        except KeyError:
452            u_naive = self.y - self.predy
453            self._cache['u_naive'] = u_naive
454        return self._cache['u_naive']
455
456    @u_naive.setter
457    def u_naive(self, val):
458        try:
459            self._cache['u_naive'] = val
460        except AttributeError:
461            self._cache = {}
462        self._cache['u_naive'] = val
463
464    @property
465    def u_gen(self):
466        try:
467            return self._cache['u_gen']
468        except AttributeError:
469            self._cache = {}
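            # generalized probit residuals:
            # phi(xb) * (y - Phi(xb)) / (Phi(xb) * (1 - Phi(xb)))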
470            Phi_prod = self.predy * (1 - self.predy)
471            u_gen = self.phiy * (self.u_naive / Phi_prod)
472            self._cache['u_gen'] = u_gen
473        except KeyError:
474            Phi_prod = self.predy * (1 - self.predy)
475            u_gen = self.phiy * (self.u_naive / Phi_prod)
476            self._cache['u_gen'] = u_gen
477        return self._cache['u_gen']
478
479    @u_gen.setter
480    def u_gen(self, val):
481        try:
482            self._cache['u_gen'] = val
483        except AttributeError:
484            self._cache = {}
485        self._cache['u_gen'] = val
486
487    @property
488    def Pinkse_error(self):
489        try:
490            return self._cache['Pinkse_error']
491        except AttributeError:
492            self._cache = {}
493            self._cache['Pinkse_error'], self._cache[
494                'KP_error'], self._cache['PS_error'] = sp_tests(self)
495        except KeyError:
496            self._cache['Pinkse_error'], self._cache[
497                'KP_error'], self._cache['PS_error'] = sp_tests(self)
498        return self._cache['Pinkse_error']
499
500    @Pinkse_error.setter
501    def Pinkse_error(self, val):
502        try:
503            self._cache['Pinkse_error'] = val
504        except AttributeError:
505            self._cache = {}
506        self._cache['Pinkse_error'] = val
507
508    @property
509    def KP_error(self):
510        try:
511            return self._cache['KP_error']
512        except AttributeError:
513            self._cache = {}
514            self._cache['Pinkse_error'], self._cache[
515                'KP_error'], self._cache['PS_error'] = sp_tests(self)
516        except KeyError:
517            self._cache['Pinkse_error'], self._cache[
518                'KP_error'], self._cache['PS_error'] = sp_tests(self)
519        return self._cache['KP_error']
520
521    @KP_error.setter
522    def KP_error(self, val):
523        try:
524            self._cache['KP_error'] = val
525        except AttributeError:
526            self._cache = {}
527        self._cache['KP_error'] = val
528
529    @property
530    def PS_error(self):
531        try:
532            return self._cache['PS_error']
533        except AttributeError:
534            self._cache = {}
535            self._cache['Pinkse_error'], self._cache[
536                'KP_error'], self._cache['PS_error'] = sp_tests(self)
537        except KeyError:
538            self._cache['Pinkse_error'], self._cache[
539                'KP_error'], self._cache['PS_error'] = sp_tests(self)
540        return self._cache['PS_error']
541
542    @PS_error.setter
543    def PS_error(self, val):
544        try:
545            self._cache['PS_error'] = val
546        except AttributeError:
547            self._cache = {}
548        self._cache['PS_error'] = val
549
550    def par_est(self):
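        """
        Maximizes the log-likelihood, using the OLS coefficients as
        starting values and dispatching on ``self.optim`` to Newton-Raphson
        ('newton'), BFGS ('bfgs') or Newton-CG ('ncg'). Returns the
        optimizer output together with a boolean convergence warning.
        """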
551        start = np.dot(la.inv(spdot(self.x.T, self.x)),
552                       spdot(self.x.T, self.y))
553        flogl = lambda par: -self.ll(par)
554        if self.optim == 'newton':
555            fgrad = lambda par: self.gradient(par)
556            fhess = lambda par: self.hessian(par)
557            par_hat = newton(flogl, start, fgrad, fhess, self.maxiter)
558            warn = par_hat[2]
559        else:
560            fgrad = lambda par: -self.gradient(par)
561            if self.optim == 'bfgs':
562                par_hat = op.fmin_bfgs(
563                    flogl, start, fgrad, full_output=1, disp=0)
564                warn = par_hat[6]
565            if self.optim == 'ncg':
566                fhess = lambda par: -self.hessian(par)
567                par_hat = op.fmin_ncg(
568                    flogl, start, fgrad, fhess=fhess, full_output=1, disp=0)
569                warn = par_hat[5]
        warn = bool(warn > 0)
574        return par_hat, warn
575
576    def ll(self, par):
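        """
        Log-likelihood at ``par``: sum_i log(Phi(q_i * x_i'beta)), where
        q = 2y - 1 and Phi is the standard normal CDF.
        """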
577        beta = np.reshape(np.array(par), (self.k, 1))
578        q = 2 * self.y - 1
579        qxb = q * spdot(self.x, beta)
        ll = np.sum(np.log(norm.cdf(qxb)))
581        return ll
582
583    def gradient(self, par):
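        """
        Gradient (score) of the log-likelihood at ``par``: sum_i
        lambda_i * x_i, with
        lambda_i = q_i * phi(q_i x_i'beta) / Phi(q_i x_i'beta).
        """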
584        beta = np.reshape(np.array(par), (self.k, 1))
585        q = 2 * self.y - 1
586        qxb = q * spdot(self.x, beta)
587        lamb = q * norm.pdf(qxb) / norm.cdf(qxb)
588        gradient = spdot(lamb.T, self.x)[0]
589        return gradient
590
591    def hessian(self, par):
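        """
        Analytical hessian of the log-likelihood at ``par``:
        -X' diag(lambda_i * (lambda_i + x_i'beta)) X.
        """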
592        beta = np.reshape(np.array(par), (self.k, 1))
593        q = 2 * self.y - 1
594        xb = spdot(self.x, beta)
595        qxb = q * xb
596        lamb = q * norm.pdf(qxb) / norm.cdf(qxb)
597        hessian = spdot(self.x.T, spbroadcast(self.x,-lamb * (lamb + xb)))
598        return hessian
599
600
601class Probit(BaseProbit):
602
603    """
    Classic non-spatial Probit and spatial diagnostics. The class includes
    a summary printout that neatly formats all results and tests.
606
607    The diagnostics for spatial dependence currently implemented are:
608
609    * Pinkse Error :cite:`Pinkse2004`
610
611    * Kelejian and Prucha Moran's I :cite:`Kelejian2001`
612
613    * Pinkse & Slade Error :cite:`Pinkse1998`
614
615    Parameters
616    ----------
617
618    x           : array
619                  nxk array of independent variables (assumed to be aligned with y)
620    y           : array
621                  nx1 array of dependent binary variable
622    w           : W
623                  PySAL weights instance aligned with y
624    optim       : string
625                  Optimization method.
626                  Default: 'newton' (Newton-Raphson).
627                  Alternatives: 'ncg' (Newton-CG), 'bfgs' (BFGS algorithm)
628    scalem      : string
629                  Method to calculate the scale of the marginal effects.
630                  Default: 'phimean' (Mean of individual marginal effects)
631                  Alternative: 'xmean' (Marginal effects at variables mean)
632    maxiter     : int
633                  Maximum number of iterations until optimizer stops
634    name_y       : string
635                   Name of dependent variable for use in output
636    name_x       : list of strings
637                   Names of independent variables for use in output
638    name_w       : string
639                   Name of weights matrix for use in output
640    name_ds      : string
641                   Name of dataset for use in output
642
643    Attributes
644    ----------
645
646    x           : array
647                  Two dimensional array with n rows and one column for each
648                  independent (exogenous) variable, including the constant
649    y           : array
650                  nx1 array of dependent variable
651    betas       : array
652                  kx1 array with estimated coefficients
653    predy       : array
654                  nx1 array of predicted y values
655    n           : int
656                  Number of observations
657    k           : int
658                  Number of variables
659    vm          : array
660                  Variance-covariance matrix (kxk)
661    z_stat      : list of tuples
662                  z statistic; each tuple contains the pair (statistic,
663                  p-value), where each is a float
664    xmean       : array
665                  Mean of the independent variables (kx1)
666    predpc      : float
667                  Percent of y correctly predicted
668    logl        : float
                  Log-Likelihood of the estimation
670    scalem      : string
671                  Method to calculate the scale of the marginal effects.
672    scale       : float
673                  Scale of the marginal effects.
    slopes      : array
                  Marginal effects of the independent variables ((k-1)x1)
    slopes_vm   : array
                  Variance-covariance matrix of the slopes ((k-1)x(k-1))
    LR          : tuple
                  Likelihood Ratio test of all coefficients = 0
                  (test statistic, p-value)
681    Pinkse_error: float
682                  Lagrange Multiplier test against spatial error correlation.
683                  Implemented as presented in :cite:`Pinkse2004`
684    KP_error    : float
685                  Moran's I type test against spatial error correlation.
686                  Implemented as presented in :cite:`Kelejian2001`
687    PS_error    : float
688                  Lagrange Multiplier test against spatial error correlation.
689                  Implemented as presented in :cite:`Pinkse1998`
    warning     : boolean
                  if True, the maximum number of iterations was exceeded or
                  the gradient and/or function calls did not change.
693    name_y       : string
694                   Name of dependent variable for use in output
695    name_x       : list of strings
696                   Names of independent variables for use in output
697    name_w       : string
698                   Name of weights matrix for use in output
699    name_ds      : string
700                   Name of dataset for use in output
701    title        : string
702                   Name of the regression method used
703
704    Examples
705    --------
706
    We first need to import the needed modules, namely numpy to convert the
    data we read into arrays that ``spreg`` understands, and ``libpysal`` to
    load the example data and spatial weights.
710
711    >>> import numpy as np
712    >>> import libpysal
713    >>> np.set_printoptions(suppress=True) #prevent scientific format
714
715    Open data on Columbus neighborhood crime (49 areas) using libpysal.io.open().
716    This is the DBF associated with the Columbus shapefile.  Note that
717    libpysal.io.open() also reads data in CSV format; since the actual class
718    requires data to be passed in as numpy arrays, the user can read their
719    data in using any method.
720
721    >>> dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'),'r')
722
723    Extract the CRIME column (crime) from the DBF file and make it the
    dependent variable for the regression. Note that spreg requires this to be
    a numpy array of shape (n, 1) as opposed to the also common shape of (n, )
726    that other packages accept. Since we want to run a probit model and for this
727    example we use the Columbus data, we also need to transform the continuous
728    CRIME variable into a binary variable. As in :cite:`McMillen1992`, we define
729    y = 1 if CRIME > 40.
730
731    >>> y = np.array([dbf.by_col('CRIME')]).T
732    >>> y = (y>40).astype(float)
733
734    Extract HOVAL (home values) and INC (income) vectors from the DBF to be used as
    independent variables in the regression. Note that spreg requires this to
736    be an nxj numpy array, where j is the number of independent variables (not
737    including a constant). By default this class adds a vector of ones to the
738    independent variables passed in.
739
740    >>> names_to_extract = ['INC', 'HOVAL']
741    >>> x = np.array([dbf.by_col(name) for name in names_to_extract]).T
742
    Since we want to test the probit model for spatial dependence, we need to
    specify the spatial weights matrix that incorporates the spatial configuration
    of the observations into the error component of the model. To do that, we can open
746    an already existing gal file or create a new one. In this case, we will use
747    ``columbus.gal``, which contains contiguity relationships between the
748    observations in the Columbus dataset we are using throughout this example.
    Note that, in order to actually read the file and not just open it, we
    need to append '.read()' at the end of the command.
751
752    >>> w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
753
    Unless there is a good reason not to, the weights should be
    row-standardized so that every row of the matrix sums to one. In
    libpysal, this can easily be done in the following way:
757
758    >>> w.transform='r'
759
    With the preliminaries out of the way, we can run the model. In this
    case, we will need the variables and the weights matrix. If we want the
    names of the variables printed in the output summary, we have to pass
    them in as well, although this is optional.
764
765    >>> from spreg import Probit
766    >>> model = Probit(y, x, w=w, name_y='crime', name_x=['income','home value'], name_ds='columbus', name_w='columbus.gal')
767
    Once we have run the model, we can explore the output a little. The
    regression object we have created has many attributes, so take your time
    to discover them.
771
772    >>> np.around(model.betas, decimals=6)
773    array([[ 3.353811],
774           [-0.199653],
775           [-0.029514]])
776
777    >>> np.around(model.vm, decimals=6)
778    array([[ 0.852814, -0.043627, -0.008052],
779           [-0.043627,  0.004114, -0.000193],
780           [-0.008052, -0.000193,  0.00031 ]])
781
    Since we have provided a spatial weights matrix, the diagnostics for
783    spatial dependence have also been computed. We can access them and their
784    p-values individually:
785
786    >>> tests = np.array([['Pinkse_error','KP_error','PS_error']])
787    >>> stats = np.array([[model.Pinkse_error[0],model.KP_error[0],model.PS_error[0]]])
788    >>> pvalue = np.array([[model.Pinkse_error[1],model.KP_error[1],model.PS_error[1]]])
789    >>> print(np.hstack((tests.T,np.around(np.hstack((stats.T,pvalue.T)),6))))
790    [['Pinkse_error' '3.131719' '0.076783']
791     ['KP_error' '1.721312' '0.085194']
792     ['PS_error' '2.558166' '0.109726']]
793
    Or we can easily obtain a full summary of all the results nicely formatted
    and ready to be printed simply by typing 'print(model.summary)'
796
797    """
798
799    def __init__(
800        self, y, x, w=None, optim='newton', scalem='phimean', maxiter=100,
801        vm=False, name_y=None, name_x=None, name_w=None, name_ds=None,
802            spat_diag=False):
803
804        n = USER.check_arrays(y, x)
805        y = USER.check_y(y, n)
        if w is not None:
807            USER.check_weights(w, y)
808            spat_diag = True
809            ws = w.sparse
810        else:
811            ws = None
        x_constant, name_x, warn = USER.check_constant(x, name_x)
813        set_warn(self, warn)
814
815        BaseProbit.__init__(self, y=y, x=x_constant, w=ws,
816                            optim=optim, scalem=scalem, maxiter=maxiter)
817        self.title = "CLASSIC PROBIT ESTIMATOR"
818        self.name_ds = USER.set_name_ds(name_ds)
819        self.name_y = USER.set_name_y(name_y)
820        self.name_x = USER.set_name_x(name_x, x)
821        self.name_w = USER.set_name_w(name_w, w)
822        SUMMARY.Probit(reg=self, w=w, vm=vm, spat_diag=spat_diag)
823
824
825def newton(flogl, start, fgrad, fhess, maxiter):
826    """
    Estimates parameters by Newton-Raphson iteration. Note that ``flogl``
    is the negated objective, used only for the reported function value,
    while ``fgrad`` and ``fhess`` are the gradient and hessian of the
    objective being maximized.
828
829    Parameters
830    ----------
831
832    flogl       : lambda
833                  Function to calculate the log-likelihood
834    start       : array
835                  kx1 array of starting values
836    fgrad       : lambda
837                  Function to calculate the gradient
838    fhess       : lambda
839                  Function to calculate the hessian
840    maxiter     : int
841                  Maximum number of iterations until optimizer stops
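
    Returns
    -------
    tuple
        (par_hat, logl, warn): the final parameter array, the value of
        ``flogl`` at ``par_hat``, and a 0/1 flag set to 1 if ``maxiter``
        was reached before convergence.

    As a minimal sketch (not a doctest), maximizing f(b) = -(b - 2)**2 by
    passing -f as ``flogl`` together with the gradient and hessian of f::

        flogl = lambda b: (b - 2) ** 2
        fgrad = lambda b: -2.0 * (b - 2)
        fhess = lambda b: np.array([[-2.0]])
        par, val, warn = newton(flogl, np.array([[0.0]]), fgrad, fhess, 50)
        # par is approximately array([[2.]]) and warn is 0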
842    """
843    warn = 0
844    iteration = 0
845    par_hat0 = start
846    m = 1
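    # iterate until the quadratic form m = -g' fhess(par)^{-1} g (which is
    # nonnegative for a concave objective) falls below 1e-4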
847    while (iteration < maxiter and m >= 1e-04):
848        H = -la.inv(fhess(par_hat0))
849        g = fgrad(par_hat0).reshape(start.shape)
850        Hg = np.dot(H, g)
851        par_hat0 = par_hat0 + Hg
852        iteration += 1
853        m = np.dot(g.T, Hg)
854    if iteration == maxiter:
855        warn = 1
856    logl = flogl(par_hat0)
857    return (par_hat0, logl, warn)
858
859
860def sp_tests(reg):
861    """
862    Calculates tests for spatial dependence in Probit models
863
864    Parameters
865    ----------
    reg         : regression object
                  output instance from a probit model

    Returns
    -------
    LM_err, moran, ps : arrays
                  2-element arrays holding (test statistic, p-value) for
                  the Pinkse, Kelejian-Prucha and Pinkse-Slade tests,
                  respectively
    """
    if reg.w is not None:
        try:
            w = reg.w.sparse
        except AttributeError:
            # reg.w is already a sparse matrix
            w = reg.w
874        Phi = reg.predy
875        phi = reg.phiy
876        # Pinkse_error:
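        # LM statistic: (u_gen' W u_gen)^2 / (sig2^2 * tr(WW + WW'))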
877        Phi_prod = Phi * (1 - Phi)
878        u_naive = reg.u_naive
879        u_gen = reg.u_gen
880        sig2 = np.sum((phi * phi) / Phi_prod) / reg.n
881        LM_err_num = np.dot(u_gen.T, (w * u_gen)) ** 2
882        trWW = np.sum((w * w).diagonal())
883        trWWWWp = trWW + np.sum((w * w.T).diagonal())
884        LM_err = float(1.0 * LM_err_num / (sig2 ** 2 * trWWWWp))
885        LM_err = np.array([LM_err, chisqprob(LM_err, 1)])
886        # KP_error:
887        moran = moran_KP(reg.w, u_naive, Phi_prod)
888        # Pinkse-Slade_error:
889        u_std = u_naive / np.sqrt(Phi_prod)
890        ps_num = np.dot(u_std.T, (w * u_std)) ** 2
891        trWpW = np.sum((w.T * w).diagonal())
892        ps = float(ps_num / (trWW + trWpW))
893        # chi-square instead of bootstrap.
894        ps = np.array([ps, chisqprob(ps, 1)])
895    else:
896        raise Exception("W matrix must be provided to calculate spatial tests.")
897    return LM_err, moran, ps
898
899
900def moran_KP(w, u, sig2i):
901    """
902    Calculates Moran-flavoured tests
903
904    Parameters
905    ----------
906
907    w           : W
908                  PySAL weights instance aligned with y
909    u           : array
910                  nx1 array of naive residuals
911    sig2i       : array
912                  nx1 array of individual variance
913    """
    try:
        w = w.sparse
    except AttributeError:
        # w is already a sparse matrix
917        pass
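    # statistic: (u' W u) / sqrt(tr(WE.WE + W'E.WE)), with E = diag(sig2i)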
918    moran_num = np.dot(u.T, (w * u))
919    E = SP.lil_matrix(w.get_shape())
920    E.setdiag(sig2i.flat)
921    E = E.asformat('csr')
922    WE = w * E
923    moran_den = np.sqrt(np.sum((WE * WE + (w.T * E) * WE).diagonal()))
924    moran = float(1.0 * moran_num / moran_den)
925    moran = np.array([moran, norm.sf(abs(moran)) * 2.])
926    return moran
927
928
929def _test():
930    import doctest
931    start_suppress = np.get_printoptions()['suppress']
932    np.set_printoptions(suppress=True)
933    doctest.testmod()
934    np.set_printoptions(suppress=start_suppress)
935
936if __name__ == '__main__':
937    _test()
938    import numpy as np
939    import libpysal
940    dbf = libpysal.io.open(libpysal.examples.get_path('columbus.dbf'), 'r')
941    y = np.array([dbf.by_col('CRIME')]).T
942    var_x = ['INC', 'HOVAL']
943    x = np.array([dbf.by_col(name) for name in var_x]).T
944    w = libpysal.io.open(libpysal.examples.get_path("columbus.gal"), 'r').read()
945    w.transform = 'r'
946    probit1 = Probit(
947        (y > 40).astype(float), x, w=w, name_x=var_x, name_y="CRIME",
948        name_ds="Columbus", name_w="columbus.dbf")
949    print(probit1.summary)
950