1"""numerical differentiation function, gradient, Jacobian, and Hessian
2
3Author : josef-pkt
4License : BSD
5
6Notes
7-----
8These are simple forward differentiation, so that we have them available
9without dependencies.
10
11* Jacobian should be faster than numdifftools because it does not use loop over
12  observations.
13* numerical precision will vary and depend on the choice of stepsizes
14"""

# TODO:
# * some cleanup
# * check numerical accuracy (and bugs) with numdifftools and analytical
#   derivatives
#   - linear least squares case: (hess - 2*X'X) is 1e-8 or so
#   - gradient and Hessian agree with numdifftools when evaluated away from
#     the minimum
#   - forward gradient, Jacobian evaluated at the minimum is inaccurate,
#     centered (+/- epsilon) is ok
# * dot product of Jacobian is different from Hessian; either a wrong example,
#   a bug (unlikely), or a real difference
#
#
# Under what conditions are the Jacobian dot product and the Hessian the same?
#
# See also:
#
# BHHH: Greene p. 481, 17.4.6: MLE Jacobian = d loglike / d beta, where
# loglike is a vector with one element per observation
#    see also example 17.4 where J'J is very different from the Hessian
#    also, does it hold only at the minimum, and what is the relationship to
#    the covariance of the Jacobian matrix?
# http://projects.scipy.org/scipy/ticket/1157
# https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
#    objective: sum((y - f(beta, x))**2),   Jacobian = d f / d beta
#    and not d objective / d beta as in the MLE case in Greene
#    similar: http://crsouza.blogspot.com/2009/11/neural-network-learning-by-levenberg_18.html#hessian
#
# in example: if J = d (x*beta) / d beta, then J'J == X'X
#    (see the example sketch after ``approx_fprime`` below)
#    similar to https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm
import numpy as np

from statsmodels.compat.pandas import Appender, Substitution

# NOTE: we only do double precision internally so far
EPS = np.finfo(float).eps

_hessian_docs = """
    Calculate Hessian with finite difference derivative approximation

    Parameters
    ----------
    x : array_like
       value at which function derivative is evaluated
    f : function
       function of one array f(x, `*args`, `**kwargs`)
    epsilon : float or array_like, optional
       Stepsize used, if None, then stepsize is automatically chosen
       according to EPS**(1/%(scale)s)*x.
    args : tuple
        Arguments for function `f`.
    kwargs : dict
        Keyword arguments for function `f`.
    %(extra_params)s

    Returns
    -------
    hess : ndarray
       array of partial second derivatives, Hessian
    %(extra_returns)s

    Notes
    -----
    Equation (%(equation_number)s) in Ridout. Computes the Hessian as::

      %(equation)s

    where e[j] is a vector with element j == 1 and the rest are zero and
    d[i] is epsilon[i].

    References
    ----------
    Ridout, M.S. (2009) Statistical applications of the complex-step method
        of numerical differentiation. The American Statistician, 63, 66-74
"""


def _get_epsilon(x, s, epsilon, n):
    if epsilon is None:
        h = EPS**(1. / s) * np.maximum(np.abs(x), 0.1)
    else:
        if np.isscalar(epsilon):
            h = np.empty(n)
            h.fill(epsilon)
        else:  # pragma: no cover
            h = np.asarray(epsilon)
            if h.shape != x.shape:
                raise ValueError("If epsilon is not a scalar it must have the"
                                 " same shape as x.")
    return h


def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False):
    '''
    Gradient of function, or Jacobian if function f returns 1d array

    Parameters
    ----------
    x : ndarray
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for
        `centered` == False and EPS**(1/3)*x for `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central difference should be returned. If not, does forward
        differencing.

    Returns
    -------
    grad : ndarray
        gradient or Jacobian

    Notes
    -----
    If f returns a 1d array, it returns a Jacobian. If a 2d array is returned
    by f (e.g., with a value for each observation), it returns a 3d array
    with the Jacobian of each observation with shape xk x nobs x xk. I.e.,
    the Jacobian of the first observation would be [:, 0, :]
    '''
    n = len(x)
    f0 = f(*((x,)+args), **kwargs)
    dim = np.atleast_1d(f0).shape  # it could be a scalar
    grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype))
    ei = np.zeros((n,), float)
    if not centered:
        epsilon = _get_epsilon(x, 2, epsilon, n)
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,) + args), **kwargs) - f0)/epsilon[k]
            ei[k] = 0.0
    else:
        epsilon = _get_epsilon(x, 3, epsilon, n) / 2.
        for k in range(n):
            ei[k] = epsilon[k]
            grad[k, :] = (f(*((x+ei,)+args), **kwargs) -
                          f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k])
            ei[k] = 0.0

    return grad.squeeze().T
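

# A minimal usage sketch (illustrative only, the helper name is not part of
# the statsmodels API): the gradient of a quadratic and the Jacobian of a
# linear predictor.  For f(beta) = X @ beta the numerical Jacobian is X
# itself, so J'J reproduces X'X, the relationship discussed in the module
# notes above.
def _example_approx_fprime():  # pragma: no cover
    x0 = np.array([1.0, 2.0, 3.0])
    # scalar-valued f: forward and centered gradients (exact value is 2*x0)
    grad_fwd = approx_fprime(x0, lambda x: (x ** 2).sum())
    grad_cen = approx_fprime(x0, lambda x: (x ** 2).sum(), centered=True)
    # vector-valued f: the Jacobian of X @ beta is X, shape (nobs, k)
    X = np.random.default_rng(0).standard_normal((10, 3))
    jac = approx_fprime(x0, lambda beta: X @ beta, centered=True)
    return grad_fwd, grad_cen, np.allclose(jac.T @ jac, X.T @ X)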


def _approx_fprime_scalar(x, f, epsilon=None, args=(), kwargs={},
                          centered=False):
    '''
    Gradient of function vectorized for scalar parameter.

    This assumes that the function ``f`` is vectorized for a scalar parameter.
    The function value ``f(x)`` then has the same shape as the input ``x``.
    The derivative returned by this function also has the same shape as ``x``.

    Parameters
    ----------
    x : ndarray
        Parameters at which the derivative is evaluated.
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for
        `centered` == False and EPS**(1/3)*x for `centered` == True.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.
    centered : bool
        Whether central difference should be returned. If not, does forward
        differencing.

    Returns
    -------
    grad : ndarray
        Array of derivatives, gradient evaluated at parameters ``x``.
    '''
    x = np.asarray(x)
    n = 1

    f0 = f(*((x,)+args), **kwargs)
    if not centered:
        eps = _get_epsilon(x, 2, epsilon, n)
        grad = (f(*((x+eps,) + args), **kwargs) - f0) / eps
    else:
        eps = _get_epsilon(x, 3, epsilon, n) / 2.
        grad = (f(*((x+eps,)+args), **kwargs) -
                f(*((x-eps,)+args), **kwargs)) / (2 * eps)

    return grad
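

# A minimal sketch (illustrative only, not a public helper): the function is
# vectorized in a scalar parameter, so the derivative is computed elementwise
# and has the same shape as ``x``.  Here the exact derivative of exp is exp.
def _example_fprime_scalar():  # pragma: no cover
    x = np.linspace(0.0, 1.0, 5)
    d_fwd = _approx_fprime_scalar(x, np.exp)
    d_cen = _approx_fprime_scalar(x, np.exp, centered=True)
    return d_fwd, d_cen, np.exp(x)  # last entry is the exact derivative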


def approx_fprime_cs(x, f, epsilon=None, args=(), kwargs={}):
    '''
    Calculate gradient or Jacobian with complex step derivative approximation

    Parameters
    ----------
    x : ndarray
        parameters at which the derivative is evaluated
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. Optimal step-size is
        EPS*x. See note.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.

    Returns
    -------
    partials : ndarray
       array of partial derivatives, Gradient or Jacobian

    Notes
    -----
    The complex-step derivative has truncation error O(epsilon**2), so
    truncation error can be eliminated by choosing epsilon to be very small.
    The complex-step derivative avoids the problem of round-off error with
    small epsilon because there is no subtraction.
    '''
    # From Guilherme P. de Freitas, numpy mailing list
    # May 04 2010 thread "Improvement of performance"
    # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html
    n = len(x)

    epsilon = _get_epsilon(x, 1, epsilon, n)
    increments = np.identity(n) * 1j * epsilon
    # TODO: see if this can be vectorized, but usually dim is small
    partials = [f(x+ih, *args, **kwargs).imag / epsilon[i]
                for i, ih in enumerate(increments)]

    return np.array(partials).T
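

# A minimal sketch (illustrative only): complex-step derivatives are accurate
# to roughly machine precision because no subtraction, and hence no
# cancellation, is involved.  The function must accept complex input and be
# analytic, as exp/sum are here.
def _example_fprime_cs():  # pragma: no cover
    x0 = np.array([1.0, 2.0, 3.0])
    grad_cs = approx_fprime_cs(x0, lambda x: np.exp(x).sum())
    # absolute error versus the analytic gradient exp(x0); typically ~1e-15
    return grad_cs, np.max(np.abs(grad_cs - np.exp(x0)))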


def _approx_fprime_cs_scalar(x, f, epsilon=None, args=(), kwargs={}):
    '''
    Calculate gradient for scalar parameter with complex step derivatives.

    This assumes that the function ``f`` is vectorized for a scalar parameter.
    The function value ``f(x)`` then has the same shape as the input ``x``.
    The derivative returned by this function also has the same shape as ``x``.

    Parameters
    ----------
    x : ndarray
        Parameters at which the derivative is evaluated.
    f : function
        `f(*((x,)+args), **kwargs)` returning either one value or 1d array.
    epsilon : float, optional
        Stepsize, if None, optimal stepsize is used. Optimal step-size is
        EPS*x. See note.
    args : tuple
        Tuple of additional arguments for function `f`.
    kwargs : dict
        Dictionary of additional keyword arguments for function `f`.

    Returns
    -------
    partials : ndarray
       Array of derivatives, gradient evaluated for parameters ``x``.

    Notes
    -----
    The complex-step derivative has truncation error O(epsilon**2), so
    truncation error can be eliminated by choosing epsilon to be very small.
    The complex-step derivative avoids the problem of round-off error with
    small epsilon because there is no subtraction.
    '''
    # From Guilherme P. de Freitas, numpy mailing list
    # May 04 2010 thread "Improvement of performance"
    # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html
    x = np.asarray(x)
    n = x.shape[-1]

    epsilon = _get_epsilon(x, 1, epsilon, n)
    eps = 1j * epsilon
    partials = f(x + eps, *args, **kwargs).imag / epsilon

    return np.array(partials)


def approx_hess_cs(x, f, epsilon=None, args=(), kwargs={}):
    '''Calculate Hessian with complex-step derivative approximation

    Parameters
    ----------
    x : array_like
       value at which function derivative is evaluated
    f : function
       function of one array f(x)
    epsilon : float
       stepsize, if None, then stepsize is automatically chosen

    Returns
    -------
    hess : ndarray
       array of partial second derivatives, Hessian

    Notes
    -----
    based on equation 10 in
    M. S. Ridout: Statistical Applications of the Complex-step Method
    of Numerical Differentiation, University of Kent, Canterbury, Kent, U.K.

    The stepsize is the same for the complex and the finite difference part.
    '''
    # TODO: might want to consider lowering the step for pure derivatives
    n = len(x)
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    hess = np.outer(h, h)

    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + 1j*ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x + 1j*ee[i, :] - ee[j, :],)+args),
                              **kwargs)).imag/2./hess[i, j]
            hess[j, i] = hess[i, j]

    return hess
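

# A minimal sketch (illustrative only): compare the complex-step Hessian with
# the analytic Hessian of f(x) = exp(x[0]) * x[1]**2, which is
# [[exp(x0)*x1**2, 2*exp(x0)*x1], [2*exp(x0)*x1, 2*exp(x0)]].  The function
# must accept complex arguments.
def _example_hess_cs():  # pragma: no cover
    fun = lambda x: np.exp(x[0]) * x[1] ** 2
    x0 = np.array([0.5, 2.0])
    hess_num = approx_hess_cs(x0, fun)
    e = np.exp(x0[0])
    hess_ana = np.array([[e * x0[1] ** 2, 2 * e * x0[1]],
                         [2 * e * x0[1], 2 * e]])
    return hess_num, hess_ana, np.max(np.abs(hess_num - hess_ana))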


@Substitution(
    scale="3",
    extra_params="""return_grad : bool
        Whether or not to also return the gradient
""",
    extra_returns="""grad : ndarray
        Gradient if return_grad == True
""",
    equation_number="7",
    equation="""1/(d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) -
                 (f(x + d[k]*e[k]) - f(x)))
"""
)
@Appender(_hessian_docs)
def approx_hess1(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    n = len(x)
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)

    f0 = f(*((x,)+args), **kwargs)
    # Compute forward step
    g = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x+ee[i, :],)+args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward step
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) -
                          g[i] - g[j] + f0)/hess[i, j]
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0)/h
        return hess, grad
    else:
        return hess


@Substitution(
    scale="3",
    extra_params="""return_grad : bool
        Whether or not to also return the gradient
""",
    extra_returns="""grad : ndarray
        Gradient if return_grad == True
""",
    equation_number="8",
    equation="""1/(2*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) -
                 (f(x + d[k]*e[k]) - f(x)) +
                 (f(x - d[j]*e[j] - d[k]*e[k]) - f(x - d[j]*e[j])) -
                 (f(x - d[k]*e[k]) - f(x)))
"""
)
@Appender(_hessian_docs)
def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False):
    n = len(x)
    # NOTE: Ridout suggests using eps**(1/4)*theta
    h = _get_epsilon(x, 3, epsilon, n)
    ee = np.diag(h)
    f0 = f(*((x,)+args), **kwargs)
    # Compute forward and backward steps
    g = np.zeros(n)
    gg = np.zeros(n)
    for i in range(n):
        g[i] = f(*((x+ee[i, :],)+args), **kwargs)
        gg[i] = f(*((x-ee[i, :],)+args), **kwargs)

    hess = np.outer(h, h)  # this is now epsilon**2
    # Compute "double" forward and backward steps
    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) -
                          g[i] - g[j] + f0 +
                          f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs) -
                          gg[i] - gg[j] + f0)/(2 * hess[i, j])
            hess[j, i] = hess[i, j]
    if return_grad:
        grad = (g - f0)/h
        return hess, grad
    else:
        return hess


@Substitution(
    scale="4",
    extra_params="",
    extra_returns="",
    equation_number="9",
    equation="""1/(4*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) -
                  f(x + d[j]*e[j] - d[k]*e[k])) -
                 (f(x - d[j]*e[j] + d[k]*e[k]) -
                  f(x - d[j]*e[j] - d[k]*e[k])))"""
)
@Appender(_hessian_docs)
def approx_hess3(x, f, epsilon=None, args=(), kwargs={}):
    n = len(x)
    h = _get_epsilon(x, 4, epsilon, n)
    ee = np.diag(h)
    hess = np.outer(h, h)

    for i in range(n):
        for j in range(i, n):
            hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs)
                          - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs)
                          - f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs))
                          )/(4.*hess[i, j])
            hess[j, i] = hess[i, j]
    return hess


approx_hess = approx_hess3
approx_hess.__doc__ += "\n    This is an alias for approx_hess3"
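

# A minimal sketch (illustrative only): for the linear least squares objective
# f(beta) = ((y - X @ beta)**2).sum() the analytic Hessian is 2*X'X for any
# beta, so the three finite-difference approximations above can be checked
# against it; the differences should be small (the TODO notes at the top of
# the module report deviations on the order of 1e-8 in a similar check).
def _example_hessians():  # pragma: no cover
    rng = np.random.default_rng(12345)
    X = rng.standard_normal((50, 3))
    beta = np.array([1.0, -0.5, 2.0])
    y = X @ beta + 0.1 * rng.standard_normal(50)
    ssr = lambda b: ((y - X @ b) ** 2).sum()
    hess_ana = 2 * X.T @ X
    return {name: np.max(np.abs(ah(beta, ssr) - hess_ana))
            for name, ah in [("hess1", approx_hess1),
                             ("hess2", approx_hess2),
                             ("hess3", approx_hess3)]}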