1"""numerical differentiation function, gradient, Jacobian, and Hessian 2 3Author : josef-pkt 4License : BSD 5 6Notes 7----- 8These are simple forward differentiation, so that we have them available 9without dependencies. 10 11* Jacobian should be faster than numdifftools because it does not use loop over 12 observations. 13* numerical precision will vary and depend on the choice of stepsizes 14""" 15 16# TODO: 17# * some cleanup 18# * check numerical accuracy (and bugs) with numdifftools and analytical 19# derivatives 20# - linear least squares case: (hess - 2*X'X) is 1e-8 or so 21# - gradient and Hessian agree with numdifftools when evaluated away from 22# minimum 23# - forward gradient, Jacobian evaluated at minimum is inaccurate, centered 24# (+/- epsilon) is ok 25# * dot product of Jacobian is different from Hessian, either wrong example or 26# a bug (unlikely), or a real difference 27# 28# 29# What are the conditions that Jacobian dotproduct and Hessian are the same? 30# 31# See also: 32# 33# BHHH: Greene p481 17.4.6, MLE Jacobian = d loglike / d beta , where loglike 34# is vector for each observation 35# see also example 17.4 when J'J is very different from Hessian 36# also does it hold only at the minimum, what's relationship to covariance 37# of Jacobian matrix 38# http://projects.scipy.org/scipy/ticket/1157 39# https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm 40# objective: sum((y-f(beta,x)**2), Jacobian = d f/d beta 41# and not d objective/d beta as in MLE Greene 42# similar: http://crsouza.blogspot.com/2009/11/neural-network-learning-by-levenberg_18.html#hessian 43# 44# in example: if J = d x*beta / d beta then J'J == X'X 45# similar to https://en.wikipedia.org/wiki/Levenberg%E2%80%93Marquardt_algorithm 46import numpy as np 47 48from statsmodels.compat.pandas import Appender, Substitution 49 50# NOTE: we only do double precision internally so far 51EPS = np.finfo(float).eps 52 53_hessian_docs = """ 54 Calculate Hessian with finite difference derivative approximation 55 56 Parameters 57 ---------- 58 x : array_like 59 value at which function derivative is evaluated 60 f : function 61 function of one array f(x, `*args`, `**kwargs`) 62 epsilon : float or array_like, optional 63 Stepsize used, if None, then stepsize is automatically chosen 64 according to EPS**(1/%(scale)s)*x. 65 args : tuple 66 Arguments for function `f`. 67 kwargs : dict 68 Keyword arguments for function `f`. 69 %(extra_params)s 70 71 Returns 72 ------- 73 hess : ndarray 74 array of partial second derivatives, Hessian 75 %(extra_returns)s 76 77 Notes 78 ----- 79 Equation (%(equation_number)s) in Ridout. Computes the Hessian as:: 80 81 %(equation)s 82 83 where e[j] is a vector with element j == 1 and the rest are zero and 84 d[i] is epsilon[i]. 85 86 References 87 ----------: 88 89 Ridout, M.S. (2009) Statistical applications of the complex-step method 90 of numerical differentiation. The American Statistician, 63, 66-74 91""" 92 93 94def _get_epsilon(x, s, epsilon, n): 95 if epsilon is None: 96 h = EPS**(1. / s) * np.maximum(np.abs(x), 0.1) 97 else: 98 if np.isscalar(epsilon): 99 h = np.empty(n) 100 h.fill(epsilon) 101 else: # pragma : no cover 102 h = np.asarray(epsilon) 103 if h.shape != x.shape: 104 raise ValueError("If h is not a scalar it must have the same" 105 " shape as x.") 106 return h 107 108 109def approx_fprime(x, f, epsilon=None, args=(), kwargs={}, centered=False): 110 ''' 111 Gradient of function, or Jacobian if function f returns 1d array 112 113 Parameters 114 ---------- 115 x : ndarray 116 parameters at which the derivative is evaluated 117 f : function 118 `f(*((x,)+args), **kwargs)` returning either one value or 1d array 119 epsilon : float, optional 120 Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for 121 `centered` == False and EPS**(1/3)*x for `centered` == True. 122 args : tuple 123 Tuple of additional arguments for function `f`. 124 kwargs : dict 125 Dictionary of additional keyword arguments for function `f`. 126 centered : bool 127 Whether central difference should be returned. If not, does forward 128 differencing. 129 130 Returns 131 ------- 132 grad : ndarray 133 gradient or Jacobian 134 135 Notes 136 ----- 137 If f returns a 1d array, it returns a Jacobian. If a 2d array is returned 138 by f (e.g., with a value for each observation), it returns a 3d array 139 with the Jacobian of each observation with shape xk x nobs x xk. I.e., 140 the Jacobian of the first observation would be [:, 0, :] 141 ''' 142 n = len(x) 143 f0 = f(*((x,)+args), **kwargs) 144 dim = np.atleast_1d(f0).shape # it could be a scalar 145 grad = np.zeros((n,) + dim, np.promote_types(float, x.dtype)) 146 ei = np.zeros((n,), float) 147 if not centered: 148 epsilon = _get_epsilon(x, 2, epsilon, n) 149 for k in range(n): 150 ei[k] = epsilon[k] 151 grad[k, :] = (f(*((x+ei,) + args), **kwargs) - f0)/epsilon[k] 152 ei[k] = 0.0 153 else: 154 epsilon = _get_epsilon(x, 3, epsilon, n) / 2. 155 for k in range(n): 156 ei[k] = epsilon[k] 157 grad[k, :] = (f(*((x+ei,)+args), **kwargs) - 158 f(*((x-ei,)+args), **kwargs))/(2 * epsilon[k]) 159 ei[k] = 0.0 160 161 return grad.squeeze().T 162 163 164def _approx_fprime_scalar(x, f, epsilon=None, args=(), kwargs={}, 165 centered=False): 166 ''' 167 Gradient of function vectorized for scalar parameter. 168 169 This assumes that the function ``f`` is vectorized for a scalar parameter. 170 The function value ``f(x)`` has then the same shape as the input ``x``. 171 The derivative returned by this function also has the same shape as ``x``. 172 173 Parameters 174 ---------- 175 x : ndarray 176 Parameters at which the derivative is evaluated. 177 f : function 178 `f(*((x,)+args), **kwargs)` returning either one value or 1d array 179 epsilon : float, optional 180 Stepsize, if None, optimal stepsize is used. This is EPS**(1/2)*x for 181 `centered` == False and EPS**(1/3)*x for `centered` == True. 182 args : tuple 183 Tuple of additional arguments for function `f`. 184 kwargs : dict 185 Dictionary of additional keyword arguments for function `f`. 186 centered : bool 187 Whether central difference should be returned. If not, does forward 188 differencing. 189 190 Returns 191 ------- 192 grad : ndarray 193 Array of derivatives, gradient evaluated at parameters ``x``. 194 ''' 195 x = np.asarray(x) 196 n = 1 197 198 f0 = f(*((x,)+args), **kwargs) 199 if not centered: 200 eps = _get_epsilon(x, 2, epsilon, n) 201 grad = (f(*((x+eps,) + args), **kwargs) - f0) / eps 202 else: 203 eps = _get_epsilon(x, 3, epsilon, n) / 2. 204 grad = (f(*((x+eps,)+args), **kwargs) - 205 f(*((x-eps,)+args), **kwargs)) / (2 * eps) 206 207 return grad 208 209 210def approx_fprime_cs(x, f, epsilon=None, args=(), kwargs={}): 211 ''' 212 Calculate gradient or Jacobian with complex step derivative approximation 213 214 Parameters 215 ---------- 216 x : ndarray 217 parameters at which the derivative is evaluated 218 f : function 219 `f(*((x,)+args), **kwargs)` returning either one value or 1d array 220 epsilon : float, optional 221 Stepsize, if None, optimal stepsize is used. Optimal step-size is 222 EPS*x. See note. 223 args : tuple 224 Tuple of additional arguments for function `f`. 225 kwargs : dict 226 Dictionary of additional keyword arguments for function `f`. 227 228 Returns 229 ------- 230 partials : ndarray 231 array of partial derivatives, Gradient or Jacobian 232 233 Notes 234 ----- 235 The complex-step derivative has truncation error O(epsilon**2), so 236 truncation error can be eliminated by choosing epsilon to be very small. 237 The complex-step derivative avoids the problem of round-off error with 238 small epsilon because there is no subtraction. 239 ''' 240 # From Guilherme P. de Freitas, numpy mailing list 241 # May 04 2010 thread "Improvement of performance" 242 # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html 243 n = len(x) 244 245 epsilon = _get_epsilon(x, 1, epsilon, n) 246 increments = np.identity(n) * 1j * epsilon 247 # TODO: see if this can be vectorized, but usually dim is small 248 partials = [f(x+ih, *args, **kwargs).imag / epsilon[i] 249 for i, ih in enumerate(increments)] 250 251 return np.array(partials).T 252 253 254def _approx_fprime_cs_scalar(x, f, epsilon=None, args=(), kwargs={}): 255 ''' 256 Calculate gradient for scalar parameter with complex step derivatives. 257 258 This assumes that the function ``f`` is vectorized for a scalar parameter. 259 The function value ``f(x)`` has then the same shape as the input ``x``. 260 The derivative returned by this function also has the same shape as ``x``. 261 262 Parameters 263 ---------- 264 x : ndarray 265 Parameters at which the derivative is evaluated. 266 f : function 267 `f(*((x,)+args), **kwargs)` returning either one value or 1d array. 268 epsilon : float, optional 269 Stepsize, if None, optimal stepsize is used. Optimal step-size is 270 EPS*x. See note. 271 args : tuple 272 Tuple of additional arguments for function `f`. 273 kwargs : dict 274 Dictionary of additional keyword arguments for function `f`. 275 276 Returns 277 ------- 278 partials : ndarray 279 Array of derivatives, gradient evaluated for parameters ``x``. 280 281 Notes 282 ----- 283 The complex-step derivative has truncation error O(epsilon**2), so 284 truncation error can be eliminated by choosing epsilon to be very small. 285 The complex-step derivative avoids the problem of round-off error with 286 small epsilon because there is no subtraction. 287 ''' 288 # From Guilherme P. de Freitas, numpy mailing list 289 # May 04 2010 thread "Improvement of performance" 290 # http://mail.scipy.org/pipermail/numpy-discussion/2010-May/050250.html 291 x = np.asarray(x) 292 n = x.shape[-1] 293 294 epsilon = _get_epsilon(x, 1, epsilon, n) 295 eps = 1j * epsilon 296 partials = f(x + eps, *args, **kwargs).imag / epsilon 297 298 return np.array(partials) 299 300 301def approx_hess_cs(x, f, epsilon=None, args=(), kwargs={}): 302 '''Calculate Hessian with complex-step derivative approximation 303 304 Parameters 305 ---------- 306 x : array_like 307 value at which function derivative is evaluated 308 f : function 309 function of one array f(x) 310 epsilon : float 311 stepsize, if None, then stepsize is automatically chosen 312 313 Returns 314 ------- 315 hess : ndarray 316 array of partial second derivatives, Hessian 317 318 Notes 319 ----- 320 based on equation 10 in 321 M. S. RIDOUT: Statistical Applications of the Complex-step Method 322 of Numerical Differentiation, University of Kent, Canterbury, Kent, U.K. 323 324 The stepsize is the same for the complex and the finite difference part. 325 ''' 326 # TODO: might want to consider lowering the step for pure derivatives 327 n = len(x) 328 h = _get_epsilon(x, 3, epsilon, n) 329 ee = np.diag(h) 330 hess = np.outer(h, h) 331 332 n = len(x) 333 334 for i in range(n): 335 for j in range(i, n): 336 hess[i, j] = (f(*((x + 1j*ee[i, :] + ee[j, :],) + args), **kwargs) 337 - f(*((x + 1j*ee[i, :] - ee[j, :],)+args), 338 **kwargs)).imag/2./hess[i, j] 339 hess[j, i] = hess[i, j] 340 341 return hess 342 343 344@Substitution( 345 scale="3", 346 extra_params="""return_grad : bool 347 Whether or not to also return the gradient 348""", 349 extra_returns="""grad : nparray 350 Gradient if return_grad == True 351""", 352 equation_number="7", 353 equation="""1/(d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j]))) 354""" 355) 356@Appender(_hessian_docs) 357def approx_hess1(x, f, epsilon=None, args=(), kwargs={}, return_grad=False): 358 n = len(x) 359 h = _get_epsilon(x, 3, epsilon, n) 360 ee = np.diag(h) 361 362 f0 = f(*((x,)+args), **kwargs) 363 # Compute forward step 364 g = np.zeros(n) 365 for i in range(n): 366 g[i] = f(*((x+ee[i, :],)+args), **kwargs) 367 368 hess = np.outer(h, h) # this is now epsilon**2 369 # Compute "double" forward step 370 for i in range(n): 371 for j in range(i, n): 372 hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - 373 g[i] - g[j] + f0)/hess[i, j] 374 hess[j, i] = hess[i, j] 375 if return_grad: 376 grad = (g - f0)/h 377 return hess, grad 378 else: 379 return hess 380 381 382@Substitution( 383 scale="3", 384 extra_params="""return_grad : bool 385 Whether or not to also return the gradient 386""", 387 extra_returns="""grad : ndarray 388 Gradient if return_grad == True 389""", 390 equation_number="8", 391 equation="""1/(2*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j])) - 392 (f(x + d[k]*e[k]) - f(x)) + 393 (f(x - d[j]*e[j] - d[k]*e[k]) - f(x + d[j]*e[j])) - 394 (f(x - d[k]*e[k]) - f(x))) 395""" 396) 397@Appender(_hessian_docs) 398def approx_hess2(x, f, epsilon=None, args=(), kwargs={}, return_grad=False): 399 # 400 n = len(x) 401 # NOTE: ridout suggesting using eps**(1/4)*theta 402 h = _get_epsilon(x, 3, epsilon, n) 403 ee = np.diag(h) 404 f0 = f(*((x,)+args), **kwargs) 405 # Compute forward step 406 g = np.zeros(n) 407 gg = np.zeros(n) 408 for i in range(n): 409 g[i] = f(*((x+ee[i, :],)+args), **kwargs) 410 gg[i] = f(*((x-ee[i, :],)+args), **kwargs) 411 412 hess = np.outer(h, h) # this is now epsilon**2 413 # Compute "double" forward step 414 for i in range(n): 415 for j in range(i, n): 416 hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) - 417 g[i] - g[j] + f0 + 418 f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs) - 419 gg[i] - gg[j] + f0)/(2 * hess[i, j]) 420 hess[j, i] = hess[i, j] 421 if return_grad: 422 grad = (g - f0)/h 423 return hess, grad 424 else: 425 return hess 426 427 428@Substitution( 429 scale="4", 430 extra_params="", 431 extra_returns="", 432 equation_number="9", 433 equation="""1/(4*d_j*d_k) * ((f(x + d[j]*e[j] + d[k]*e[k]) - f(x + d[j]*e[j] 434 - d[k]*e[k])) - 435 (f(x - d[j]*e[j] + d[k]*e[k]) - f(x - d[j]*e[j] 436 - d[k]*e[k]))""" 437) 438@Appender(_hessian_docs) 439def approx_hess3(x, f, epsilon=None, args=(), kwargs={}): 440 n = len(x) 441 h = _get_epsilon(x, 4, epsilon, n) 442 ee = np.diag(h) 443 hess = np.outer(h, h) 444 445 for i in range(n): 446 for j in range(i, n): 447 hess[i, j] = (f(*((x + ee[i, :] + ee[j, :],) + args), **kwargs) 448 - f(*((x + ee[i, :] - ee[j, :],) + args), **kwargs) 449 - (f(*((x - ee[i, :] + ee[j, :],) + args), **kwargs) 450 - f(*((x - ee[i, :] - ee[j, :],) + args), **kwargs)) 451 )/(4.*hess[i, j]) 452 hess[j, i] = hess[i, j] 453 return hess 454 455 456approx_hess = approx_hess3 457approx_hess.__doc__ += "\n This is an alias for approx_hess3" 458