1""" 2Univariate lowess function, like in R. 3 4References 5---------- 6Hastie, Tibshirani, Friedman. (2009) The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition: Chapter 6. 7 8Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing Scatterplots". Journal of the American Statistical Association 74 (368): 829-836. 9""" 10import numpy as np 11from numpy.linalg import lstsq 12 13 14def lowess(endog, exog, frac=2./3, it=3): 15 """ 16 LOWESS (Locally Weighted Scatterplot Smoothing) 17 18 A lowess function that outs smoothed estimates of endog 19 at the given exog values from points (exog, endog) 20 21 Parameters 22 ---------- 23 endog : 1-D numpy array 24 The y-values of the observed points 25 exog : 1-D numpy array 26 The x-values of the observed points 27 frac : float 28 Between 0 and 1. The fraction of the data used 29 when estimating each y-value. 30 it : int 31 The number of residual-based reweightings 32 to perform. 33 34 Returns 35 ------- 36 out: numpy array 37 A numpy array with two columns. The first column 38 is the sorted x values and the second column the 39 associated estimated y-values. 40 41 Notes 42 ----- 43 This lowess function implements the algorithm given in the 44 reference below using local linear estimates. 45 46 Suppose the input data has N points. The algorithm works by 47 estimating the true ``y_i`` by taking the frac*N closest points 48 to ``(x_i,y_i)`` based on their x values and estimating ``y_i`` 49 using a weighted linear regression. The weight for ``(x_j,y_j)`` 50 is `_lowess_tricube` function applied to ``|x_i-x_j|``. 51 52 If ``iter > 0``, then further weighted local linear regressions 53 are performed, where the weights are the same as above 54 times the `_lowess_bisquare` function of the residuals. Each iteration 55 takes approximately the same amount of time as the original fit, 56 so these iterations are expensive. They are most useful when 57 the noise has extremely heavy tails, such as Cauchy noise. 58 Noise with less heavy-tails, such as t-distributions with ``df > 2``, 59 are less problematic. The weights downgrade the influence of 60 points with large residuals. In the extreme case, points whose 61 residuals are larger than 6 times the median absolute residual 62 are given weight 0. 63 64 Some experimentation is likely required to find a good 65 choice of frac and iter for a particular dataset. 66 67 References 68 ---------- 69 Cleveland, W.S. (1979) "Robust Locally Weighted Regression 70 and Smoothing Scatterplots". Journal of the American Statistical 71 Association 74 (368): 829-836. 72 73 Examples 74 -------- 75 The below allows a comparison between how different the fits from 76 `lowess` for different values of frac can be. 77 78 >>> import numpy as np 79 >>> import statsmodels.api as sm 80 >>> lowess = sm.nonparametric.lowess 81 >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500) 82 >>> y = np.sin(x) + np.random.normal(size=len(x)) 83 >>> z = lowess(y, x) 84 >>> w = lowess(y, x, frac=1./3) 85 86 This gives a similar comparison for when it is 0 vs not. 87 88 >>> import scipy.stats as stats 89 >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500) 90 >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x)) 91 >>> z = lowess(y, x, frac= 1./3, it=0) 92 >>> w = lowess(y, x, frac=1./3) 93 """ 94 x = exog 95 96 if exog.ndim != 1: 97 raise ValueError('exog must be a vector') 98 if endog.ndim != 1: 99 raise ValueError('endog must be a vector') 100 if endog.shape[0] != x.shape[0] : 101 raise ValueError('exog and endog must have same length') 102 103 n = exog.shape[0] 104 fitted = np.zeros(n) 105 106 k = int(frac * n) 107 108 index_array = np.argsort(exog) 109 x_copy = np.array(exog[index_array]) #, dtype ='float32') 110 y_copy = endog[index_array] 111 112 fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n) 113 114 for i in range(it): 115 _lowess_robustify_fit(x_copy, y_copy, fitted, 116 weights, k, n) 117 118 out = np.array([x_copy, fitted]).T 119 out.shape = (n,2) 120 121 return out 122 123 124def _lowess_initial_fit(x_copy, y_copy, k, n): 125 """ 126 The initial weighted local linear regression for lowess. 127 128 Parameters 129 ---------- 130 x_copy : 1-d ndarray 131 The x-values/exogenous part of the data being smoothed 132 y_copy : 1-d ndarray 133 The y-values/ endogenous part of the data being smoothed 134 k : int 135 The number of data points which affect the linear fit for 136 each estimated point 137 n : int 138 The total number of points 139 140 Returns 141 ------- 142 fitted : 1-d ndarray 143 The fitted y-values 144 weights : 2-d ndarray 145 An n by k array. The contribution to the weights in the 146 local linear fit coming from the distances between the 147 x-values 148 149 """ 150 weights = np.zeros((n,k), dtype = x_copy.dtype) 151 nn_indices = [0,k] 152 153 X = np.ones((k,2)) 154 fitted = np.zeros(n) 155 156 for i in range(n): 157 #note: all _lowess functions are inplace, no return 158 left_width = x_copy[i] - x_copy[nn_indices[0]] 159 right_width = x_copy[nn_indices[1]-1] - x_copy[i] 160 width = max(left_width, right_width) 161 _lowess_wt_standardize(weights[i,:], 162 x_copy[nn_indices[0]:nn_indices[1]], 163 x_copy[i], width) 164 _lowess_tricube(weights[i,:]) 165 weights[i,:] = np.sqrt(weights[i,:]) 166 167 X[:,1] = x_copy[nn_indices[0]:nn_indices[1]] 168 y_i = weights[i,:] * y_copy[nn_indices[0]:nn_indices[1]] 169 170 beta = lstsq(weights[i,:].reshape(k,1) * X, y_i, rcond=-1)[0] 171 fitted[i] = beta[0] + beta[1]*x_copy[i] 172 173 _lowess_update_nn(x_copy, nn_indices, i+1) 174 175 176 return fitted, weights 177 178 179def _lowess_wt_standardize(weights, new_entries, x_copy_i, width): 180 """ 181 The initial phase of creating the weights. 182 Subtract the current x_i and divide by the width. 183 184 Parameters 185 ---------- 186 weights : ndarray 187 The memory where (new_entries - x_copy_i)/width will be placed 188 new_entries : ndarray 189 The x-values of the k closest points to x[i] 190 x_copy_i : float 191 x[i], the i'th point in the (sorted) x values 192 width : float 193 The maximum distance between x[i] and any point in new_entries 194 195 Returns 196 ------- 197 Nothing. The modifications are made to weight in place. 198 """ 199 weights[:] = new_entries 200 weights -= x_copy_i 201 weights /= width 202 203 204def _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n): 205 """ 206 Additional weighted local linear regressions, performed if 207 iter>0. They take into account the sizes of the residuals, 208 to eliminate the effect of extreme outliers. 209 210 Parameters 211 ---------- 212 x_copy : 1-d ndarray 213 The x-values/exogenous part of the data being smoothed 214 y_copy : 1-d ndarray 215 The y-values/ endogenous part of the data being smoothed 216 fitted : 1-d ndarray 217 The fitted y-values from the previous iteration 218 weights : 2-d ndarray 219 An n by k array. The contribution to the weights in the 220 local linear fit coming from the distances between the 221 x-values 222 k : int 223 The number of data points which affect the linear fit for 224 each estimated point 225 n : int 226 The total number of points 227 228 Returns 229 ------- 230 Nothing. The fitted values are modified in place. 231 """ 232 nn_indices = [0,k] 233 X = np.ones((k,2)) 234 235 residual_weights = np.copy(y_copy) 236 residual_weights.shape = (n,) 237 residual_weights -= fitted 238 residual_weights = np.absolute(residual_weights)#, out=residual_weights) 239 s = np.median(residual_weights) 240 residual_weights /= (6*s) 241 too_big = residual_weights>=1 242 _lowess_bisquare(residual_weights) 243 residual_weights[too_big] = 0 244 245 246 for i in range(n): 247 total_weights = weights[i,:] * np.sqrt(residual_weights[nn_indices[0]: 248 nn_indices[1]]) 249 250 X[:,1] = x_copy[nn_indices[0]:nn_indices[1]] 251 y_i = total_weights * y_copy[nn_indices[0]:nn_indices[1]] 252 total_weights.shape = (k,1) 253 254 beta = lstsq(total_weights * X, y_i, rcond=-1)[0] 255 256 fitted[i] = beta[0] + beta[1] * x_copy[i] 257 258 _lowess_update_nn(x_copy, nn_indices, i+1) 259 260 261def _lowess_update_nn(x, cur_nn,i): 262 """ 263 Update the endpoints of the nearest neighbors to 264 the ith point. 265 266 Parameters 267 ---------- 268 x : iterable 269 The sorted points of x-values 270 cur_nn : list of length 2 271 The two current indices between which are the 272 k closest points to x[i]. (The actual value of 273 k is irrelevant for the algorithm. 274 i : int 275 The index of the current value in x for which 276 the k closest points are desired. 277 278 Returns 279 ------- 280 Nothing. It modifies cur_nn in place. 281 """ 282 while True: 283 if cur_nn[1]<x.size: 284 left_dist = x[i] - x[cur_nn[0]] 285 new_right_dist = x[cur_nn[1]] - x[i] 286 if new_right_dist < left_dist: 287 cur_nn[0] = cur_nn[0] + 1 288 cur_nn[1] = cur_nn[1] + 1 289 else: 290 break 291 else: 292 break 293 294 295def _lowess_tricube(t): 296 """ 297 The _tricube function applied to a numpy array. 298 The tricube function is (1-abs(t)**3)**3. 299 300 Parameters 301 ---------- 302 t : ndarray 303 Array the tricube function is applied to elementwise and 304 in-place. 305 306 Returns 307 ------- 308 Nothing 309 """ 310 #t = (1-np.abs(t)**3)**3 311 t[:] = np.absolute(t) #, out=t) #numpy version? 312 _lowess_mycube(t) 313 t[:] = np.negative(t) #, out = t) 314 t += 1 315 _lowess_mycube(t) 316 317 318def _lowess_mycube(t): 319 """ 320 Fast matrix cube 321 322 Parameters 323 ---------- 324 t : ndarray 325 Array that is cubed, elementwise and in-place 326 327 Returns 328 ------- 329 Nothing 330 """ 331 #t **= 3 332 t2 = t*t 333 t *= t2 334 335 336def _lowess_bisquare(t): 337 """ 338 The bisquare function applied to a numpy array. 339 The bisquare function is (1-t**2)**2. 340 341 Parameters 342 ---------- 343 t : ndarray 344 array bisquare function is applied to, element-wise and in-place. 345 346 Returns 347 ------- 348 Nothing 349 """ 350 #t = (1-t**2)**2 351 t *= t 352 t[:] = np.negative(t) #, out=t) 353 t += 1 354 t *= t 355