1"""
2Univariate lowess function, like in R.
3
4References
5----------
6Hastie, Tibshirani, Friedman. (2009) The Elements of Statistical Learning: Data Mining, Inference, and Prediction, Second Edition: Chapter 6.
7
8Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing Scatterplots". Journal of the American Statistical Association 74 (368): 829-836.
9"""
10import numpy as np
11from numpy.linalg import lstsq
12
13
14def lowess(endog, exog, frac=2./3, it=3):
15    """
16    LOWESS (Locally Weighted Scatterplot Smoothing)
17
18    A lowess function that outs smoothed estimates of endog
19    at the given exog values from points (exog, endog)
20
21    Parameters
22    ----------
23    endog : 1-D numpy array
24        The y-values of the observed points
25    exog : 1-D numpy array
26        The x-values of the observed points
27    frac : float
28        Between 0 and 1. The fraction of the data used
29        when estimating each y-value.
30    it : int
31        The number of residual-based reweightings
32        to perform.
33
34    Returns
35    -------
36    out: numpy array
37        A numpy array with two columns. The first column
38        is the sorted x values and the second column the
39        associated estimated y-values.
40
41    Notes
42    -----
43    This lowess function implements the algorithm given in the
44    reference below using local linear estimates.
45
46    Suppose the input data has N points. The algorithm works by
47    estimating the true ``y_i`` by taking the frac*N closest points
48    to ``(x_i,y_i)`` based on their x values and estimating ``y_i``
49    using a weighted linear regression. The weight for ``(x_j,y_j)``
50    is `_lowess_tricube` function applied to ``|x_i-x_j|``.
51
52    If ``iter > 0``, then further weighted local linear regressions
53    are performed, where the weights are the same as above
54    times the `_lowess_bisquare` function of the residuals. Each iteration
55    takes approximately the same amount of time as the original fit,
56    so these iterations are expensive. They are most useful when
57    the noise has extremely heavy tails, such as Cauchy noise.
58    Noise with less heavy-tails, such as t-distributions with ``df > 2``,
59    are less problematic. The weights downgrade the influence of
60    points with large residuals. In the extreme case, points whose
61    residuals are larger than 6 times the median absolute residual
62    are given weight 0.
63
64    Some experimentation is likely required to find a good
65    choice of frac and iter for a particular dataset.
66
67    References
68    ----------
69    Cleveland, W.S. (1979) "Robust Locally Weighted Regression
70    and Smoothing Scatterplots". Journal of the American Statistical
71    Association 74 (368): 829-836.
72
73    Examples
74    --------
75    The below allows a comparison between how different the fits from
76    `lowess` for different values of frac can be.
77
78    >>> import numpy as np
79    >>> import statsmodels.api as sm
80    >>> lowess = sm.nonparametric.lowess
81    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
82    >>> y = np.sin(x) + np.random.normal(size=len(x))
83    >>> z = lowess(y, x)
84    >>> w = lowess(y, x, frac=1./3)
85
86    This gives a similar comparison for when it is 0 vs not.
87
88    >>> import scipy.stats as stats
89    >>> x = np.random.uniform(low=-2*np.pi, high=2*np.pi, size=500)
90    >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x))
91    >>> z = lowess(y, x, frac= 1./3, it=0)
92    >>> w = lowess(y, x, frac=1./3)
93    """
94    x = exog
95
96    if exog.ndim != 1:
97        raise ValueError('exog must be a vector')
98    if endog.ndim != 1:
99        raise ValueError('endog must be a vector')
100    if endog.shape[0] != x.shape[0] :
101        raise ValueError('exog and endog must have same length')
102
103    n = exog.shape[0]
104    fitted = np.zeros(n)
105
106    k = int(frac * n)
107
108    index_array = np.argsort(exog)
109    x_copy = np.array(exog[index_array]) #, dtype ='float32')
110    y_copy = endog[index_array]
111
112    fitted, weights = _lowess_initial_fit(x_copy, y_copy, k, n)
113
114    for i in range(it):
115        _lowess_robustify_fit(x_copy, y_copy, fitted,
116                              weights, k, n)
117
118    out = np.array([x_copy, fitted]).T
119    out.shape = (n,2)
120
121    return out
122
123
124def _lowess_initial_fit(x_copy, y_copy, k, n):
125    """
126    The initial weighted local linear regression for lowess.
127
128    Parameters
129    ----------
130    x_copy : 1-d ndarray
131        The x-values/exogenous part of the data being smoothed
132    y_copy : 1-d ndarray
133        The y-values/ endogenous part of the data being smoothed
134   k : int
135        The number of data points which affect the linear fit for
136        each estimated point
137    n : int
138        The total number of points
139
140    Returns
141    -------
142    fitted : 1-d ndarray
143        The fitted y-values
144    weights : 2-d ndarray
145        An n by k array. The contribution to the weights in the
146        local linear fit coming from the distances between the
147        x-values
148
149   """
150    weights = np.zeros((n,k), dtype = x_copy.dtype)
151    nn_indices = [0,k]
152
153    X = np.ones((k,2))
154    fitted = np.zeros(n)
155
156    for i in range(n):
157        #note: all _lowess functions are inplace, no return
158        left_width = x_copy[i] - x_copy[nn_indices[0]]
159        right_width = x_copy[nn_indices[1]-1] - x_copy[i]
160        width = max(left_width, right_width)
161        _lowess_wt_standardize(weights[i,:],
162                                x_copy[nn_indices[0]:nn_indices[1]],
163                            x_copy[i], width)
164        _lowess_tricube(weights[i,:])
165        weights[i,:] = np.sqrt(weights[i,:])
166
167        X[:,1] = x_copy[nn_indices[0]:nn_indices[1]]
168        y_i = weights[i,:] * y_copy[nn_indices[0]:nn_indices[1]]
169
170        beta = lstsq(weights[i,:].reshape(k,1) * X, y_i, rcond=-1)[0]
171        fitted[i] = beta[0] + beta[1]*x_copy[i]
172
173        _lowess_update_nn(x_copy, nn_indices, i+1)
174
175
176    return fitted, weights
177
178
179def _lowess_wt_standardize(weights, new_entries, x_copy_i, width):
180    """
181    The initial phase of creating the weights.
182    Subtract the current x_i and divide by the width.
183
184    Parameters
185    ----------
186    weights : ndarray
187        The memory where (new_entries - x_copy_i)/width will be placed
188    new_entries : ndarray
189        The x-values of the k closest points to x[i]
190    x_copy_i : float
191        x[i], the i'th point in the (sorted) x values
192    width : float
193        The maximum distance between x[i] and any point in new_entries
194
195    Returns
196    -------
197    Nothing. The modifications are made to weight in place.
198    """
199    weights[:] = new_entries
200    weights -= x_copy_i
201    weights /= width
202
203
204def _lowess_robustify_fit(x_copy, y_copy, fitted, weights, k, n):
205    """
206    Additional weighted local linear regressions, performed if
207    iter>0. They take into account the sizes of the residuals,
208    to eliminate the effect of extreme outliers.
209
210    Parameters
211    ----------
212    x_copy : 1-d ndarray
213        The x-values/exogenous part of the data being smoothed
214    y_copy : 1-d ndarray
215        The y-values/ endogenous part of the data being smoothed
216    fitted : 1-d ndarray
217        The fitted y-values from the previous iteration
218    weights : 2-d ndarray
219        An n by k array. The contribution to the weights in the
220        local linear fit coming from the distances between the
221        x-values
222    k : int
223        The number of data points which affect the linear fit for
224        each estimated point
225    n : int
226        The total number of points
227
228   Returns
229    -------
230    Nothing. The fitted values are modified in place.
231    """
232    nn_indices = [0,k]
233    X = np.ones((k,2))
234
235    residual_weights = np.copy(y_copy)
236    residual_weights.shape = (n,)
237    residual_weights -= fitted
238    residual_weights = np.absolute(residual_weights)#, out=residual_weights)
239    s = np.median(residual_weights)
240    residual_weights /= (6*s)
241    too_big = residual_weights>=1
242    _lowess_bisquare(residual_weights)
243    residual_weights[too_big] = 0
244
245
246    for i in range(n):
247        total_weights = weights[i,:] * np.sqrt(residual_weights[nn_indices[0]:
248                                                        nn_indices[1]])
249
250        X[:,1] = x_copy[nn_indices[0]:nn_indices[1]]
251        y_i = total_weights * y_copy[nn_indices[0]:nn_indices[1]]
252        total_weights.shape = (k,1)
253
254        beta = lstsq(total_weights * X, y_i, rcond=-1)[0]
255
256        fitted[i] = beta[0] + beta[1] * x_copy[i]
257
258        _lowess_update_nn(x_copy, nn_indices, i+1)
259
260
261def _lowess_update_nn(x, cur_nn,i):
262    """
263    Update the endpoints of the nearest neighbors to
264    the ith point.
265
266    Parameters
267    ----------
268    x : iterable
269        The sorted points of x-values
270    cur_nn : list of length 2
271        The two current indices between which are the
272        k closest points to x[i]. (The actual value of
273        k is irrelevant for the algorithm.
274    i : int
275        The index of the current value in x for which
276        the k closest points are desired.
277
278    Returns
279    -------
280    Nothing. It modifies cur_nn in place.
281    """
282    while True:
283        if cur_nn[1]<x.size:
284            left_dist = x[i] - x[cur_nn[0]]
285            new_right_dist = x[cur_nn[1]] - x[i]
286            if new_right_dist < left_dist:
287                cur_nn[0] = cur_nn[0] + 1
288                cur_nn[1] = cur_nn[1] + 1
289            else:
290                break
291        else:
292            break
293
294
295def _lowess_tricube(t):
296    """
297    The _tricube function applied to a numpy array.
298    The tricube function is (1-abs(t)**3)**3.
299
300    Parameters
301    ----------
302    t : ndarray
303        Array the tricube function is applied to elementwise and
304        in-place.
305
306    Returns
307    -------
308    Nothing
309    """
310    #t = (1-np.abs(t)**3)**3
311    t[:] = np.absolute(t) #, out=t) #numpy version?
312    _lowess_mycube(t)
313    t[:] = np.negative(t) #, out = t)
314    t += 1
315    _lowess_mycube(t)
316
317
318def _lowess_mycube(t):
319    """
320    Fast matrix cube
321
322    Parameters
323    ----------
324    t : ndarray
325        Array that is cubed, elementwise and in-place
326
327    Returns
328    -------
329    Nothing
330    """
331    #t **= 3
332    t2 = t*t
333    t *= t2
334
335
336def _lowess_bisquare(t):
337    """
338    The bisquare function applied to a numpy array.
339    The bisquare function is (1-t**2)**2.
340
341    Parameters
342    ----------
343    t : ndarray
344        array bisquare function is applied to, element-wise and in-place.
345
346    Returns
347    -------
348    Nothing
349    """
350    #t = (1-t**2)**2
351    t *= t
352    t[:] = np.negative(t) #, out=t)
353    t += 1
354    t *= t
355