import datetime
from datetime import timedelta, timezone
from numbers import Number

import numpy as np
from scipy.signal import argrelextrema


def _parse_args(args, kwargs, names, *defaults):
    """Resolve positional `args` and keyword `kwargs` into a single list of
    values ordered by `names`. Keyword values take precedence over positional
    ones; missing values fall back to `defaults`."""
    assert len(defaults) == len(names)
    result = list(args) + [None] * (len(names) - len(args))
    for i, (name, res) in enumerate(zip(names, result)):
        result[i] = kwargs.get(name, res if res is not None else defaults[i])
    return result


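# A minimal sketch of how `_parse_args` resolves call arguments (the values
# here are hypothetical):
#
#     >>> _parse_args((10,), {'b': 2}, ('a', 'b', 'c'), 0, 0, 0)
#     [10, 2, 0]
#
# Positional `a` is kept, keyword `b` overrides its default, and `c` falls
# back to its default.

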
def r2(true, pred):
    """Coefficient of determination (R²)"""
    nobs = len(pred)
    true = true[-nobs:]
    return 1 - np.sum((true - pred)**2) / np.sum((true - np.mean(true))**2)


def rmse(true, pred):
    """Root mean squared error"""
    nobs = len(pred)
    return np.sqrt(np.sum((true[-nobs:] - pred) ** 2) / nobs)


def mape(true, pred):
    """Mean absolute percentage error.

    Note: the error is normalized by the overall mean of |true| (a WAPE-style
    variant) rather than per observation, which avoids division by zero when
    the series contains zero values.
    """
    nobs = len(pred)
    return np.mean(np.abs(true[-nobs:] - pred)) / np.abs(true).mean()


def mae(true, pred):
    """Median absolute error (robust to outliers, despite the `mae` name)"""
    nobs = len(pred)
    return np.median(np.abs(true[-nobs:] - pred))


def pocid(true, pred):
    """Prediction on change of direction"""
    nobs = len(pred)
    return 100 * np.mean((np.diff(true[-nobs:]) * np.diff(pred)) > 0)


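# A quick sanity check of the scores above (hypothetical toy data). The
# metrics align the tail of `true` with `pred`, so forecasts shorter than the
# series are compared against its last `len(pred)` observations:
#
#     >>> true = np.array([1., 2., 3., 4.])
#     >>> pred = np.array([3., 4., 5.])   # constant +1 bias
#     >>> rmse(true, pred)                # sqrt(mean([1, 1, 1]))
#     1.0
#     >>> pocid(true, pred)               # both series always move up
#     100.0

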
def _detrend(x, type):
    """Detrend `x` by differencing ('diff'), or by subtracting a fitted
    polynomial ('constant', 'linear', 'quadratic', 'cubic', or an int order).
    Any other value (e.g. False) leaves `x` unchanged."""
    if type == 'diff':
        x = np.diff(x)
    elif isinstance(type, str):
        type = dict(constant=0, linear=1, quadratic=2, cubic=3)[type]
    # NB: bool is a subclass of int, so guard against detrend=False
    # accidentally triggering a constant (order-0) detrend below
    if not isinstance(type, bool) and isinstance(type, Number):
        import statsmodels.api as sm
        x = sm.tsa.detrend(x, type)
    return x


def _significant_periods(periods, pgram):
    """Keep only the periods at which the periodogram has a local maximum."""
    if pgram.size == 0:
        return periods, pgram

    # Order ascending
    periods = periods[::-1]
    pgram = pgram[::-1]
    # Scale to [0, 1] and extract significant (locally maximal) peaks
    pgram = (pgram - pgram.min()) / np.ptp(pgram)  # ndarray.ptp() was removed in NumPy 2.0
    significant = argrelextrema(pgram, np.greater, order=5)
    return periods[significant], pgram[significant]


def periodogram(x, *args, detrend='diff', **kwargs):
    """
    Return periodogram of signal `x`.

    Parameters
    ----------
    x: array_like
        A 1D signal.
    detrend: 'diff' or False or int
        Remove trend from x. If int, fit and subtract a polynomial of this
        order. See also: `statsmodels.tsa.detrend`.
    args, kwargs:
        As accepted by `scipy.signal.periodogram`.

    Returns
    -------
    periods: array_like
        The periods at which the spectral density is calculated.
    pgram: array_like
        Power spectral density of x.
    """
    from scipy.signal import periodogram
    x = _detrend(x, detrend)
    freqs, pgram = periodogram(x, *args, detrend=False, **kwargs)

    SKIP = len(x) // 1000  # HACK: For long series, the first few frequency/period values are "unstable".
    # Always skip at least the zero frequency, which would yield an infinite
    # period in the division below
    freqs, pgram = freqs[max(SKIP, 1):], pgram[max(SKIP, 1):]

    periods = 1 / freqs
    periods, pgram = _significant_periods(periods, pgram)
    return periods, pgram


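# A minimal usage sketch (synthetic data): a sine with a known period should
# produce its strongest peak at (approximately) that period:
#
#     >>> t = np.arange(1000)
#     >>> x = np.sin(2 * np.pi * t / 50) + np.random.normal(scale=.1, size=t.size)
#     >>> periods, power = periodogram(x, detrend=False)
#     >>> periods[np.argmax(power)]  # expect a value near 50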
def periodogram_nonequispaced(times, x, *, freqs=None,
                              period_low=None, period_high=None,
                              n_periods=1000, detrend='linear'):
    """
    Compute the Lomb-Scargle periodogram for non-equispaced timeseries.

    Parameters
    ----------
    times: array_like
        Sample times.
    x: array_like
        A 1D signal.
    freqs: array_like, optional
        **Angular** frequencies for output periodogram.
    period_low: float
        If `freqs` not provided, the lowest period for which to look for
        periodicity. Defaults to 5th percentile of time difference between
        observations.
    period_high: float
        If `freqs` not provided, the highest period for which to look for
        periodicity. Defaults to 80th percentile of time difference of
        observations, or 200*period_low, whichever is larger.
    n_periods: int
        Number of periods between period_low and period_high to try.
    detrend: 'diff' or False or int
        Remove trend from x. If int, fit and subtract a polynomial of this
        order. See also: `statsmodels.tsa.detrend`.

    Returns
    -------
    periods: array_like
        The periods at which the spectral density is calculated.
    pgram: array_like
        Lomb-Scargle periodogram.

    Notes
    -----
    Read also:
    https://jakevdp.github.io/blog/2015/06/13/lomb-scargle-in-python/#lomb-scargle-algorithms-in-python
    """
    from scipy.signal import lombscargle
    x = _detrend(x, detrend)
    if detrend == 'diff':
        # Differencing shortens x by one; drop the first time stamp to match
        times = times[1:]

    if freqs is None:
        percentile = np.percentile(np.diff(times), [5, 80])
        if period_low is None:
            period_low = percentile[0]
        if period_high is None:
            period_high = max(200 * period_low, percentile[1])
        # Periods *from high to low* because they are reversed later!
        periods = np.linspace(period_high, period_low, n_periods)
        freqs = 2 * np.pi / periods
    else:
        periods = 2 * np.pi / freqs

    pgram = lombscargle(times, x, freqs)
    # Normalize to match the standard periodogram scaling; taken from:
    # https://jakevdp.github.io/blog/2015/06/13/lomb-scargle-in-python/#lomb-scargle-algorithms-in-python
    pgram *= 2 / (len(x) * x.std()**2)

    periods, pgram = _significant_periods(periods, pgram)
    return periods, pgram


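# A minimal usage sketch (synthetic, irregularly sampled data); the period
# search range is given explicitly since the percentile-based defaults track
# the sampling gaps, not the signal:
#
#     >>> rng = np.random.RandomState(0)
#     >>> times = np.sort(rng.uniform(0, 100, 500))
#     >>> x = np.sin(2 * np.pi * times / 10)
#     >>> periods, power = periodogram_nonequispaced(
#     ...     times, x, period_low=1, period_high=50, detrend=False)
#     >>> periods[np.argmax(power)]  # expect a value near 10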
def _significant_acf(corr, has_confint):
    """Keep only the lags at which |corr| has a local maximum; returns
    (lag, corr) rows, with matching confidence intervals if provided."""
    if has_confint:
        corr, confint = corr

    periods = argrelextrema(np.abs(corr), np.greater, order=3)[0]
    corr = corr[periods]
    if has_confint:
        confint = confint[periods]

    result = np.column_stack((periods, corr))
    if has_confint:
        result = (result, np.column_stack((periods, confint)))
    return result


def autocorrelation(x, *args, unbiased=True, nlags=None, fft=True, **kwargs):
    """
    Return autocorrelation function of signal `x`.

    Parameters
    ----------
    x: array_like
        A 1D signal.
    nlags: int
        The number of lags to calculate the correlation for
        (default: 0.9*len(x)).
    fft: bool
        Compute the ACF via FFT.
    args, kwargs
        As accepted by `statsmodels.tsa.stattools.acf`.

    Returns
    -------
    acf: array
        Autocorrelation function.
    confint: array, optional
        Confidence intervals if the `alpha` kwarg is provided.
    """
    from statsmodels.tsa.stattools import acf
    if nlags is None:
        nlags = int(.9 * len(x))
    # Note: statsmodels renamed the `unbiased` argument to `adjusted`
    # (deprecated in 0.12, removed in 0.13); adapt the call for newer versions
    corr = acf(x, *args, unbiased=unbiased, nlags=nlags, fft=fft, **kwargs)
    return _significant_acf(corr, kwargs.get('alpha'))


def partial_autocorrelation(x, *args, nlags=None, method='ldb', **kwargs):
    """
    Return partial autocorrelation function (PACF) of signal `x`.

    Parameters
    ----------
    x: array_like
        A 1D signal.
    nlags: int
        The number of lags to calculate the correlation for
        (default: min(len(x)//2 - 1, len(x) - 1)).
    args, kwargs
        As accepted by `statsmodels.tsa.stattools.pacf`.

    Returns
    -------
    acf: array
        Partial autocorrelation function.
    confint : optional
        As returned by `statsmodels.tsa.stattools.pacf`.
    """
    from statsmodels.tsa.stattools import pacf
    if nlags is None:
        nlags = min(len(x) // 2 - 1, len(x) - 1)
    corr = pacf(x, *args, nlags=nlags, method=method, **kwargs)
    return _significant_acf(corr, kwargs.get('alpha'))


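# A minimal usage sketch (synthetic AR(1) data): both functions return rows of
# [lag, correlation] for the locally-maximal lags only:
#
#     >>> rng = np.random.RandomState(0)
#     >>> x = np.zeros(200)
#     >>> for i in range(1, 200):
#     ...     x[i] = .8 * x[i - 1] + rng.normal()
#     >>> acf_peaks = autocorrelation(x)
#     >>> pacf_peaks = partial_autocorrelation(x)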
def interpolate_timeseries(data, method='linear', multivariate=False):
    """Return a new Timeseries (Table) with nan values interpolated.

    Parameters
    ----------
    data : Orange.data.Table
        A table to interpolate.
    method : str {'linear', 'cubic', 'nearest', 'mean'}
        The interpolation method to use.
    multivariate : bool
        Whether to perform multivariate (2d) interpolation first.
        Univariate interpolation of the same method is always performed as a
        final step.

    Returns
    -------
    series : Timeseries
        A table with nans in the original replaced with interpolated values.
    """
    from scipy.interpolate import griddata, interp1d
    from Orange.data import Domain
    from orangecontrib.timeseries import Timeseries

    attrs = data.domain.attributes
    cvars = data.domain.class_vars
    metas = data.domain.metas
    X = data.X.copy()
    Y = np.column_stack((data.Y,)).copy()  # make 2d
    M = data.metas.copy()

    # Interpolate discrete columns to mode/nearest value
    _x = Timeseries.from_data_table(data).time_values.astype(float)
    for A, vars in ((X, attrs),
                    (Y, cvars)):
        for i, var in enumerate(vars):
            if not var.is_discrete:
                continue
            vals = A[:, i]
            isnan = np.isnan(vals)
            if not isnan.any():
                continue
            if method == 'nearest':
                nonnan = ~isnan
                x, vals = _x[nonnan], vals[nonnan]
                # Extend the boundary values outward so that nans at the
                # very start/end don't raise a bounds error
                f = interp1d(x, vals, kind='nearest', copy=False,
                             assume_sorted=True, bounds_error=False,
                             fill_value=(vals[0], vals[-1]))
                A[isnan, i] = f(_x)[isnan]
                continue
            # Otherwise impute the most frequent value (mode)
            A[isnan, i] = np.argmax(np.bincount(vals[~isnan].astype(int)))

    # Interpolate data
    if multivariate and method != 'mean':
        for A, vars in ((X, attrs),
                        (Y, cvars)):
            is_continuous = [var.is_continuous for var in vars]
            if sum(is_continuous) < 3 or A.shape[0] < 3:
                # griddata() doesn't work with 1d data
                continue

            # Only multivariate continuous features
            Acont = A[:, is_continuous]
            isnan = np.isnan(Acont)
            if not isnan.any():
                continue
            nonnan = ~isnan
            vals = griddata(nonnan.nonzero(), Acont[nonnan], isnan.nonzero(),
                            method=method)
            Acont[isnan] = vals
            A[:, is_continuous] = Acont

    # Do the 1d interpolation anyway in case 2d left some nans
    for A in (X, Y):
        for i, col in enumerate(A.T):
            isnan = np.isnan(col)
            # There is no need to interpolate if there are no nans,
            # and interpolation requires at least two known values
            if not isnan.any() or sum(~isnan) < 2:
                continue

            # Mean interpolation
            if method == 'mean':
                A[isnan, i] = np.nanmean(col)
                continue

            nonnan = ~isnan
            f = interp1d(_x[nonnan], col[nonnan], kind=method,
                         copy=False, assume_sorted=True, bounds_error=False)
            A[isnan, i] = f(_x[isnan])

            # Nearest-interpolate any nans at the start and end of col
            # (col is a view into A, so this writes through)
            # TODO: replace nearest with linear/OLS?
            valid = (~np.isnan(col)).nonzero()[0]
            first, last = valid[0], valid[-1]
            col[:first] = col[first]
            col[last:] = col[last]

    ts = Timeseries.from_numpy(Domain(attrs, cvars, metas), X, Y, M)
    return ts


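# A minimal usage sketch (assumes `data` is an Orange Timeseries with some
# missing values):
#
#     >>> filled = interpolate_timeseries(data, method='linear')
#     >>> np.isnan(filled.X).any()
#     False

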
def seasonal_decompose(data, model='multiplicative', period=12, *, callback=None):
    """
    Return table of decomposition components of original features and
    original features seasonally adjusted.

    Parameters
    ----------
    data : Timeseries
        A table of features to decompose/adjust.
    model : str {'additive', 'multiplicative'}
        A decomposition model. See:
        https://en.wikipedia.org/wiki/Decomposition_of_time_series
    period : int
        The period length of season.
    callback : callable
        Optional callback to call (with no parameters) after each iteration.

    Returns
    -------
    table : Timeseries
        Table with columns: original series seasonally adjusted, original
        series' seasonal components, trend components, and residual components.
    """
    from operator import sub, truediv
    from Orange.data import Domain, ContinuousVariable
    from orangecontrib.timeseries import Timeseries
    from orangecontrib.timeseries.widgets.utils import available_name
    import statsmodels.api as sm

    def _interp_trend(trend):
        # The moving-average trend has nans at both ends; extrapolate them
        # with a line fitted through the first/last few valid values
        first = next(i for i, val in enumerate(trend) if val == val)
        last = trend.size - 1 - next(
            i for i, val in enumerate(trend[::-1]) if val == val)
        d = 3
        first_last = min(first + d, last)
        last_first = max(first, last - d)

        k, n = np.linalg.lstsq(
            np.column_stack((np.arange(first, first_last), np.ones(first_last - first))),
            trend[first:first_last], rcond=None)[0]
        trend[:first] = np.arange(0, first) * k + n

        k, n = np.linalg.lstsq(
            np.column_stack((np.arange(last_first, last), np.ones(last - last_first))),
            trend[last_first:last], rcond=None)[0]
        trend[last + 1:] = np.arange(last + 1, trend.size) * k + n
        return trend

    attrs = []
    X = []
    recomposition = sub if model == 'additive' else truediv
    interp_data = data.interp()
    for var in data.domain.variables:
        # Note: statsmodels renamed the `freq` argument to `period` in 0.11;
        # pass `period=period` on newer versions
        decomposed = sm.tsa.seasonal_decompose(np.ravel(interp_data[:, var]),
                                               model=model,
                                               freq=period)
        adjusted = recomposition(decomposed.observed,
                                 decomposed.seasonal)

        season = decomposed.seasonal
        trend = _interp_trend(decomposed.trend)
        resid = recomposition(adjusted, trend)

        # Re-apply nans
        isnan = np.isnan(data[:, var]).ravel()
        adjusted[isnan] = np.nan
        trend[isnan] = np.nan
        resid[isnan] = np.nan

        attrs.extend(
            ContinuousVariable(
                available_name(data.domain,
                               var.name + ' ({})'.format(transform)))
            for transform in
            ('season. adj.', 'seasonal', 'trend', 'residual')
        )
        X.extend((adjusted, season, trend, resid))

        if callback:
            callback()

    ts = Timeseries.from_numpy(Domain(attrs), np.column_stack(X))
    return ts


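# A minimal sketch of the recomposition identities used above. For the
# additive model, observed == seasonal + trend + resid, so subtracting the
# seasonal component yields the adjusted series; the multiplicative model
# divides instead. (Newer statsmodels, >= 0.11, spells the argument `period`.)
#
#     >>> import statsmodels.api as sm
#     >>> x = np.arange(1, 49) + 10 * np.sin(np.arange(48) * np.pi / 6)
#     >>> dec = sm.tsa.seasonal_decompose(x, model='additive', period=12)
#     >>> adjusted = dec.observed - dec.seasonal  # seasonally adjusted series

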
def granger_causality(data, max_lag=10, alpha=.05, *, callback=None):
    """
    Return results of Granger-causality tests.

    Parameters
    ----------
    data : Timeseries
        A table of features to compute Granger causality between.
    max_lag : int
        The maximum lag to compute Granger-causality for.
    alpha : float in (0, 1)
        Confidence of test is 1 - alpha.
    callback : callable
        A callback to call in each iteration with ratio of completion.

    Returns
    -------
    res : list of lists
        Each internal list is [lag, p-value, antecedent, consequent] where
        lag is the minimum lag at which the antecedent feature in data is
        Granger-causal for the consequent feature in data.
    """
    from statsmodels.tsa.stattools import grangercausalitytests
    from Orange.data import Table, Domain
    # TODO: use VAR Granger causality
    # TODO: consider CCM instead of/in addition to GC: https://en.wikipedia.org/wiki/Convergent_cross_mapping
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.vector_ar.var_model.VARResults.test_causality.html
    # http://statsmodels.sourceforge.net/devel/vector_ar.html#granger-causality

    data = data.interp()
    domain = [var for var in data.domain.variables if var.is_continuous]
    res = []

    step = 0
    for row_attr in domain:
        for col_attr in domain:
            if row_attr == col_attr or data.time_variable in (row_attr, col_attr):
                continue
            # grangercausalitytests() tests whether the series in the second
            # column (here row_attr) Granger-causes the one in the first
            X = Table(Domain([], [], [col_attr, row_attr], data.domain), data).metas
            try:
                tests = grangercausalitytests(X, max_lag, verbose=False)
                # Find the smallest lag at which the ssr F-test rejects the
                # null hypothesis of no Granger causality
                lag, p_val = next(
                    (
                        (lag, tests[lag][0]["ssr_ftest"][1])
                        for lag in range(1, 1 + max_lag)
                        if tests[lag][0]["ssr_ftest"][1] < alpha
                    ),
                    (None, None),
                )
            except ValueError:
                lag = p_val = None
            if lag:
                res.append([lag, p_val, row_attr.name, col_attr.name])

            step += 1
            if callback:
                # Total number of tested pairs is (n-1)(n-2): the diagonal
                # and pairs involving the time variable are skipped
                callback(step / ((len(domain) - 1) ** 2 - len(domain) + 1))
    return res


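# A minimal usage sketch (hypothetical `data`): each result row reads
# "antecedent Granger-causes consequent at this minimum lag":
#
#     >>> for lag, p, cause, effect in granger_causality(data, max_lag=5):
#     ...     print('{} -> {} (lag {}, p={:.3f})'.format(cause, effect, lag, p))

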
def moving_transform(data, spec, fixed_wlen=0):
    """
    Return data transformed according to spec.

    Parameters
    ----------
    data : Timeseries
        A table with features to transform.
    spec : list of lists
        A list of lists [feature:Variable, window_length:int, function:callable].
    fixed_wlen : int
        If not 0, then window_length in spec is disregarded and this length
        is used. Also the windows don't shift by one but instead align
        themselves side by side.

    Returns
    -------
    transformed : Timeseries
        A table of the original data and its transformations.
    """
    from itertools import chain
    from Orange.data import ContinuousVariable, Domain
    from orangecontrib.timeseries import Timeseries
    from orangecontrib.timeseries.widgets.utils import available_name
    from orangecontrib.timeseries.agg_funcs import Cumulative_sum, Cumulative_product

    X = []
    attrs = []

    for var, wlen, func in spec:
        col = np.ravel(data[:, var])

        if fixed_wlen:
            wlen = fixed_wlen

        if func in (Cumulative_sum, Cumulative_product):
            out = list(chain.from_iterable(func(col[i:i + wlen])
                                           for i in range(0, len(col), wlen)))
        else:
            # Process in reverse so that the windows align with the end of
            # the series (the more informative part) rather than its start
            col = col[::-1]
            out = [func(col[i:i + wlen])
                   for i in range(0, len(col), wlen if bool(fixed_wlen) else 1)]
            out = out[::-1]

        X.append(out)

        template = '{} ({}; {})'.format(var.name, wlen, func.__name__.lower().replace('_', ' '))
        name = available_name(data.domain, template)
        attrs.append(ContinuousVariable(name))

    dataX, dataY, dataM = data.X, data.Y, data.metas
    if fixed_wlen:
        # Keep every fixed_wlen-th row (counted from the end) to match the
        # side-by-side windows
        n = len(X[0])
        dataX = dataX[::-1][::fixed_wlen][:n][::-1]
        dataY = dataY[::-1][::fixed_wlen][:n][::-1]
        dataM = dataM[::-1][::fixed_wlen][:n][::-1]

    ts = Timeseries.from_numpy(Domain(data.domain.attributes + tuple(attrs),
                                      data.domain.class_vars,
                                      data.domain.metas),
                               np.column_stack(
                                   (dataX, np.column_stack(X))) if X else dataX,
                               dataY, dataM)
    ts.time_variable = data.time_variable
    return ts


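# A minimal usage sketch (hypothetical variable name 'sales'; assumes the
# add-on's `Mean` aggregate from `orangecontrib.timeseries.agg_funcs`): add a
# 5-observation moving mean alongside the original columns:
#
#     >>> from orangecontrib.timeseries.agg_funcs import Mean
#     >>> ts2 = moving_transform(data, [[data.domain['sales'], 5, Mean]])

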
def model_evaluation(data, models, n_folds, forecast_steps, *, callback=None):
    """
    Evaluate models on data.

    Parameters
    ----------
    data : Timeseries
        The timeseries to model. Must have a class variable that is used
        for prediction and scoring.
    models : list
        List of models (objects with fit() and predict() methods) to try.
    n_folds : int
        Number of iterations.
    forecast_steps : int
        Number of forecast steps at each iteration.
    callback : callable, optional
        Optional argument-less callback to call after each iteration.

    Returns
    -------
    results : list of lists
        A table with horizontal and vertical headers and results. Print it
        to see it.
    """
    if not data.domain.class_var:
        raise ValueError('Data requires a target variable. Use Select Columns '
                         'widget to set one variable as target.')
    max_lag = max(m.max_order for m in models)
    if n_folds * forecast_steps + max_lag > len(data):
        raise ValueError(
            'Supplied time series is too short for this many folds '
            '/ step size. Retry with fewer iterations.')

    def _score_vector(model, true, pred):
        true = np.asanyarray(true)
        pred = np.asanyarray(pred)
        nonnan = ~np.isnan(true)
        if not nonnan.all():
            pred = pred[nonnan]
            true = true[nonnan]
        row = [str(getattr(model, 'name', model))]
        if pred.size:
            row.extend(score(true, pred) for score in (rmse, mae, mape, pocid, r2))
        else:
            row.extend(['err'] * 5)
        try:
            row.extend([model.results.aic, model.results.bic])
        except Exception:
            row.extend(['err'] * 2)
        return row

    res = [['Model', 'RMSE', 'MAE', 'MAPE', 'POCID', 'R²', 'AIC', 'BIC']]
    interp_data = data.interp()
    true_y = np.ravel(data[:, data.domain.class_var])

    for model in models:
        full_true = []
        full_pred = []
        for fold in range(1, n_folds + 1):
            train_end = -fold * forecast_steps
            try:
                model.fit(interp_data[:train_end])
                pred, _, _ = model.predict(forecast_steps)
            except Exception:
                continue
            finally:
                if callback:
                    callback()

            # Sliced twice because a single slice breaks when the upper bound
            # is 0, e.g. true_y[-3:0] == []
            full_true.extend(true_y[train_end:][:forecast_steps])
            full_pred.extend(np.c_[pred][:, 0])  # Only interested in the class var
            assert len(full_true) == len(full_pred)

        res.append(_score_vector(model, full_true, full_pred))

        # Score in-sample fitted values
        try:
            model.fit(interp_data)
            fittedvalues = model.fittedvalues()
            if fittedvalues.ndim > 1:
                fittedvalues = fittedvalues[..., 0]
        except Exception:
            row = ['err'] * 8  # 8 cells to match the header row
        else:
            row = _score_vector(model, true_y, fittedvalues)
        row[0] = row[0] + ' (in-sample)'
        res.append(row)
    return res

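# A minimal usage sketch (hypothetical `data`; assumes the add-on's ARIMA
# model class): evaluate 10 folds of 3-step-ahead forecasts and print the
# resulting score table:
#
#     >>> from orangecontrib.timeseries import ARIMA
#     >>> results = model_evaluation(data, [ARIMA((2, 1, 0))],
#     ...                            n_folds=10, forecast_steps=3)
#     >>> for row in results:
#     ...     print(row)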

SECONDS = 126230400  # = 1461 * 86400: seconds in four years, incl. one leap day
DAYS = 1461          # days in four years, incl. one leap day


def timestamp(dt):
    """Return the POSIX timestamp of `dt`, even where `datetime.timestamp()`
    overflows the platform's supported time range."""
    try:
        ts = dt.timestamp()
    except OverflowError:
        if not dt.tzinfo:
            # treat datetime as in local timezone
            dt = dt.astimezone()
        # compute timestamp manually
        ts = (dt - datetime.datetime(1970, 1, 1, tzinfo=timezone.utc)) /\
             timedelta(seconds=1)
    return ts


def fromtimestamp(ts, tz=None):
    """Convert a POSIX timestamp to a datetime, shifting by whole four-year
    cycles where the platform's `fromtimestamp()` can't represent `ts`."""
    try:
        dt = datetime.datetime.fromtimestamp(ts, tz=tz)
    except OSError:
        # Shift ts into the supported range by k four-year cycles, convert,
        # then shift the result back by the same amount of days
        k = int(-ts // SECONDS + 1)  # Cast avoids "unsupported type for timedelta days: numpy.int32"
        dt = datetime.datetime.fromtimestamp(ts + k * SECONDS, tz=tz) - \
             timedelta(days=k * DAYS)
    return dt

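
# A minimal round-trip sketch for dates outside the platform's native range
# (e.g. pre-1970 timestamps on Windows):
#
#     >>> dt = datetime.datetime(1900, 1, 1, tzinfo=timezone.utc)
#     >>> fromtimestamp(timestamp(dt), tz=timezone.utc)
#     datetime.datetime(1900, 1, 1, 0, 0, tzinfo=datetime.timezone.utc)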