1from itertools import chain
2
3import numpy as np
4from scipy.stats import norm
5import statsmodels.api as sm
6
7from Orange.data import Table
8from orangecontrib.timeseries import Timeseries, rmse, mape, mae, pocid, r2
9
10
11IC_MAGIC = 'magic'
12
13
14class NotFittedError(ValueError, AttributeError):
15    """Raised when model predictions made without fitting"""
16
17
18class _BaseModel:
19    REQUIRES_STATIONARY = True
20    SUPPORTS_VECTOR = False
21
22    _NOT_FITTED = NotFittedError('Model must be fitted first (see fit() method)')
23
24    __wrapped__ = None
25
26    def __init__(self):
27        self.model = None
28        self.results = None
29        self.order = ()
30        self._model_kwargs = dict(missing='raise')
31        self._fit_kwargs = dict()
32        self._endog = None
33
34        self._table_var_names = None
35        self._table_name = None
36        self._table_timevar = None
37        self._table_timevals = None
38
39    def _before_init(self, endog, exog):
40        """
41        This method is called before the statsmodels model is init. It can
42        last-minute transform the endog and exog variables, or it can set
43        constructor parameters depending on their values.
44        """
45        return endog, exog
46
47    def _before_fit(self, endog, exog):
48        """
49        This method is called before fit() with the same parameters. It
50        can be used to set last-minute, endog or exog-dependent parameters.
51        Override it if you need it.
52        """
53
54    def _fittedvalues(self):
55        """
56        This was needed to override for ARIMA as its fittedvalues returned
57        unintegraded series instead.
58        """
59        return self.results.fittedvalues
60
61    def fittedvalues(self, as_table=False):
62        """
63        Return predictions for in-sample observations, i.e. the model's
64        approximations of the original training values.
65
66        Parameters
67        ----------
68        as_table : bool
69            If True, return results as an Orange.data.Table.
70
71        Returns
72        -------
73        fitted_values : array_like
74        """
75        if self.results is None:
76            raise self._NOT_FITTED
77        values = self._fittedvalues()
78        if as_table:
79            values = self._as_table(values, 'fitted')
80        return values
81
82    def _as_table(self, values, what):
83        """Used for residuals() and fittedvalues() methods."""
84        from Orange.data import Domain, ContinuousVariable
85        attrs = []
86        n_vars = values.shape[1] if values.ndim == 2 else 1
87        if n_vars == 1:
88            values = np.atleast_2d(values).T
89        tvar = None
90        # If 1d, time var likely not already present, so lets add it if possible
91        if n_vars == 1 and self._table_timevar:
92            values = np.column_stack((self._table_timevals[-values.shape[0]:],
93                                      values))
94            tvar = self._table_timevar
95            attrs.append(tvar)
96        for i, name in zip(range(n_vars),
97                           self._table_var_names or range(n_vars)):
98            attrs.append(ContinuousVariable('{} ({})'.format(name, what)))
99
100            # Make the fitted time variable time variable
101            if self._table_timevar and self._table_timevar.name == name:
102                tvar = attrs[-1]
103
104        table = Timeseries.from_numpy(Domain(attrs), values)
105        table.time_variable = tvar
106        table.name = (self._table_name or '') + '({} {})'.format(self, what)
107        return table
108
109    def residuals(self, as_table=True):
110        """
111        Return residuals (prediction errors) for in-sample observations,
112
113        Parameters
114        ----------
115        as_table : bool
116            If True, return results as an Orange.data.Table.
117
118        Returns
119        -------
120        residuals : array_like
121        """
122        if self.results is None:
123            raise self._NOT_FITTED
124        resid = self.results.resid
125        if as_table:
126            resid = self._as_table(resid, 'residuals')
127        return resid
128
129    def _predict(self, steps, exog, alpha):
130        """
131        Return forecast predictions (along with confidence intervals)
132        for steps ahead given exog values (or None if not needed).
133        """
134        raise NotImplementedError
135
136    def _orange_arrays(self, table):
137        self._table_var_names = [v.name for v in chain(table.domain.class_vars,
138                                                       table.domain.attributes)]
139        self._table_name = table.name
140        if getattr(table, 'time_variable', None):
141            self._table_timevar = table.time_variable
142            self._table_timevals = table.time_values
143        y = table.Y.ravel()
144        X = table.X
145        if y.size:
146            defined_range = (np.arange(1, len(y) + 1) * ~np.isnan(y)).max()
147            y = y[:defined_range]
148            X = X[:defined_range]
149        return y, X
150
151    def fit(self, endog, exog=None):
152        """
153        Fit the model to endogenous variable endog, optionally given
154        exogenous column variables exog.
155
156        Parameters
157        ----------
158        endog : array_like
159            Dependent variable (y) of shape ``[nobs, k]``
160            (``k = 1`` for a single variable; ``k > 1`` for vector models).
161        exog : array_like
162            If model supports it, the additional independent variables (X) of
163            shape ``[nobs, k_vars]``.
164
165        Returns
166        -------
167        fitted_model
168        """
169        if isinstance(endog, Table):
170            assert exog is None
171            endog, exog = self._orange_arrays(endog)
172
173        if not endog.size:
174            if not exog.size:
175                raise ValueError('Input series are empty. Nothing to learn.')
176            endog, exog = exog, None
177
178        endog, exog = self._before_init(endog, exog)
179        self._endog = endog
180        kwargs = self._model_kwargs.copy()
181        kwargs.update(endog=endog)
182        if exog is not None:
183            kwargs.update(exog=exog)
184        model = self.model = self.__wrapped__(**kwargs)
185
186        self._before_fit(endog, exog)
187        kwargs = self._fit_kwargs.copy()
188        self.results = model.fit(**kwargs)
189        return self
190
191    def errors(self):
192        """Return dict of RMSE/MAE/MAPE/POCID/R² errors on in-sample, fitted values
193
194        Returns
195        -------
196        errors : dict
197            Mapping of error measure str -> error value.
198        """
199        if self.results is None:
200            raise self._NOT_FITTED
201        true = self._endog
202        pred = self._fittedvalues()
203        return dict(r2=r2(true, pred),
204                    mae=mae(true, pred),
205                    rmse=rmse(true, pred),
206                    mape=mape(true, pred),
207                    pocid=pocid(true, pred))
208
209    def _predict_as_table(self, prediction, confidence):
210        from Orange.data import Domain, ContinuousVariable
211        means, lows, highs = [], [], []
212        n_vars = prediction.shape[2] if len(prediction.shape) > 2 else 1
213        for i, name in zip(range(n_vars),
214                           self._table_var_names or range(n_vars)):
215            mean = ContinuousVariable('{} (forecast)'.format(name))
216            low = ContinuousVariable('{} ({:d}%CI low)'.format(name, confidence))
217            high = ContinuousVariable('{} ({:d}%CI high)'.format(name, confidence))
218            low.ci_percent = high.ci_percent = confidence
219            mean.ci_attrs = (low, high)
220            means.append(mean)
221            lows.append(low)
222            highs.append(high)
223        domain = Domain(means + lows + highs)
224        X = np.column_stack(prediction)
225        table = Timeseries.from_numpy(domain, X)
226        table.name = (self._table_name or '') + '({} forecast)'.format(self)
227        return table
228
229    def predict(self, steps=1, exog=None, *, alpha=.05, as_table=False):
230        """Make the forecast of future values.
231
232        Parameters
233        ----------
234        steps : int
235            The number of steps to make forecast for.
236        exog : array_like
237            The exogenous variables some models require.
238        alpha : float
239            Calculate and return (1-alpha)100% confidence intervals.
240        as_table : bool
241            If True, return results as an Orange.data.Table.
242
243        Returns
244        -------
245        forecast : array_like
246            (forecast, low, high)
247        """
248        if self.results is None:
249            raise self._NOT_FITTED
250        prediction = self._predict(steps, exog, alpha)
251        if as_table:
252            prediction = self._predict_as_table(prediction, int((1 - alpha) * 100))
253        return prediction
254
255    def __str__(self):
256        return str(self.__wrapped__)
257
258    @property
259    def max_order(self):
260        return max(self.order, default=0)
261
262    def clear(self):
263        """Reset (unfit) the current model"""
264        self.model = None
265        self.results = None
266        self._endog = None
267        self._table_var_names = None
268        self._table_name = None
269        self._table_timevar = None
270        self._table_timevals = None
271
272    def copy(self):
273        """Copy the current model"""
274        from copy import deepcopy
275        return deepcopy(self)
276
277
278class ARIMA(_BaseModel):
279    """Autoregressive integrated moving average (ARIMA) model
280
281    An auto regression (AR) and moving average (MA) model with differencing.
282
283    If exogenous variables are provided in fit() method, this becomes an
284    ARIMAX model.
285
286    Parameters
287    ----------
288    order : tuple (p, d, q)
289        Tuple of three non-negative integers: (p) the AR order, (d) the
290        degree of differencing, and (q) the order of MA model.
291        If d = 0, this becomes an ARMA model.
292
293    Returns
294    -------
295    unfitted_model
296    """
297    REQUIRES_STATIONARY = False
298    __wrapped__ = sm.tsa.ARIMA
299
300    def __init__(self, order=(1, 0, 0), use_exog=False):
301        super().__init__()
302        self.order = order
303        self.use_exog = use_exog
304        self._model_kwargs.update(order=order)
305        self._fit_kwargs.update(disp=0,  # Don't print shit
306                                verbose=False)
307
308    def __str__(self):
309        return '{}({})'.format('AR{}MA{}'.format('I' if self.order[1] else '',
310                                                 'X' if self.use_exog else ''),
311                               ','.join(map(str, self.order)))
312
313    def _predict(self, steps, exog, alpha):
314        forecast, _, confint = self.results.forecast(steps, exog, alpha)
315        return np.c_[forecast, confint].T
316
317    def _before_init(self, endog, exog):
318        exog = exog if self.use_exog else None
319        if len(endog) == 0:
320            raise ValueError('Need an endogenous (target) variable to fit')
321        return endog, exog
322
323    def _fittedvalues(self):
324        # Statsmodels supports different args whether series is
325        # differentiated (order has d) or not. -- stupid statsmodels
326        kwargs = dict(typ='levels') if self.order[1] > 0 else {}
327        return self.results.predict(**kwargs)
328
329
330class VAR(_BaseModel):
331    """Vector auto-regression (VAR) model
332
333    A multivariate auto regression model of multiple inter-dependent variables.
334
335    Parameters
336    ----------
337    maxlags : int
338        The exact number of lags (order) to construct the model with or
339        the maximum number of lags to check for order selection (see `ic`
340        parameter). Defaults to ``12*(nobs/10)**.5``.
341    ic : {‘aic’, ‘fpe’, ‘hqic’, ‘bic’, 'magic', None}
342        The information criterion to optimize order (`maxlags`) on.
343    trend : {'c', 'ct', 'ctt', 'nc'}
344        Constant (c); constant and trend (ct); constant, linear and
345        quadratic trend (ctt); no constant, no trend (nc). These are
346        prepended to the columns of the data set.
347
348    Returns
349    -------
350    unfitted_model
351    """
352    SUPPORTS_VECTOR = True
353    __wrapped__ = sm.tsa.VAR
354
355    MAX_LAGS = lambda arr: 12 * (len(arr) / 10) ** .5
356
357    def __init__(self, maxlags=None, ic=None, trend='c'):
358        super().__init__()
359        self.ic = ic
360        self.trend = trend
361        self._ic_magic = ic == IC_MAGIC
362        self.order = (maxlags,)
363        self._maxlags = self.MAX_LAGS if maxlags is None else maxlags
364        self._fit_kwargs.update(maxlags=maxlags, trend=trend, ic=ic)
365
366    def __str__(self):
367        args = ('auto' if callable(self._maxlags) else self._maxlags,
368                self.ic,
369                self.trend if self.trend != 'c' else None)
370        return '{}({})'.format(self.__wrapped__.__name__,
371                               ','.join(map(str, filter(None, args))))
372
373    def _before_init(self, endog, exog):
374        if exog is not None:
375            endog = np.column_stack((endog, exog)) if endog.size else exog
376        return endog, None
377
378    def _before_fit(self, endog, exog=None):
379        maxlags = self._maxlags
380
381        if callable(maxlags):
382            maxlags = maxlags(endog)
383            self._fit_kwargs.update(maxlags=maxlags)
384            self.order = (maxlags,)
385
386        if self._ic_magic:
387            ic_results = self.model.select_order(maxlags)
388            lags = sum(ic_results.values()) // len(ic_results)
389            self.order = (lags,)
390            self._fit_kwargs.update(maxlags=lags, ic=None)
391
392    def _predict(self, steps, exog, alpha):
393        assert 0 < alpha < 1
394        y = (exog if exog is not None else self._endog)[-self.results.k_ar:]
395        forecast = self.results.forecast(y, steps)
396        #  FIXME: The following is adapted from statsmodels's
397        # VAR.forecast_interval() as the original doesn't work
398        q = norm.ppf(1 - alpha / 2)
399        sigma = np.sqrt(np.abs(np.diagonal(self.results.mse(steps), axis1=2)))
400        err = q * sigma
401        return np.asarray([forecast, forecast - err, forecast + err])
402
403