1import datetime 2from datetime import timedelta, timezone 3from numbers import Number 4 5import numpy as np 6from scipy.signal import argrelextrema 7 8 9def _parse_args(args, kwargs, names, *defaults): 10 assert len(defaults) == len(names) 11 result = list(args) + [None] * (len(names) - len(args)) 12 for i, (name, res) in enumerate(zip(names, result)): 13 result[i] = kwargs.get(name, res if res is not None else defaults[i]) 14 return result 15 16 17def r2(true, pred): 18 """Coefficient of determination (R²)""" 19 nobs = len(pred) 20 true = true[-nobs:] 21 return 1 - np.sum((true - pred)**2) / np.sum((true - np.mean(true))**2) 22 23 24def rmse(true, pred): 25 """Root mean squared error""" 26 nobs = len(pred) 27 return np.sqrt(np.sum((true[-nobs:] - pred) ** 2) / nobs) 28 29 30def mape(true, pred): 31 """Mean absolute percentage error""" 32 nobs = len(pred) 33 return np.mean(np.abs(true[-nobs:] - pred)) / np.abs(true).mean() 34 35 36def mae(true, pred): 37 """Median absolute error""" 38 nobs = len(pred) 39 return np.median(np.abs(true[-nobs:] - pred)) 40 41 42def pocid(true, pred): 43 """Prediction on change of direction""" 44 nobs = len(pred) 45 return 100 * np.mean((np.diff(true[-nobs:]) * np.diff(pred)) > 0) 46 47 48def _detrend(x, type): 49 if type == 'diff': 50 x = np.diff(x) 51 elif isinstance(type, str): 52 type = dict(constant=0, linear=1, quadratic=2, cubic=3)[type] 53 if isinstance(type, Number): 54 import statsmodels.api as sm 55 x = sm.tsa.detrend(x, type) 56 return x 57 58 59def _significant_periods(periods, pgram): 60 if pgram.size == 0: 61 return periods, pgram 62 63 # Order ascending 64 periods = periods[::-1] 65 pgram = pgram[::-1] 66 # Scale and extract significant 67 pgram = (pgram - pgram.min()) / pgram.ptp() 68 significant = argrelextrema(pgram, np.greater, order=5) 69 return periods[significant], pgram[significant] 70 71 72def periodogram(x, *args, detrend='diff', **kwargs): 73 """ 74 Return periodogram of signal `x`. 75 76 Parameters 77 ---------- 78 x: array_like 79 A 1D signal. 80 detrend: 'diff' or False or int 81 Remove trend from x. If int, fit and subtract a polynomial of this 82 order. See also: `statsmodels.tsa.detrend`. 83 args, kwargs: 84 As accepted by `scipy.signal.periodogram`. 85 86 Returns 87 ------- 88 periods: array_like 89 The periods at which the spectral density is calculated. 90 pgram: array_like 91 Power spectral density of x. 92 """ 93 from scipy.signal import periodogram 94 x = _detrend(x, detrend) 95 freqs, pgram = periodogram(x, *args, detrend=False, **kwargs) 96 97 SKIP = len(x) // 1000 # HACK: For long series, the first few frequency/period values are "unstable". 98 freqs, pgram = freqs[SKIP:], pgram[SKIP:] 99 100 periods = 1 / freqs 101 periods, pgram = _significant_periods(periods, pgram) 102 return periods, pgram 103 104 105def periodogram_nonequispaced(times, x, *, freqs=None, 106 period_low=None, period_high=None, 107 n_periods=1000, detrend='linear'): 108 """ 109 Compute the Lomb-Scargle periodogram for non-equispaced timeseries. 110 111 Parameters 112 ---------- 113 times: array_like 114 Sample times. 115 x: array_like 116 A 1D signal. 117 freqs: array_like, optional 118 **Angular** frequencies for output periodogram. 119 period_low: float 120 If `freqs` not provided, the lowest period for which to look for 121 periodicity. Defaults to 5th percentile of time difference between 122 observations. 123 period_high: float 124 If `freqs` not provided, the highest period for which to look for 125 periodicity. Defaults to 80th percentile of time difference of 126 observations, or 200*period_low, whichever is larger. 127 n_periods: int 128 Number of periods between period_low and period_high to try. 129 detrend: 'diff' or False or int 130 Remove trend from x. If int, fit and subtract a polynomial of this 131 order. See also: `statsmodels.tsa.detrend`. 132 133 Returns 134 ------- 135 periods: array_like 136 The periods at which the spectral density is calculated. 137 pgram: array_like 138 Lomb-Scargle periodogram. 139 140 Notes 141 ----- 142 Read also: 143 https://jakevdp.github.io/blog/2015/06/13/lomb-scargle-in-python/#lomb-scargle-algorithms-in-python 144 """ 145 from scipy.signal import lombscargle 146 x = _detrend(x, detrend) 147 if detrend == 'diff': 148 times = times[1:] 149 150 if freqs is None: 151 percentile = np.percentile(np.diff(times), [5, 80]) 152 if period_low is None: 153 period_low = percentile[0] 154 if period_high is None: 155 period_high = max(200 * period_low, percentile[1]) 156 # Periods *from high to low* because they are reversed later! 157 periods = np.linspace(period_high, period_low, n_periods) 158 freqs = 2 * np.pi / periods 159 else: 160 periods = 2 * np.pi / freqs 161 162 pgram = lombscargle(times, x, freqs) 163 # Normalize -- I have no idea what I am doing; took this from 164 # https://jakevdp.github.io/blog/2015/06/13/lomb-scargle-in-python/#lomb-scargle-algorithms-in-python 165 pgram *= 2 / (len(x) * x.std()**2) 166 167 periods, pgram = _significant_periods(periods, pgram) 168 return periods, pgram 169 170 171def _significant_acf(corr, has_confint): 172 if has_confint: 173 corr, confint = corr 174 175 periods = argrelextrema(np.abs(corr), np.greater, order=3)[0] 176 corr = corr[periods] 177 if has_confint: 178 confint = confint[periods] 179 180 result = np.column_stack((periods, corr)) 181 if has_confint: 182 result = (result, np.column_stack((periods, confint))) 183 return result 184 185 186def autocorrelation(x, *args, unbiased=True, nlags=None, fft=True, **kwargs): 187 """ 188 Return autocorrelation function of signal `x`. 189 190 Parameters 191 ---------- 192 x: array_like 193 A 1D signal. 194 nlags: int 195 The number of lags to calculate the correlation for (default .9*len(x)) 196 fft: bool 197 Compute the ACF via FFT. 198 args, kwargs 199 As accepted by `statsmodels.tsa.stattools.acf`. 200 201 Returns 202 ------- 203 acf: array 204 Autocorrelation function. 205 confint: array, optional 206 Confidence intervals if alpha kwarg provided. 207 """ 208 from statsmodels.tsa.stattools import acf 209 if nlags is None: 210 nlags = int(.9 * len(x)) 211 corr = acf(x, *args, unbiased=unbiased, nlags=nlags, fft=fft, **kwargs) 212 return _significant_acf(corr, kwargs.get('alpha')) 213 214 215def partial_autocorrelation(x, *args, nlags=None, method='ldb', **kwargs): 216 """ 217 Return partial autocorrelation function (PACF) of signal `x`. 218 219 Parameters 220 ---------- 221 x: array_like 222 A 1D signal. 223 nlags: int 224 The number of lags to calculate the correlation for 225 (default: min(len(x)//2 - 1, len(x) - 1)) 226 args, kwargs 227 As accepted by `statsmodels.tsa.stattools.pacf`. 228 229 Returns 230 ------- 231 acf: array 232 Partioal autocorrelation function. 233 confint : optional 234 As returned by `statsmodels.tsa.stattools.pacf`. 235 """ 236 from statsmodels.tsa.stattools import pacf 237 if nlags is None: 238 nlags = min(len(x)//2 - 1, len(x) - 1) 239 corr = pacf(x, *args, nlags=nlags, method=method, **kwargs) 240 return _significant_acf(corr, kwargs.get('alpha')) 241 242 243def interpolate_timeseries(data, method='linear', multivariate=False): 244 """Return a new Timeseries (Table) with nan values interpolated. 245 246 Parameters 247 ---------- 248 data : Orange.data.Table 249 A table to interpolate. 250 method : str {'linear', 'cubic', 'nearest', 'mean'} 251 The interpolation method to use. 252 multivariate : bool 253 Whether to perform multivariate (2d) interpolation first. 254 Univariate interpolation of same method is always performed as a 255 final step. 256 257 Returns 258 ------- 259 series : Timeseries 260 A table with nans in original replaced with interpolated values. 261 """ 262 from scipy.interpolate import griddata, interp1d 263 from Orange.data import Domain 264 from orangecontrib.timeseries import Timeseries 265 266 attrs = data.domain.attributes 267 cvars = data.domain.class_vars 268 metas = data.domain.metas 269 X = data.X.copy() 270 Y = np.column_stack((data.Y,)).copy() # make 2d 271 M = data.metas.copy() 272 273 # Interpolate discrete columns to mode/nearest value 274 _x = Timeseries.from_data_table(data).time_values.astype(float) 275 for A, vars in ((X, attrs), 276 (Y, cvars)): 277 for i, var in enumerate(vars): 278 if not var.is_discrete: 279 continue 280 vals = A[:, i] 281 isnan = np.isnan(vals) 282 if not isnan.any(): 283 continue 284 if method == 'nearest': 285 nonnan = ~isnan 286 x, vals = _x[nonnan], vals[nonnan] 287 f = interp1d(x, vals, kind='nearest', copy=False, assume_sorted=True) 288 A[isnan, i] = f(_x)[isnan] 289 continue 290 A[isnan, i] = np.argmax(np.bincount(vals[~isnan].astype(int))) 291 292 # Interpolate data 293 if multivariate and method != 'mean': 294 for A, vars in ((X, attrs), 295 (Y, cvars)): 296 is_continuous = [var.is_continuous for var in vars] 297 if sum(is_continuous) < 3 or A.shape[0] < 3: 298 # griddata() doesn't work with 1d data 299 continue 300 301 # Only multivariate continuous features 302 Acont = A[:, is_continuous] 303 isnan = np.isnan(Acont) 304 if not isnan.any(): 305 continue 306 nonnan = ~isnan 307 vals = griddata(nonnan.nonzero(), Acont[nonnan], isnan.nonzero(), 308 method=method) 309 Acont[isnan] = vals 310 A[:, is_continuous] = Acont 311 312 # Do the 1d interpolation anyway in case 2d left some nans 313 for A in (X, Y): 314 for i, col in enumerate(A.T): 315 isnan = np.isnan(col) 316 # there is no need to interpolate if there are no nans 317 # there needs to be at least two numbers 318 if not isnan.any() or sum(~isnan) < 2: 319 continue 320 321 # Mean interpolation 322 if method == 'mean': 323 A[isnan, i] = np.nanmean(col) 324 continue 325 326 nonnan = ~isnan 327 f = interp1d(_x[nonnan], col[nonnan], kind=method, 328 copy=False, assume_sorted=True, bounds_error=False) 329 A[isnan, i] = f(_x[isnan]) 330 331 # nearest-interpolate any nans at vals start and end 332 # TODO: replace nearest with linear/OLS? 333 valid = (~np.isnan(col)).nonzero()[0] 334 first, last = valid[0], valid[-1] 335 col[:first] = col[first] 336 col[last:] = col[last] 337 338 ts = Timeseries.from_numpy(Domain(attrs, cvars, metas), X, Y, M) 339 return ts 340 341 342def seasonal_decompose(data, model='multiplicative', period=12, *, callback=None): 343 """ 344 Return table of decomposition components of original features and 345 original features seasonally adjusted. 346 347 Parameters 348 ---------- 349 data : Timeseries 350 A table of featres to decompose/adjust. 351 model : str {'additive', 'multiplicative'} 352 A decompostition model. See: 353 https://en.wikipedia.org/wiki/Decomposition_of_time_series 354 period : int 355 The period length of season. 356 callback : callable 357 Optional callback to call (with no parameters) after each iteration. 358 359 Returns 360 ------- 361 table : Timeseries 362 Table with columns: original series seasonally adjusted, original 363 series' seasonal components, trend components, and residual components. 364 """ 365 from operator import sub, truediv 366 from Orange.data import Domain, ContinuousVariable 367 from orangecontrib.timeseries import Timeseries 368 from orangecontrib.timeseries.widgets.utils import available_name 369 import statsmodels.api as sm 370 371 def _interp_trend(trend): 372 first = next(i for i, val in enumerate(trend) if val == val) 373 last = trend.size - 1 - next( 374 i for i, val in enumerate(trend[::-1]) if val == val) 375 d = 3 376 first_last = min(first + d, last) 377 last_first = max(first, last - d) 378 379 k, n = np.linalg.lstsq( 380 np.column_stack((np.arange(first, first_last), np.ones(first_last - first))), 381 trend[first:first_last])[0] 382 trend[:first] = np.arange(0, first) * k + n 383 384 k, n = np.linalg.lstsq( 385 np.column_stack((np.arange(last_first, last), np.ones(last - last_first))), 386 trend[last_first:last])[0] 387 trend[last + 1:] = np.arange(last + 1, trend.size) * k + n 388 return trend 389 390 attrs = [] 391 X = [] 392 recomposition = sub if model == 'additive' else truediv 393 interp_data = data.interp() 394 for var in data.domain.variables: 395 decomposed = sm.tsa.seasonal_decompose(np.ravel(interp_data[:, var]), 396 model=model, 397 freq=period) 398 adjusted = recomposition(decomposed.observed, 399 decomposed.seasonal) 400 401 season = decomposed.seasonal 402 trend = _interp_trend(decomposed.trend) 403 resid = recomposition(adjusted, trend) 404 405 # Re-apply nans 406 isnan = np.isnan(data[:, var]).ravel() 407 adjusted[isnan] = np.nan 408 trend[isnan] = np.nan 409 resid[isnan] = np.nan 410 411 attrs.extend( 412 ContinuousVariable( 413 available_name(data.domain, 414 var.name + ' ({})'.format(transform))) 415 for transform in 416 ('season. adj.', 'seasonal', 'trend', 'residual') 417 ) 418 X.extend((adjusted, season, trend, resid)) 419 420 if callback: 421 callback() 422 423 ts = Timeseries.from_numpy(Domain(attrs), np.column_stack(X)) 424 return ts 425 426 427def granger_causality(data, max_lag=10, alpha=.05, *, callback=None): 428 """ 429 Return results of Granger-causality tests. 430 431 Parameters 432 ---------- 433 data : Timeseries 434 A table of features to compute Granger causality between. 435 max_lag : int 436 The maximum lag to compute Granger-causality for. 437 alpha : float in (0, 1) 438 Confidence of test is 1 - alpha. 439 callback : callable 440 A callback to call in each iteration with ratio of completion. 441 442 Returns 443 ------- 444 res : list of lists 445 Each internal list is [lag, p-value, antecedent, consequent] where 446 lag is the minimum lag at which antecedent feature in data is 447 Granger-causal for the consequent feature in data. 448 """ 449 from statsmodels.tsa.stattools import grangercausalitytests 450 from Orange.data import Table, Domain 451 # TODO: use VAR Granger causality 452 # TODO: consider CCM in stead/addition of GC: https://en.wikipedia.org/wiki/Convergent_cross_mapping 453 # http://statsmodels.sourceforge.net/devel/generated/statsmodels.tsa.vector_ar.var_model.VARResults.test_causality.html 454 # http://statsmodels.sourceforge.net/devel/vector_ar.html#granger-causality 455 456 data = data.interp() 457 domain = [var for var in data.domain.variables if var.is_continuous] 458 res = [] 459 460 step = 0 461 for row_attr in domain: 462 for col_attr in domain: 463 if row_attr == col_attr or data.time_variable in (row_attr, col_attr): 464 continue 465 X = Table(Domain([], [], [col_attr, row_attr], data.domain), data).metas 466 try: 467 tests = grangercausalitytests(X, max_lag, verbose=False) 468 lag, p_val = next( 469 ( 470 (lag, tests[lag][0]["ssr_ftest"][1]) 471 for lag in range(1, 1 + max_lag) 472 if tests[lag][0]["ssr_ftest"][1] < alpha 473 ), 474 (None, None), 475 ) 476 except ValueError: 477 lag = p_val = None 478 if lag: 479 res.append([lag, p_val, row_attr.name, col_attr.name]) 480 481 step += 1 482 if callback: 483 callback(step / ((len(domain) - 1) ** 2 - len(domain) + 1)) 484 return res 485 486 487def moving_transform(data, spec, fixed_wlen=0): 488 """ 489 Return data transformed according to spec. 490 491 Parameters 492 ---------- 493 data : Timeseries 494 A table with features to transform. 495 spec : list of lists 496 A list of lists [feature:Variable, window_length:int, function:callable]. 497 fixed_wlen : int 498 If not 0, then window_length in spec is disregarded and this length 499 is used. Also the windows don't shift by one but instead align 500 themselves side by side. 501 502 Returns 503 ------- 504 transformed : Timeseries 505 A table of original data its transformations. 506 """ 507 from itertools import chain 508 from Orange.data import ContinuousVariable, Domain 509 from orangecontrib.timeseries import Timeseries 510 from orangecontrib.timeseries.widgets.utils import available_name 511 from orangecontrib.timeseries.agg_funcs import Cumulative_sum, Cumulative_product 512 513 X = [] 514 attrs = [] 515 516 for var, wlen, func in spec: 517 col = np.ravel(data[:, var]) 518 519 if fixed_wlen: 520 wlen = fixed_wlen 521 522 if func in (Cumulative_sum, Cumulative_product): 523 out = list(chain.from_iterable(func(col[i:i + wlen]) 524 for i in range(0, len(col), wlen))) 525 else: 526 # In reverse cause lazy brain. Also prefer informative ends, not beginnings as much 527 col = col[::-1] 528 out = [func(col[i:i + wlen]) 529 for i in range(0, len(col), wlen if bool(fixed_wlen) else 1)] 530 out = out[::-1] 531 532 X.append(out) 533 534 template = '{} ({}; {})'.format(var.name, wlen, func.__name__.lower().replace('_', ' ')) 535 name = available_name(data.domain, template) 536 attrs.append(ContinuousVariable(name)) 537 538 dataX, dataY, dataM = data.X, data.Y, data.metas 539 if fixed_wlen: 540 n = len(X[0]) 541 dataX = dataX[::-1][::fixed_wlen][:n][::-1] 542 dataY = dataY[::-1][::fixed_wlen][:n][::-1] 543 dataM = dataM[::-1][::fixed_wlen][:n][::-1] 544 545 ts = Timeseries.from_numpy(Domain(data.domain.attributes + tuple(attrs), 546 data.domain.class_vars, 547 data.domain.metas), 548 np.column_stack( 549 (dataX, np.column_stack(X))) if X else dataX, 550 dataY, dataM) 551 ts.time_variable = data.time_variable 552 return ts 553 554 555def model_evaluation(data, models, n_folds, forecast_steps, *, callback=None): 556 """ 557 Evaluate models on data. 558 559 Parameters 560 ---------- 561 data : Timeseries 562 The timeseries to model. Must have a class variable that is used 563 for prediction and scoring. 564 models : list 565 List of models (objects with fit() and predict() methods) to try. 566 n_folds : int 567 Number of iterations. 568 forecast_steps : int 569 Number of forecast steps at each iteraction. 570 callback : callable, optional 571 Optional argument-less callback to call after each iteration. 572 573 Returns 574 ------- 575 results : list of lists 576 A table with horizontal and vertical headers and results. Print it 577 to see it. 578 """ 579 if not data.domain.class_var: 580 raise ValueError('Data requires a target variable. Use Select Columns ' 581 'widget to set one variable as target.') 582 max_lag = max(m.max_order for m in models) 583 if n_folds * forecast_steps + max_lag > len(data): 584 raise ValueError( 585 'Supplied time series is too short for this many folds ' 586 '/ step size. Retry with fewer iterations.') 587 588 def _score_vector(model, true, pred): 589 true = np.asanyarray(true) 590 pred = np.asanyarray(pred) 591 nonnan = ~np.isnan(true) 592 if not nonnan.all(): 593 pred = pred[nonnan] 594 true = true[nonnan] 595 row = [str(getattr(model, 'name', model))] 596 if pred.size: 597 row.extend(score(true, pred) for score in (rmse, mae, mape, pocid, r2)) 598 else: 599 row.extend(['err'] * 5) 600 try: 601 row.extend([model.results.aic, model.results.bic]) 602 except Exception: 603 row.extend(['err'] * 2) 604 return row 605 606 res = [['Model', 'RMSE', 'MAE', 'MAPE', 'POCID', 'R²', 'AIC', 'BIC']] 607 interp_data = data.interp() 608 true_y = np.ravel(data[:, data.domain.class_var]) 609 610 for model in models: 611 full_true = [] 612 full_pred = [] 613 for fold in range(1, n_folds + 1): 614 train_end = -fold * forecast_steps 615 try: 616 model.fit(interp_data[:train_end]) 617 pred, _, _ = model.predict(forecast_steps) 618 except Exception: 619 continue 620 finally: 621 if callback: 622 callback() 623 624 full_true.extend(true_y[train_end:][:forecast_steps]) # Sliced twice because it doesn't work at the end, e.g. [-3:0] == [] :( 625 full_pred.extend(np.c_[pred][:, 0]) # Only interested in the class var 626 assert len(full_true) == len(full_pred) 627 628 res.append(_score_vector(model, full_true, full_pred)) 629 630 # Score in-sample fittedvalues 631 try: 632 model.fit(interp_data) 633 fittedvalues = model.fittedvalues() 634 if fittedvalues.ndim > 1: 635 fittedvalues = fittedvalues[..., 0] 636 except Exception: 637 row = ['err'] * 7 638 else: 639 row = _score_vector(model, true_y, fittedvalues) 640 row[0] = row[0] + ' (in-sample)' 641 res.append(row) 642 return res 643 644 645SECONDS = 126230400 646DAYS = 1461 647 648 649def timestamp(dt): 650 try: 651 ts = dt.timestamp() 652 except OverflowError: 653 if not dt.tzinfo: 654 # treat datetime as in local timezone 655 dt = dt.astimezone() 656 # compute timestamp manually 657 ts = (dt - datetime.datetime(1970, 1, 1, tzinfo=timezone.utc)) /\ 658 timedelta(seconds=1) 659 return ts 660 661 662def fromtimestamp(ts, tz=None): 663 try: 664 dt = datetime.datetime.fromtimestamp(ts, tz=tz) 665 except OSError: 666 k = int(-ts // SECONDS + 1) # Cast avoids "unsupported type for timedelta days: numpy.int32" 667 dt = datetime.datetime.fromtimestamp(ts + k * SECONDS, tz=tz) - \ 668 timedelta(days=k * DAYS) 669 return dt 670