1#' Fit a linear model with time series components
2#'
3#' \code{tslm} is used to fit linear models to time series including trend and
4#' seasonality components.
5#'
6#' \code{tslm} is largely a wrapper for \code{\link[stats]{lm}()} except that
7#' it allows variables "trend" and "season" which are created on the fly from
8#' the time series characteristics of the data. The variable "trend" is a
9#' simple time trend and "season" is a factor indicating the season (e.g., the
10#' month or the quarter depending on the frequency of the data).
11#'
12#' @param formula an object of class "formula" (or one that can be coerced to
13#' that class): a symbolic description of the model to be fitted.
14#' @param data an optional data frame, list or environment (or object coercible
15#' by as.data.frame to a data frame) containing the variables in the model. If
16#' not found in data, the variables are taken from environment(formula),
17#' typically the environment from which lm is called.
18#' @param subset an optional subset containing rows of data to keep. For best
19#' results, pass a logical vector of rows to keep. Also supports
20#' \code{\link[base]{subset}()} functions.
21#' @inheritParams forecast
22#'
23#' @param ... Other arguments passed to \code{\link[stats]{lm}()}
24#' @return Returns an object of class "lm".
25#' @author Mitchell O'Hara-Wild and Rob J Hyndman
26#' @seealso \code{\link{forecast.lm}}, \code{\link[stats]{lm}}.
27#' @keywords stats
28#' @examples
29#'
30#' y <- ts(rnorm(120,0,3) + 1:120 + 20*sin(2*pi*(1:120)/12), frequency=12)
31#' fit <- tslm(y ~ trend + season)
32#' plot(forecast(fit, h=20))
33#'
34#' @export
35tslm <- function(formula, data, subset, lambda=NULL, biasadj=FALSE, ...) {
36  cl <- match.call()
37  if (!("formula" %in% class(formula))) {
38    formula <- stats::as.formula(formula)
39  }
40  if (missing(data)) {
41    mt <- try(terms(formula))
42    if (is.element("try-error", class(mt))) {
43      stop("Cannot extract terms from formula, please provide data argument.")
44    }
45  }
46  else {
47    mt <- terms(formula, data = data)
48  }
49
50  ## Categorise formula variables into time-series, functions, and data.
51  vars <- attr(mt, "variables")
52  # Check for time series variables
53  tsvar <- match(c("trend", "season"), as.character(vars), 0L)
54  # Check for functions (which should be evaluated later, in lm)
55  fnvar <- NULL
56  for (i in 2:length(vars)) {
57    term <- vars[[i]]
58    if (!is.symbol(term)) {
59      if (typeof(eval(term[[1]])) == "closure") { # If this term is a function (alike fourier)
60        fnvar <- c(fnvar, i)
61      }
62    }
63  }
64
65  ## Fix formula's environment for correct `...` scoping.
66  attr(formula, ".Environment") <- environment()
67
68  ## Ensure response variable is taken from dataset (including transformations)
69  formula[[2]] <- as.symbol(deparse(formula[[2]]))
70
71  if (sum(c(tsvar, fnvar)) > 0) {
72    # Remove variables not needed in data (trend+season+functions)
73    rmvar <- c(tsvar, fnvar)
74    rmvar <- rmvar[rmvar != attr(mt, "response") + 1] # Never remove the reponse variable
75    if (any(rmvar != 0)) {
76      vars <- vars[-rmvar]
77    }
78  }
79
80  ## Grab any variables missing from data
81  if (!missing(data)) {
82    # Check for any missing variables in data
83    vars <- vars[c(TRUE, !as.character(vars[-1]) %in% colnames(data))]
84    dataname <- substitute(data)
85  }
86  if (!missing(data)) {
87    data <- datamat(do.call(datamat, as.list(vars[-1]), envir = parent.frame()), data)
88  }
89  else {
90    data <- do.call(datamat, as.list(vars[-1]), envir = parent.frame())
91  }
92
93  ## Set column name of univariate dataset
94  if (is.null(dim(data)) && length(data) != 0) {
95    cn <- as.character(vars)[2]
96  } else {
97    cn <- colnames(data)
98  }
99
100  ## Get time series attributes from the data
101  if (is.null(tsp(data))) {
102    if ((attr(mt, "response") + 1) %in% fnvar) { # Check unevaluated response variable
103      tspx <- tsp(eval(attr(mt, "variables")[[attr(mt, "response") + 1]]))
104    }
105    tspx <- tsp(data[, 1]) # Check for complex ts data.frame
106  }
107  else {
108    tspx <- tsp(data)
109  }
110  if (is.null(tspx)) {
111    stop("Not time series data, use lm()")
112  }
113  tsdat <- match(c("trend", "season"), cn, 0L)
114
115  ## Create trend and season if missing from the data
116  if (tsdat[1] == 0) { # &tsvar[1]!=0){#If "trend" is not in data, but is in formula
117    trend <- 1:NROW(data)
118    cn <- c(cn, "trend")
119    data <- cbind(data, trend)
120  }
121  if (tsdat[2] == 0) { # &tsvar[2]!=0){#If "season" is not in data, but is in formula
122    if (tsvar[2] != 0 && tspx[3] <= 1) { # Nonseasonal data, and season requested
123      stop("Non-seasonal data cannot be modelled using a seasonal factor")
124    }
125    season <- as.factor(cycle(data[, 1]))
126    cn <- c(cn, "season")
127    data <- cbind(data, season)
128  }
129  colnames(data) <- cn
130
131  ## Subset the data according to subset argument
132  if (!missing(subset)) {
133    if (!is.logical(subset)) {
134      stop("subset must be logical")
135    } else if (NCOL(subset) > 1) {
136      stop("subset must be a logical vector")
137    } else if (NROW(subset) != NROW(data)) {
138      stop("Subset must be the same length as the number of rows in the dataset")
139    }
140    warning("Subset has been assumed contiguous")
141    timesx <- time(data[, 1])[subset]
142    tspx <- recoverTSP(timesx)
143    if (tspx[3] == 1 && tsdat[2] == 0 && tsvar[2] != 0) {
144      stop("Non-seasonal data cannot be modelled using a seasonal factor")
145    }
146    data <- data[subset, ] # model.frame(formula,as.data.frame(data[subsetTF,]))
147  }
148  if (!is.null(lambda)) {
149    data[, 1] <- BoxCox(data[, 1], lambda)
150    lambda <- attr(data[, 1], "lambda")
151  }
152  if (tsdat[2] == 0 && tsvar[2] != 0) {
153    data$season <- factor(data$season) # fix for lost factor information, may not be needed?
154  }
155
156  ## Fit the model and prepare model structure
157  fit <- lm(formula, data = data, na.action = na.exclude, ...)
158
159  fit$data <- data
160  responsevar <- deparse(formula[[2]])
161  fit$residuals <- ts(residuals(fit))
162  fit$x <- fit$residuals
163  fit$x[!is.na(fit$x)] <- model.frame(fit)[, responsevar]
164  fit$fitted.values <- ts(fitted(fit))
165  tsp(fit$residuals) <- tsp(fit$x) <- tsp(fit$fitted.values) <- tsp(data[, 1]) <- tspx
166  fit$call <- cl
167  fit$method <- "Linear regression model"
168  if (exists("dataname")) {
169    fit$call$data <- dataname
170  }
171  if (!is.null(lambda)) {
172    attr(lambda, "biasadj") <- biasadj
173    fit$lambda <- lambda
174    fit$fitted.values <- InvBoxCox(fit$fitted.values, lambda, biasadj, var(fit$residuals))
175    fit$x <- InvBoxCox(fit$x, lambda)
176  }
177  class(fit) <- c("tslm", class(fit))
178  return(fit)
179}
180
181#' @export
182fitted.tslm <- function(object, ...){
183  object$fitted.values
184}
185
186#' Forecast a linear model with possible time series components
187#'
188#' \code{forecast.lm} is used to predict linear models, especially those
189#' involving trend and seasonality components.
190#'
191#' \code{forecast.lm} is largely a wrapper for
192#' \code{\link[stats]{predict.lm}()} except that it allows variables "trend"
193#' and "season" which are created on the fly from the time series
194#' characteristics of the data. Also, the output is reformatted into a
195#' \code{forecast} object.
196#'
197#' @param object Object of class "lm", usually the result of a call to
198#' \code{\link[stats]{lm}} or \code{\link{tslm}}.
199#' @param newdata An optional data frame in which to look for variables with
200#' which to predict. If omitted, it is assumed that the only variables are
201#' trend and season, and \code{h} forecasts are produced.
202#' @param level Confidence level for prediction intervals.
203#' @param fan If \code{TRUE}, level is set to seq(51,99,by=3). This is suitable
204#' for fan plots.
205#' @param h Number of periods for forecasting. Ignored if \code{newdata}
206#' present.
207#' @param ts If \code{TRUE}, the forecasts will be treated as time series
208#' provided the original data is a time series; the \code{newdata} will be
209#' interpreted as related to the subsequent time periods. If \code{FALSE}, any
210#' time series attributes of the original data will be ignored.
211#' @param ... Other arguments passed to \code{\link[stats]{predict.lm}()}.
212#' @inheritParams forecast
213#'
214#' @return An object of class "\code{forecast}".
215#'
216#' The function \code{summary} is used to obtain and print a summary of the
217#' results, while the function \code{plot} produces a plot of the forecasts and
218#' prediction intervals.
219#'
220#' The generic accessor functions \code{fitted.values} and \code{residuals}
221#' extract useful features of the value returned by \code{forecast.lm}.
222#'
223#' An object of class \code{"forecast"} is a list containing at least the
224#' following elements: \item{model}{A list containing information about the
225#' fitted model} \item{method}{The name of the forecasting method as a
226#' character string} \item{mean}{Point forecasts as a time series}
227#' \item{lower}{Lower limits for prediction intervals} \item{upper}{Upper
228#' limits for prediction intervals} \item{level}{The confidence values
229#' associated with the prediction intervals} \item{x}{The historical data for
230#' the response variable.} \item{residuals}{Residuals from the fitted model.
231#' That is x minus fitted values.} \item{fitted}{Fitted values}
232#' @author Rob J Hyndman
233#' @seealso \code{\link{tslm}}, \code{\link[stats]{lm}}.
234#' @keywords stats
235#' @examples
236#'
237#' y <- ts(rnorm(120,0,3) + 1:120 + 20*sin(2*pi*(1:120)/12), frequency=12)
238#' fit <- tslm(y ~ trend + season)
239#' plot(forecast(fit, h=20))
240#'
241#' @export
242forecast.lm <- function(object, newdata, h=10, level=c(80, 95), fan=FALSE, lambda=object$lambda, biasadj=NULL, ts=TRUE, ...) {
243  if (fan) {
244    level <- seq(51, 99, by = 3)
245  } else {
246    if (min(level) > 0 && max(level) < 1) {
247      level <- 100 * level
248    } else if (min(level) < 0 || max(level) > 99.99) {
249      stop("Confidence limit out of range")
250    }
251  }
252
253  if (!is.null(object$data)) {
254    origdata <- object$data
255  } # no longer exists
256  else if (!is.null(object$model)) {
257    origdata <- object$model
258  }
259  else if (!is.null(object$call$data)) {
260    origdata <- try(object$data <- eval(object$call$data), silent = TRUE)
261    if (is.element("try-error", class(origdata))) {
262      stop("Could not find data. Try training your model using tslm() or attach data directly to the object via object$data<-modeldata for some object<-lm(formula,modeldata).")
263    }
264  }
265  else {
266    origdata <- as.data.frame(fitted(object) + residuals(object))
267  }
268  if (!is.element("data.frame", class(origdata))) {
269    origdata <- as.data.frame(origdata)
270    if (!is.element("data.frame", class(origdata))) {
271      stop("Could not find data.  Try training your model using tslm() or attach data directly to the object via object$data<-modeldata for some object<-lm(formula,modeldata).")
272    }
273  }
274
275  # Check if the forecasts will be time series
276  if (ts && is.element("ts", class(origdata))) {
277    tspx <- tsp(origdata)
278    timesx <- time(origdata)
279  }
280  else if (ts && is.element("ts", class(origdata[, 1]))) {
281    tspx <- tsp(origdata[, 1])
282    timesx <- time(origdata[, 1])
283  }
284  else if (ts && is.element("ts", class(fitted(object)))) {
285    tspx <- tsp(fitted(object))
286    timesx <- time(fitted(object))
287  }
288  else {
289    tspx <- NULL
290  }
291  # if(!is.null(object$call$subset))
292  # {
293  #   j <- eval(object$call$subset)
294  #   origdata <- origdata[j,]
295  #   if(!is.null(tspx))
296  #   {
297  #     # Try to figure out times for subset. Assume they are contiguous.
298  #     timesx <- timesx[j]
299  #     tspx <- tsp(origdata) <- c(min(timesx),max(timesx),tspx[3])
300  #   }
301  # }
302  # Add trend and seasonal to data frame
303  oldterms <- terms(object)
304  # Adjust terms for function variables and rename datamat colnames to match.
305  if (!missing(newdata)) {
306    reqvars <- as.character(attr(object$terms, "variables")[-1])[-attr(object$terms, "response")]
307    # Search for time series variables
308    tsvar <- match(c("trend", "season"), reqvars, 0L)
309    # Check if required variables are functions
310    fnvar <- sapply(reqvars, function(x) !(is.symbol(parse(text = x)[[1]]) || typeof(eval(parse(text = x)[[1]][[1]])) != "closure"))
311    if (!is.data.frame(newdata)) {
312      newdata <- datamat(newdata)
313      colnames(newdata)[1] <- ifelse(sum(tsvar > 0), reqvars[-tsvar][1], reqvars[1])
314      warning("newdata column names not specified, defaulting to first variable required.")
315    }
316    oldnewdata <- newdata
317    newvars <- make.names(colnames(newdata))
318    # Check if variables are missing
319    misvar <- match(make.names(reqvars), newvars, 0L) == 0L
320    if (any(!misvar & !fnvar)) { # If any variables are not missing/functions, add them to data
321      tmpdata <- datamat(newdata[reqvars[!misvar]])
322      rm1 <- FALSE
323    }
324    else {
325      # Prefill the datamat
326      tmpdata <- datamat(1:NROW(newdata))
327      rm1 <- TRUE
328    }
329    # Remove trend and seasonality from required variables
330    if (sum(tsvar) > 0) {
331      reqvars <- reqvars[-tsvar]
332      fnvar <- fnvar[-tsvar]
333      misvar <- match(make.names(reqvars), newvars, 0L) == 0L
334    }
335    if (any(misvar | fnvar)) { # If any variables are missing/functions
336      reqvars <- reqvars[misvar | fnvar] # They are required
337      fnvar <- fnvar[misvar | fnvar] # Update required function variables
338      for (i in reqvars) {
339        found <- FALSE
340        subvars <- NULL
341        for (j in 1:length(object$coefficients)) {
342          subvars[j] <- pmatch(i, names(object$coefficients)[j])
343        }
344        subvars <- !is.na(subvars)
345        subvars <- names(object$coefficients)[subvars]
346        # Detect if subvars if multivariate
347        if (length(subvars) > 1) {
348          # Extract prefix only
349          subvars <- substr(subvars, nchar(i) + 1, 999L)
350          fsub <- match(make.names(subvars), newvars, 0L)
351          if (any(fsub == 0)) {
352            # Check for misnamed columns
353            fsub <- grep(paste(make.names(subvars), collapse = "|"), newvars)
354          }
355          if (all(fsub != 0) && length(fsub) == length(subvars)) {
356            imat <- as.matrix(newdata[, fsub], ncol = length(fsub))
357            colnames(imat) <- subvars
358            tmpdata[[length(tmpdata) + 1]] <- imat
359            found <- TRUE
360          }
361          else {
362            # Attempt to evaluate it as a function
363            subvars <- i
364          }
365        }
366        if (length(subvars) == 1) { # Check if it is a function
367          if (fnvar[match(i, reqvars)]) { # Pre-evaluate function from data
368            tmpdata[[length(tmpdata) + 1]] <- eval(parse(text = subvars)[[1]], newdata)
369            found <- TRUE
370          }
371        }
372        if (found) {
373          names(tmpdata)[length(tmpdata)] <- paste0("solvedFN___", match(i, reqvars))
374          subvarloc <- match(i, lapply(attr(object$terms, "predvars"), deparse))
375          attr(object$terms, "predvars")[[subvarloc]] <- attr(object$terms, "variables")[[subvarloc]] <- parse(text = paste0("solvedFN___", match(i, reqvars)))[[1]]
376        }
377        else {
378          warning(paste0("Could not find required variable ", i, " in newdata. Specify newdata as a named data.frame"))
379        }
380      }
381    }
382    if (rm1) {
383      tmpdata[[1]] <- NULL
384    }
385    newdata <- cbind(newdata, tmpdata)
386    h <- nrow(newdata)
387  }
388  if (!is.null(tspx)) {
389    # Always generate trend series
390    trend <- ifelse(is.null(origdata$trend), NCOL(origdata), max(origdata$trend)) + (1:h)
391    if (!missing(newdata)) {
392      newdata <- cbind(newdata, trend)
393    }
394    else {
395      newdata <- datamat(trend)
396    }
397    # Always generate season series
398    x <- ts(1:h, start = tspx[2] + 1 / tspx[3], frequency = tspx[3])
399    season <- as.factor(cycle(x))
400    newdata <- cbind(newdata, season)
401  }
402  newdata <- as.data.frame(newdata)
403  if (!exists("oldnewdata")) {
404    oldnewdata <- newdata
405  }
406  # If only one column, assume its name.
407  if (ncol(newdata) == 1 && colnames(newdata)[1] == "newdata") {
408    colnames(newdata) <- as.character(formula(object$model))[3]
409  }
410
411  # Check regressors included in newdata.
412  # Not working so removed for now.
413  # xreg <- attributes(terms(object$model))$term.labels
414  # if(any(!is.element(xreg,colnames(newdata))))
415  #  stop("Predictor variables not included")
416
417  object$x <- getResponse(object)
418  # responsevar <- as.character(formula(object$model))[2]
419  # responsevar <- gsub("`","",responsevar)
420  # object$x <- model.frame(object$model)[,responsevar]
421
422  # Remove missing values from residuals
423  predict_object <- object
424  predict_object$residuals <- na.omit(as.numeric(object$residuals))
425
426  out <- list()
427  nl <- length(level)
428  for (i in 1:nl)
429    out[[i]] <- predict(predict_object, newdata = newdata, se.fit = TRUE, interval = "prediction", level = level[i] / 100, ...)
430
431  if (nrow(newdata) != length(out[[1]]$fit[, 1])) {
432    stop("Variables not found in newdata")
433  }
434
435  object$terms <- oldterms
436  if (is.null(object$series)) { # Model produced via lm(), add series attribute
437    object$series <- deparse(attr(oldterms, "variables")[[1 + attr(oldterms, "response")]])
438  }
439  fcast <- list(
440    model = object, mean = out[[1]]$fit[, 1], lower = out[[1]]$fit[, 2], upper = out[[1]]$fit[, 3],
441    level = level, x = object$x, series = object$series
442  )
443  fcast$method <- "Linear regression model"
444  fcast$newdata <- oldnewdata
445  fcast$residuals <- residuals(object)
446  fcast$fitted <- fitted(object)
447  if (NROW(origdata) != NROW(fcast$x)) { # Give up on ts attributes as some data are missing
448    tspx <- NULL
449  }
450  if (NROW(fcast$x) != NROW(fcast$residuals)) {
451    tspx <- NULL
452  }
453  if (!is.null(tspx)) {
454    fcast$x <- ts(fcast$x)
455    fcast$residuals <- ts(fcast$residuals)
456    fcast$fitted <- ts(fcast$fitted)
457    tsp(fcast$x) <- tsp(fcast$residuals) <- tsp(fcast$fitted) <- tspx
458  }
459  if (nl > 1) {
460    for (i in 2:nl)
461    {
462      fcast$lower <- cbind(fcast$lower, out[[i]]$fit[, 2])
463      fcast$upper <- cbind(fcast$upper, out[[i]]$fit[, 3])
464    }
465  }
466  if (!is.null(tspx)) {
467    fcast$mean <- ts(fcast$mean, start = tspx[2] + 1 / tspx[3], frequency = tspx[3])
468    fcast$upper <- ts(fcast$upper, start = tspx[2] + 1 / tspx[3], frequency = tspx[3])
469    fcast$lower <- ts(fcast$lower, start = tspx[2] + 1 / tspx[3], frequency = tspx[3])
470  }
471
472  if (!is.null(lambda)) {
473    fcast$x <- InvBoxCox(fcast$x, lambda)
474    fcast$mean <- InvBoxCox(fcast$mean, lambda, biasadj, fcast)
475    fcast$lower <- InvBoxCox(fcast$lower, lambda)
476    fcast$upper <- InvBoxCox(fcast$upper, lambda)
477  }
478
479  return(structure(fcast, class = "forecast"))
480}
481
482#' @export
483summary.tslm <- function(object, ...) {
484  # Remove NA from object structure as summary.lm() expects (#836)
485  object$residuals <- na.omit(as.numeric(object$residuals))
486  object$fitted.values <- na.omit(as.numeric(object$fitted.values))
487  if(!is.null(object$lambda)) {
488    object$fitted.values <- BoxCox(object$fitted.values, object$lambda)
489  }
490  NextMethod()
491}
492
493# Compute cross-validation and information criteria from a linear model
494
495
496#' Cross-validation statistic
497#'
498#' Computes the leave-one-out cross-validation statistic (the mean of PRESS
499#' -- prediction residual sum of squares), AIC, corrected AIC, BIC and adjusted
500#' R^2 values for a linear model.
501#'
502#'
503#' @param obj output from \code{\link[stats]{lm}} or \code{\link{tslm}}
504#' @return Numerical vector containing CV, AIC, AICc, BIC and AdjR2 values.
505#' @author Rob J Hyndman
506#' @seealso \code{\link[stats]{AIC}}
507#' @keywords models
508#' @examples
509#'
510#' y <- ts(rnorm(120,0,3) + 20*sin(2*pi*(1:120)/12), frequency=12)
511#' fit1 <- tslm(y ~ trend + season)
512#' fit2 <- tslm(y ~ season)
513#' CV(fit1)
514#' CV(fit2)
515#'
516#' @export
517CV <- function(obj) {
518  if (!is.element("lm", class(obj))) {
519    stop("This function is for objects of class lm")
520  }
521  n <- length(obj$residuals)
522  k <- extractAIC(obj)[1] - 1 # number of predictors (constant removed)
523  aic <- extractAIC(obj)[2] + 2 # add 2 for the variance estimate
524  aicc <- aic + 2 * (k + 2) * (k + 3) / (n - k - 3)
525  bic <- aic + (k + 2) * (log(n) - 2)
526  cv <- mean((residuals(obj) / (1 - hatvalues(obj))) ^ 2, na.rm = TRUE)
527  adjr2 <- summary(obj)$adj
528  out <- c(cv, aic, aicc, bic, adjr2)
529  names(out) <- c("CV", "AIC", "AICc", "BIC", "AdjR2")
530  return(out)
531}
532