1#' Fit a linear model with time series components 2#' 3#' \code{tslm} is used to fit linear models to time series including trend and 4#' seasonality components. 5#' 6#' \code{tslm} is largely a wrapper for \code{\link[stats]{lm}()} except that 7#' it allows variables "trend" and "season" which are created on the fly from 8#' the time series characteristics of the data. The variable "trend" is a 9#' simple time trend and "season" is a factor indicating the season (e.g., the 10#' month or the quarter depending on the frequency of the data). 11#' 12#' @param formula an object of class "formula" (or one that can be coerced to 13#' that class): a symbolic description of the model to be fitted. 14#' @param data an optional data frame, list or environment (or object coercible 15#' by as.data.frame to a data frame) containing the variables in the model. If 16#' not found in data, the variables are taken from environment(formula), 17#' typically the environment from which lm is called. 18#' @param subset an optional subset containing rows of data to keep. For best 19#' results, pass a logical vector of rows to keep. Also supports 20#' \code{\link[base]{subset}()} functions. 21#' @inheritParams forecast 22#' 23#' @param ... Other arguments passed to \code{\link[stats]{lm}()} 24#' @return Returns an object of class "lm". 25#' @author Mitchell O'Hara-Wild and Rob J Hyndman 26#' @seealso \code{\link{forecast.lm}}, \code{\link[stats]{lm}}. 27#' @keywords stats 28#' @examples 29#' 30#' y <- ts(rnorm(120,0,3) + 1:120 + 20*sin(2*pi*(1:120)/12), frequency=12) 31#' fit <- tslm(y ~ trend + season) 32#' plot(forecast(fit, h=20)) 33#' 34#' @export 35tslm <- function(formula, data, subset, lambda=NULL, biasadj=FALSE, ...) { 36 cl <- match.call() 37 if (!("formula" %in% class(formula))) { 38 formula <- stats::as.formula(formula) 39 } 40 if (missing(data)) { 41 mt <- try(terms(formula)) 42 if (is.element("try-error", class(mt))) { 43 stop("Cannot extract terms from formula, please provide data argument.") 44 } 45 } 46 else { 47 mt <- terms(formula, data = data) 48 } 49 50 ## Categorise formula variables into time-series, functions, and data. 51 vars <- attr(mt, "variables") 52 # Check for time series variables 53 tsvar <- match(c("trend", "season"), as.character(vars), 0L) 54 # Check for functions (which should be evaluated later, in lm) 55 fnvar <- NULL 56 for (i in 2:length(vars)) { 57 term <- vars[[i]] 58 if (!is.symbol(term)) { 59 if (typeof(eval(term[[1]])) == "closure") { # If this term is a function (alike fourier) 60 fnvar <- c(fnvar, i) 61 } 62 } 63 } 64 65 ## Fix formula's environment for correct `...` scoping. 66 attr(formula, ".Environment") <- environment() 67 68 ## Ensure response variable is taken from dataset (including transformations) 69 formula[[2]] <- as.symbol(deparse(formula[[2]])) 70 71 if (sum(c(tsvar, fnvar)) > 0) { 72 # Remove variables not needed in data (trend+season+functions) 73 rmvar <- c(tsvar, fnvar) 74 rmvar <- rmvar[rmvar != attr(mt, "response") + 1] # Never remove the reponse variable 75 if (any(rmvar != 0)) { 76 vars <- vars[-rmvar] 77 } 78 } 79 80 ## Grab any variables missing from data 81 if (!missing(data)) { 82 # Check for any missing variables in data 83 vars <- vars[c(TRUE, !as.character(vars[-1]) %in% colnames(data))] 84 dataname <- substitute(data) 85 } 86 if (!missing(data)) { 87 data <- datamat(do.call(datamat, as.list(vars[-1]), envir = parent.frame()), data) 88 } 89 else { 90 data <- do.call(datamat, as.list(vars[-1]), envir = parent.frame()) 91 } 92 93 ## Set column name of univariate dataset 94 if (is.null(dim(data)) && length(data) != 0) { 95 cn <- as.character(vars)[2] 96 } else { 97 cn <- colnames(data) 98 } 99 100 ## Get time series attributes from the data 101 if (is.null(tsp(data))) { 102 if ((attr(mt, "response") + 1) %in% fnvar) { # Check unevaluated response variable 103 tspx <- tsp(eval(attr(mt, "variables")[[attr(mt, "response") + 1]])) 104 } 105 tspx <- tsp(data[, 1]) # Check for complex ts data.frame 106 } 107 else { 108 tspx <- tsp(data) 109 } 110 if (is.null(tspx)) { 111 stop("Not time series data, use lm()") 112 } 113 tsdat <- match(c("trend", "season"), cn, 0L) 114 115 ## Create trend and season if missing from the data 116 if (tsdat[1] == 0) { # &tsvar[1]!=0){#If "trend" is not in data, but is in formula 117 trend <- 1:NROW(data) 118 cn <- c(cn, "trend") 119 data <- cbind(data, trend) 120 } 121 if (tsdat[2] == 0) { # &tsvar[2]!=0){#If "season" is not in data, but is in formula 122 if (tsvar[2] != 0 && tspx[3] <= 1) { # Nonseasonal data, and season requested 123 stop("Non-seasonal data cannot be modelled using a seasonal factor") 124 } 125 season <- as.factor(cycle(data[, 1])) 126 cn <- c(cn, "season") 127 data <- cbind(data, season) 128 } 129 colnames(data) <- cn 130 131 ## Subset the data according to subset argument 132 if (!missing(subset)) { 133 if (!is.logical(subset)) { 134 stop("subset must be logical") 135 } else if (NCOL(subset) > 1) { 136 stop("subset must be a logical vector") 137 } else if (NROW(subset) != NROW(data)) { 138 stop("Subset must be the same length as the number of rows in the dataset") 139 } 140 warning("Subset has been assumed contiguous") 141 timesx <- time(data[, 1])[subset] 142 tspx <- recoverTSP(timesx) 143 if (tspx[3] == 1 && tsdat[2] == 0 && tsvar[2] != 0) { 144 stop("Non-seasonal data cannot be modelled using a seasonal factor") 145 } 146 data <- data[subset, ] # model.frame(formula,as.data.frame(data[subsetTF,])) 147 } 148 if (!is.null(lambda)) { 149 data[, 1] <- BoxCox(data[, 1], lambda) 150 lambda <- attr(data[, 1], "lambda") 151 } 152 if (tsdat[2] == 0 && tsvar[2] != 0) { 153 data$season <- factor(data$season) # fix for lost factor information, may not be needed? 154 } 155 156 ## Fit the model and prepare model structure 157 fit <- lm(formula, data = data, na.action = na.exclude, ...) 158 159 fit$data <- data 160 responsevar <- deparse(formula[[2]]) 161 fit$residuals <- ts(residuals(fit)) 162 fit$x <- fit$residuals 163 fit$x[!is.na(fit$x)] <- model.frame(fit)[, responsevar] 164 fit$fitted.values <- ts(fitted(fit)) 165 tsp(fit$residuals) <- tsp(fit$x) <- tsp(fit$fitted.values) <- tsp(data[, 1]) <- tspx 166 fit$call <- cl 167 fit$method <- "Linear regression model" 168 if (exists("dataname")) { 169 fit$call$data <- dataname 170 } 171 if (!is.null(lambda)) { 172 attr(lambda, "biasadj") <- biasadj 173 fit$lambda <- lambda 174 fit$fitted.values <- InvBoxCox(fit$fitted.values, lambda, biasadj, var(fit$residuals)) 175 fit$x <- InvBoxCox(fit$x, lambda) 176 } 177 class(fit) <- c("tslm", class(fit)) 178 return(fit) 179} 180 181#' @export 182fitted.tslm <- function(object, ...){ 183 object$fitted.values 184} 185 186#' Forecast a linear model with possible time series components 187#' 188#' \code{forecast.lm} is used to predict linear models, especially those 189#' involving trend and seasonality components. 190#' 191#' \code{forecast.lm} is largely a wrapper for 192#' \code{\link[stats]{predict.lm}()} except that it allows variables "trend" 193#' and "season" which are created on the fly from the time series 194#' characteristics of the data. Also, the output is reformatted into a 195#' \code{forecast} object. 196#' 197#' @param object Object of class "lm", usually the result of a call to 198#' \code{\link[stats]{lm}} or \code{\link{tslm}}. 199#' @param newdata An optional data frame in which to look for variables with 200#' which to predict. If omitted, it is assumed that the only variables are 201#' trend and season, and \code{h} forecasts are produced. 202#' @param level Confidence level for prediction intervals. 203#' @param fan If \code{TRUE}, level is set to seq(51,99,by=3). This is suitable 204#' for fan plots. 205#' @param h Number of periods for forecasting. Ignored if \code{newdata} 206#' present. 207#' @param ts If \code{TRUE}, the forecasts will be treated as time series 208#' provided the original data is a time series; the \code{newdata} will be 209#' interpreted as related to the subsequent time periods. If \code{FALSE}, any 210#' time series attributes of the original data will be ignored. 211#' @param ... Other arguments passed to \code{\link[stats]{predict.lm}()}. 212#' @inheritParams forecast 213#' 214#' @return An object of class "\code{forecast}". 215#' 216#' The function \code{summary} is used to obtain and print a summary of the 217#' results, while the function \code{plot} produces a plot of the forecasts and 218#' prediction intervals. 219#' 220#' The generic accessor functions \code{fitted.values} and \code{residuals} 221#' extract useful features of the value returned by \code{forecast.lm}. 222#' 223#' An object of class \code{"forecast"} is a list containing at least the 224#' following elements: \item{model}{A list containing information about the 225#' fitted model} \item{method}{The name of the forecasting method as a 226#' character string} \item{mean}{Point forecasts as a time series} 227#' \item{lower}{Lower limits for prediction intervals} \item{upper}{Upper 228#' limits for prediction intervals} \item{level}{The confidence values 229#' associated with the prediction intervals} \item{x}{The historical data for 230#' the response variable.} \item{residuals}{Residuals from the fitted model. 231#' That is x minus fitted values.} \item{fitted}{Fitted values} 232#' @author Rob J Hyndman 233#' @seealso \code{\link{tslm}}, \code{\link[stats]{lm}}. 234#' @keywords stats 235#' @examples 236#' 237#' y <- ts(rnorm(120,0,3) + 1:120 + 20*sin(2*pi*(1:120)/12), frequency=12) 238#' fit <- tslm(y ~ trend + season) 239#' plot(forecast(fit, h=20)) 240#' 241#' @export 242forecast.lm <- function(object, newdata, h=10, level=c(80, 95), fan=FALSE, lambda=object$lambda, biasadj=NULL, ts=TRUE, ...) { 243 if (fan) { 244 level <- seq(51, 99, by = 3) 245 } else { 246 if (min(level) > 0 && max(level) < 1) { 247 level <- 100 * level 248 } else if (min(level) < 0 || max(level) > 99.99) { 249 stop("Confidence limit out of range") 250 } 251 } 252 253 if (!is.null(object$data)) { 254 origdata <- object$data 255 } # no longer exists 256 else if (!is.null(object$model)) { 257 origdata <- object$model 258 } 259 else if (!is.null(object$call$data)) { 260 origdata <- try(object$data <- eval(object$call$data), silent = TRUE) 261 if (is.element("try-error", class(origdata))) { 262 stop("Could not find data. Try training your model using tslm() or attach data directly to the object via object$data<-modeldata for some object<-lm(formula,modeldata).") 263 } 264 } 265 else { 266 origdata <- as.data.frame(fitted(object) + residuals(object)) 267 } 268 if (!is.element("data.frame", class(origdata))) { 269 origdata <- as.data.frame(origdata) 270 if (!is.element("data.frame", class(origdata))) { 271 stop("Could not find data. Try training your model using tslm() or attach data directly to the object via object$data<-modeldata for some object<-lm(formula,modeldata).") 272 } 273 } 274 275 # Check if the forecasts will be time series 276 if (ts && is.element("ts", class(origdata))) { 277 tspx <- tsp(origdata) 278 timesx <- time(origdata) 279 } 280 else if (ts && is.element("ts", class(origdata[, 1]))) { 281 tspx <- tsp(origdata[, 1]) 282 timesx <- time(origdata[, 1]) 283 } 284 else if (ts && is.element("ts", class(fitted(object)))) { 285 tspx <- tsp(fitted(object)) 286 timesx <- time(fitted(object)) 287 } 288 else { 289 tspx <- NULL 290 } 291 # if(!is.null(object$call$subset)) 292 # { 293 # j <- eval(object$call$subset) 294 # origdata <- origdata[j,] 295 # if(!is.null(tspx)) 296 # { 297 # # Try to figure out times for subset. Assume they are contiguous. 298 # timesx <- timesx[j] 299 # tspx <- tsp(origdata) <- c(min(timesx),max(timesx),tspx[3]) 300 # } 301 # } 302 # Add trend and seasonal to data frame 303 oldterms <- terms(object) 304 # Adjust terms for function variables and rename datamat colnames to match. 305 if (!missing(newdata)) { 306 reqvars <- as.character(attr(object$terms, "variables")[-1])[-attr(object$terms, "response")] 307 # Search for time series variables 308 tsvar <- match(c("trend", "season"), reqvars, 0L) 309 # Check if required variables are functions 310 fnvar <- sapply(reqvars, function(x) !(is.symbol(parse(text = x)[[1]]) || typeof(eval(parse(text = x)[[1]][[1]])) != "closure")) 311 if (!is.data.frame(newdata)) { 312 newdata <- datamat(newdata) 313 colnames(newdata)[1] <- ifelse(sum(tsvar > 0), reqvars[-tsvar][1], reqvars[1]) 314 warning("newdata column names not specified, defaulting to first variable required.") 315 } 316 oldnewdata <- newdata 317 newvars <- make.names(colnames(newdata)) 318 # Check if variables are missing 319 misvar <- match(make.names(reqvars), newvars, 0L) == 0L 320 if (any(!misvar & !fnvar)) { # If any variables are not missing/functions, add them to data 321 tmpdata <- datamat(newdata[reqvars[!misvar]]) 322 rm1 <- FALSE 323 } 324 else { 325 # Prefill the datamat 326 tmpdata <- datamat(1:NROW(newdata)) 327 rm1 <- TRUE 328 } 329 # Remove trend and seasonality from required variables 330 if (sum(tsvar) > 0) { 331 reqvars <- reqvars[-tsvar] 332 fnvar <- fnvar[-tsvar] 333 misvar <- match(make.names(reqvars), newvars, 0L) == 0L 334 } 335 if (any(misvar | fnvar)) { # If any variables are missing/functions 336 reqvars <- reqvars[misvar | fnvar] # They are required 337 fnvar <- fnvar[misvar | fnvar] # Update required function variables 338 for (i in reqvars) { 339 found <- FALSE 340 subvars <- NULL 341 for (j in 1:length(object$coefficients)) { 342 subvars[j] <- pmatch(i, names(object$coefficients)[j]) 343 } 344 subvars <- !is.na(subvars) 345 subvars <- names(object$coefficients)[subvars] 346 # Detect if subvars if multivariate 347 if (length(subvars) > 1) { 348 # Extract prefix only 349 subvars <- substr(subvars, nchar(i) + 1, 999L) 350 fsub <- match(make.names(subvars), newvars, 0L) 351 if (any(fsub == 0)) { 352 # Check for misnamed columns 353 fsub <- grep(paste(make.names(subvars), collapse = "|"), newvars) 354 } 355 if (all(fsub != 0) && length(fsub) == length(subvars)) { 356 imat <- as.matrix(newdata[, fsub], ncol = length(fsub)) 357 colnames(imat) <- subvars 358 tmpdata[[length(tmpdata) + 1]] <- imat 359 found <- TRUE 360 } 361 else { 362 # Attempt to evaluate it as a function 363 subvars <- i 364 } 365 } 366 if (length(subvars) == 1) { # Check if it is a function 367 if (fnvar[match(i, reqvars)]) { # Pre-evaluate function from data 368 tmpdata[[length(tmpdata) + 1]] <- eval(parse(text = subvars)[[1]], newdata) 369 found <- TRUE 370 } 371 } 372 if (found) { 373 names(tmpdata)[length(tmpdata)] <- paste0("solvedFN___", match(i, reqvars)) 374 subvarloc <- match(i, lapply(attr(object$terms, "predvars"), deparse)) 375 attr(object$terms, "predvars")[[subvarloc]] <- attr(object$terms, "variables")[[subvarloc]] <- parse(text = paste0("solvedFN___", match(i, reqvars)))[[1]] 376 } 377 else { 378 warning(paste0("Could not find required variable ", i, " in newdata. Specify newdata as a named data.frame")) 379 } 380 } 381 } 382 if (rm1) { 383 tmpdata[[1]] <- NULL 384 } 385 newdata <- cbind(newdata, tmpdata) 386 h <- nrow(newdata) 387 } 388 if (!is.null(tspx)) { 389 # Always generate trend series 390 trend <- ifelse(is.null(origdata$trend), NCOL(origdata), max(origdata$trend)) + (1:h) 391 if (!missing(newdata)) { 392 newdata <- cbind(newdata, trend) 393 } 394 else { 395 newdata <- datamat(trend) 396 } 397 # Always generate season series 398 x <- ts(1:h, start = tspx[2] + 1 / tspx[3], frequency = tspx[3]) 399 season <- as.factor(cycle(x)) 400 newdata <- cbind(newdata, season) 401 } 402 newdata <- as.data.frame(newdata) 403 if (!exists("oldnewdata")) { 404 oldnewdata <- newdata 405 } 406 # If only one column, assume its name. 407 if (ncol(newdata) == 1 && colnames(newdata)[1] == "newdata") { 408 colnames(newdata) <- as.character(formula(object$model))[3] 409 } 410 411 # Check regressors included in newdata. 412 # Not working so removed for now. 413 # xreg <- attributes(terms(object$model))$term.labels 414 # if(any(!is.element(xreg,colnames(newdata)))) 415 # stop("Predictor variables not included") 416 417 object$x <- getResponse(object) 418 # responsevar <- as.character(formula(object$model))[2] 419 # responsevar <- gsub("`","",responsevar) 420 # object$x <- model.frame(object$model)[,responsevar] 421 422 # Remove missing values from residuals 423 predict_object <- object 424 predict_object$residuals <- na.omit(as.numeric(object$residuals)) 425 426 out <- list() 427 nl <- length(level) 428 for (i in 1:nl) 429 out[[i]] <- predict(predict_object, newdata = newdata, se.fit = TRUE, interval = "prediction", level = level[i] / 100, ...) 430 431 if (nrow(newdata) != length(out[[1]]$fit[, 1])) { 432 stop("Variables not found in newdata") 433 } 434 435 object$terms <- oldterms 436 if (is.null(object$series)) { # Model produced via lm(), add series attribute 437 object$series <- deparse(attr(oldterms, "variables")[[1 + attr(oldterms, "response")]]) 438 } 439 fcast <- list( 440 model = object, mean = out[[1]]$fit[, 1], lower = out[[1]]$fit[, 2], upper = out[[1]]$fit[, 3], 441 level = level, x = object$x, series = object$series 442 ) 443 fcast$method <- "Linear regression model" 444 fcast$newdata <- oldnewdata 445 fcast$residuals <- residuals(object) 446 fcast$fitted <- fitted(object) 447 if (NROW(origdata) != NROW(fcast$x)) { # Give up on ts attributes as some data are missing 448 tspx <- NULL 449 } 450 if (NROW(fcast$x) != NROW(fcast$residuals)) { 451 tspx <- NULL 452 } 453 if (!is.null(tspx)) { 454 fcast$x <- ts(fcast$x) 455 fcast$residuals <- ts(fcast$residuals) 456 fcast$fitted <- ts(fcast$fitted) 457 tsp(fcast$x) <- tsp(fcast$residuals) <- tsp(fcast$fitted) <- tspx 458 } 459 if (nl > 1) { 460 for (i in 2:nl) 461 { 462 fcast$lower <- cbind(fcast$lower, out[[i]]$fit[, 2]) 463 fcast$upper <- cbind(fcast$upper, out[[i]]$fit[, 3]) 464 } 465 } 466 if (!is.null(tspx)) { 467 fcast$mean <- ts(fcast$mean, start = tspx[2] + 1 / tspx[3], frequency = tspx[3]) 468 fcast$upper <- ts(fcast$upper, start = tspx[2] + 1 / tspx[3], frequency = tspx[3]) 469 fcast$lower <- ts(fcast$lower, start = tspx[2] + 1 / tspx[3], frequency = tspx[3]) 470 } 471 472 if (!is.null(lambda)) { 473 fcast$x <- InvBoxCox(fcast$x, lambda) 474 fcast$mean <- InvBoxCox(fcast$mean, lambda, biasadj, fcast) 475 fcast$lower <- InvBoxCox(fcast$lower, lambda) 476 fcast$upper <- InvBoxCox(fcast$upper, lambda) 477 } 478 479 return(structure(fcast, class = "forecast")) 480} 481 482#' @export 483summary.tslm <- function(object, ...) { 484 # Remove NA from object structure as summary.lm() expects (#836) 485 object$residuals <- na.omit(as.numeric(object$residuals)) 486 object$fitted.values <- na.omit(as.numeric(object$fitted.values)) 487 if(!is.null(object$lambda)) { 488 object$fitted.values <- BoxCox(object$fitted.values, object$lambda) 489 } 490 NextMethod() 491} 492 493# Compute cross-validation and information criteria from a linear model 494 495 496#' Cross-validation statistic 497#' 498#' Computes the leave-one-out cross-validation statistic (the mean of PRESS 499#' -- prediction residual sum of squares), AIC, corrected AIC, BIC and adjusted 500#' R^2 values for a linear model. 501#' 502#' 503#' @param obj output from \code{\link[stats]{lm}} or \code{\link{tslm}} 504#' @return Numerical vector containing CV, AIC, AICc, BIC and AdjR2 values. 505#' @author Rob J Hyndman 506#' @seealso \code{\link[stats]{AIC}} 507#' @keywords models 508#' @examples 509#' 510#' y <- ts(rnorm(120,0,3) + 20*sin(2*pi*(1:120)/12), frequency=12) 511#' fit1 <- tslm(y ~ trend + season) 512#' fit2 <- tslm(y ~ season) 513#' CV(fit1) 514#' CV(fit2) 515#' 516#' @export 517CV <- function(obj) { 518 if (!is.element("lm", class(obj))) { 519 stop("This function is for objects of class lm") 520 } 521 n <- length(obj$residuals) 522 k <- extractAIC(obj)[1] - 1 # number of predictors (constant removed) 523 aic <- extractAIC(obj)[2] + 2 # add 2 for the variance estimate 524 aicc <- aic + 2 * (k + 2) * (k + 3) / (n - k - 3) 525 bic <- aic + (k + 2) * (log(n) - 2) 526 cv <- mean((residuals(obj) / (1 - hatvalues(obj))) ^ 2, na.rm = TRUE) 527 adjr2 <- summary(obj)$adj 528 out <- c(cv, aic, aicc, bic, adjr2) 529 names(out) <- c("CV", "AIC", "AICc", "BIC", "AdjR2") 530 return(out) 531} 532